Lexer


1   /*
2    * @(#)Lexer.java   1.11 2000/08/16
3    *
4    */
5   
6   package org.w3c.tidy;
7   
8   /**
9    *
10   * Lexer for html parser
11   *
12   * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
13   * See Tidy.java for the copyright notice.
14   * Derived from <a HREF="http://www.w3.org/People/Raggett/tidy">
15   * HTML Tidy Release 4 Aug 2000</a>
16   *
17   * @author  Dave Raggett <dsr@w3.org>
18   * @author  Andy Quick <ac.quick@sympatico.ca> (translation to Java)
19   * @version 1.0, 1999/05/22
20   * @version 1.0.1, 1999/05/29
21   * @version 1.1, 1999/06/18 Java Bean
22   * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
23   * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
24   * @version 1.4, 1999/09/04 DOM support
25   * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
26   * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
27   * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
28   * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
29   * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
30   * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
31   * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
32   */
33  
34  /*
35    Given a file stream fp it returns a sequence of tokens.
36  
37       GetToken(fp) gets the next token
38       UngetToken(fp) provides one level undo
39  
40    The tags include an attribute list:
41  
42      - linked list of attribute/value nodes
43      - each node has 2 null-terminated strings.
44      - entities are replaced in attribute values
45  
46    white space is compacted if not in preformatted mode
47    If not in preformatted mode then leading white space
48    is discarded and subsequent white space sequences
49    compacted to single space chars.
50  
51    If XmlTags is no then tag names and attribute
52    names are folded to lower case.
53  
54   Not yet done:
55      -   Doctype subset and marked sections
56  */
57  
58  import java.io.PrintWriter  ;
59  import java.util.Stack  ;
60  import java.util.Vector  ;
61  
62  public class Lexer {
63  
64  
65      public StreamIn in;   /* input stream the lexer reads characters from */
66      public PrintWriter   errout;   /* error output stream */
67      public short badAccess; /* for accessibility errors */
68      public short badLayout; /* for bad style errors */
69      public short badChars;  /* for bad char encodings */
70      public short badForm;   /* for mismatched/mispositioned form tags */
71      public short warnings;  /* count of warnings in this document */
72      public short errors;    /* count of errors */
73      public int   lines;     /* lines seen; starts at 1 */
74      public int   columns;   /* at start of current token; starts at 1 */
75      public boolean waswhite;  /* used to collapse contiguous white space */
76      public boolean pushed;    /* true after token has been pushed back */
77      public boolean insertspace;   /* when space is moved after end tag */
78      public boolean excludeBlocks;  /* Netscape compatibility */
79      public boolean exiled;    /* true if moved out of table */
80      public boolean isvoyager; /* true if xmlns attribute on html element */
81      public short versions;  /* bit vector of HTML versions (Dict.VERS_* flags) */
82      public int doctype;    /* version as given by doctype (if any), else Dict.VERS_UNKNOWN */
83      public boolean badDoctype; /* e.g. if html or PUBLIC is missing */
84      public int txtstart;  /* offset into lexbuf of the start of current node */
85      public int txtend;    /* offset into lexbuf of the end of current node */
86      public short state;     /* state of lexer's finite state machine */
87      public Node token;     /* NOTE(review): presumably the token re-delivered while pushed is set -- confirm */
88  
89      /* 
90        lexer character buffer
91  
92        parse tree nodes span onto this buffer
93        which contains the concatenated text
94        contents of all of the elements.
95  
96       lexsize must be reset for each file.
97      */
98      public byte[] lexbuf;   /* byte buffer of UTF-8 chars; null until the first byte is added */
99      public int lexlength;   /* allocated size of lexbuf in bytes */
100     public int lexsize;     /* bytes of lexbuf in use; also the next write offset */
101 
102     /* Inline stack for compatibility with Mosaic */
103     public Node inode;        /* for deferring text node */
104     public int insert;        /* for inferring inline tags; -1 initially */
105     public Stack   istack;
106     public int istackbase;    /* start of frame */
107 
108     public Style styles;      /* used for cleaning up presentation markup */
109 
110     public Configuration configuration;
111     protected int seenBodyEndTag; /* used by parser */
112     private Vector   nodeList;  /* nodes registered by newNode/cloneNode; re-pointed when lexbuf is reallocated */
113 
/**
 * Creates a lexer that reads from <code>in</code> using the given settings.
 *
 * The position starts at line 1, column 1, the state machine starts in
 * LEX_CONTENT, every HTML version is still considered possible and no
 * character buffer is allocated yet.
 *
 * @param in            input stream to tokenize
 * @param configuration settings consulted while lexing
 */
public Lexer(StreamIn in, Configuration configuration) {
    /* collaborators */
    this.in = in;
    this.configuration = configuration;
    this.styles = null;

    /* position and state machine */
    this.lines = 1;
    this.columns = 1;
    this.state = LEX_CONTENT;
    this.token = null;
    this.pushed = false;
    this.waswhite = false;
    this.insertspace = false;
    this.exiled = false;
    this.seenBodyEndTag = 0;

    /* diagnostics counters */
    this.badAccess = 0;
    this.badLayout = 0;
    this.badChars = 0;
    this.badForm = 0;
    this.warnings = 0;
    this.errors = 0;

    /* document version tracking */
    this.isvoyager = false;
    this.versions = Dict.VERS_EVERYTHING;
    this.doctype = Dict.VERS_UNKNOWN;
    this.badDoctype = false;

    /* character buffer: allocated lazily by addByte */
    this.lexbuf = null;
    this.lexlength = 0;
    this.lexsize = 0;
    this.txtstart = 0;
    this.txtend = 0;

    /* inline stack */
    this.inode = null;
    this.insert = -1;
    this.istack = new Stack();
    this.istackbase = 0;

    /* registry of nodes whose text arrays follow lexbuf reallocations */
    this.nodeList = new Vector();
}
149 
150     public Node newNode()
151     {
152         Node node = new Node();
153         nodeList.addElement(node);
154         return node;
155     }
156 
157     public Node newNode(short type, byte[] textarray, int start, int end)
158     {
159         Node node = new Node(type, textarray, start, end);
160         nodeList.addElement(node);
161         return node;
162     }
163 
164     public Node newNode(short type, byte[] textarray, int start, int end, String   element)
165     {
166         Node node = new Node(type, textarray, start, end, element, configuration.tt);
167         nodeList.addElement(node);
168         return node;
169     }
170 
171     public Node cloneNode(Node node)
172     {
173         Node cnode = (Node)node.clone();
174         nodeList.addElement(cnode);
175         for (AttVal att = cnode.attributes; att != null; att = att.next) {
176             if (att.asp != null)
177                 nodeList.addElement(att.asp);
178             if (att.php != null)
179                 nodeList.addElement(att.php);
180         }
181         return cnode;
182     }
183 
184     public AttVal cloneAttributes(AttVal attrs)
185     {
186         AttVal cattrs = (AttVal)attrs.clone();
187         for (AttVal att = cattrs; att != null; att = att.next) {
188             if (att.asp != null)
189                 nodeList.addElement(att.asp);
190             if (att.php != null)
191                 nodeList.addElement(att.php);
192         }
193         return cattrs;
194     }
195 
196     protected void updateNodeTextArrays(byte[] oldtextarray, byte[] newtextarray)
197     {
198         Node node;
199         for (int i = 0; i < nodeList.size(); i++) {
200             node = (Node)(nodeList.elementAt(i));
201             if (node.textarray == oldtextarray)
202                 node.textarray = newtextarray;
203         }
204     }
205 
206     /* used for creating preformatted text from Word2000 */
207     public Node newLineNode()
208     {
209         Node node = newNode();
210 
211         node.textarray = this.lexbuf;
212         node.start = this.lexsize;
213         addCharToLexer((int)'\n');
214         node.end = this.lexsize;
215         return node;
216     }
217 
218     // Should always be able convert to/from UTF-8, so encoding exceptions are
219     // converted to an Error to avoid adding throws declarations in
220     // lots of methods.
221     
222     public static byte[] getBytes(String   str) {
223         try {
224             return str.getBytes("UTF8");
225         } catch (java.io.UnsupportedEncodingException   e) {
226             throw new Error  ("string to UTF-8 conversion failed: " + e.getMessage());
227         }
228     }
229 
230     public static String   getString(byte[] bytes, int offset, int length) {
231         try {
232             return new String  (bytes, offset, length, "UTF8");
233         } catch (java.io.UnsupportedEncodingException   e) {
234             throw new Error  ("UTF-8 to string conversion failed: " + e.getMessage());
235         }
236     }
237 
238     public boolean endOfInput()
239     {
240         return this.in.isEndOfStream();
241     }
242 
243     public void addByte(int c)
244     {
245         if (this.lexsize + 1 >= this.lexlength)
246         {
247             while (this.lexsize + 1 >= this.lexlength)
248             {
249                 if (this.lexlength == 0)
250                     this.lexlength = 8192;
251                 else
252                     this.lexlength = this.lexlength * 2;
253             }
254 
255             byte[] temp = this.lexbuf;
256             this.lexbuf = new byte[ this.lexlength ];
257             if (temp != null)
258             {
259                 System.arraycopy( temp, 0, this.lexbuf, 0, temp.length );
260                 updateNodeTextArrays(temp, this.lexbuf);
261             }
262         }
263 
264         this.lexbuf[this.lexsize++] = (byte)c;
265         this.lexbuf[this.lexsize] = (byte)'\0';  /* debug */
266     }
267 
268     public void changeChar(byte c)
269     {
270         if (this.lexsize > 0)
271         {
272             this.lexbuf[this.lexsize-1] = c;
273         }
274     }
275 
276     /* store char c as UTF-8 encoded byte stream */
277     public void addCharToLexer(int c)
278     {
279         if (c < 128)
280             addByte(c);
281         else if (c <= 0x7FF)
282         {
283             addByte(0xC0 | (c >> 6));
284             addByte(0x80 | (c & 0x3F));
285         }
286         else if (c <= 0xFFFF)
287         {
288             addByte(0xE0 | (c >> 12));
289             addByte(0x80 | ((c >> 6) & 0x3F));
290             addByte(0x80 | (c & 0x3F));
291         }
292         else if (c <= 0x1FFFFF)
293         {
294             addByte(0xF0 | (c >> 18));
295             addByte(0x80 | ((c >> 12) & 0x3F));
296             addByte(0x80 | ((c >> 6) & 0x3F));
297             addByte(0x80 | (c & 0x3F));
298         }
299         else
300         {
301             addByte(0xF8 | (c >> 24));
302             addByte(0x80 | ((c >> 18) & 0x3F));
303             addByte(0x80 | ((c >> 12) & 0x3F));
304             addByte(0x80 | ((c >> 6) & 0x3F));
305             addByte(0x80 | (c & 0x3F));
306         }
307     }
308 
309     public void addStringToLexer(String   str)
310     {
311         for ( int i = 0; i < str.length(); i++ ) {
312             addCharToLexer( (int)str.charAt(i) );
313         }
314     }
315 
316     /*
317       No longer attempts to insert missing ';' for unknown
318       enitities unless one was present already, since this
319       gives unexpected results.
320 
321       For example:   <a HREF="something.htm?foo&bar&fred">
322       was tidied to: <a HREF="something.htm?foo&amp;bar;&amp;fred;">
323       rather than:   <a HREF="something.htm?foo&amp;bar&amp;fred">
324 
325       My thanks for Maurice Buxton for spotting this.
326     */
327     public void parseEntity(short mode)
328     {
329         short map;
330         int start;
331         boolean first = true;
332         boolean semicolon = false;
333         boolean numeric = false;
334         int c, ch, startcol;
335         String   str;
336 
337         start = this.lexsize - 1;  /* to start at "&" */
338         startcol = this.in.curcol - 1;
339 
340         while (true)
341         {
342             c = this.in.readChar();
343             if (c == StreamIn.EndOfStream) break;
344             if (c == ';')
345             {
346                 semicolon = true;
347                 break;
348             }
349 
350             if (first && c == '#')
351             {
352                 addCharToLexer(c);
353                 first = false;
354                 numeric = true;
355                 continue;
356             }
357 
358             first = false;
359             map = MAP((char)c);
360 
361             /* AQ: Added flag for numeric entities so that numeric entities
362                with missing semi-colons are recognized.
363                Eg. "&#114e&#112;..." is recognized as "rep"
364             */
365             if (numeric && ((c == 'x') || ((map & DIGIT) != 0)))
366             {
367                 addCharToLexer(c);
368                 continue;
369             }
370             if (!numeric && ((map & NAMECHAR) != 0))
371             {
372                 addCharToLexer(c);
373                 continue;
374             }
375 
376             /* otherwise put it back */
377 
378             this.in.ungetChar(c);
379             break;
380         }
381 
382         str = getString( this.lexbuf, start, this.lexsize - start );
383         ch = EntityTable.getDefaultEntityTable().entityCode( str );
384 
385         /* deal with unrecognized entities */
386         if (ch <= 0)
387         {
388             /* set error position just before offending chararcter */
389             this.lines = this.in.curline;
390             this.columns = startcol;
391 
392             if (this.lexsize > start +1 )
393             {
394                 Report.entityError(this, Report.UNKNOWN_ENTITY, str, ch);
395 
396                 if (semicolon)
397                     addCharToLexer(';');
398             }
399             else /* naked & */
400             {
401                 Report.entityError(this, Report.UNESCAPED_AMPERSAND, str, ch);
402             }
403         }
404         else
405         {
406             if (c != ';')    /* issue warning if not terminated by ';' */
407             {
408                 /* set error position just before offending chararcter */
409                 this.lines = this.in.curline;
410                 this.columns = startcol;
411                 Report.entityError(this, Report.MISSING_SEMICOLON, str, c);
412             }
413 
414             this.lexsize = start;
415 
416             if (ch == 160 && (mode & Preformatted) != 0)
417                 ch = ' ';
418 
419             addCharToLexer(ch);
420 
421             if (ch == '&' && !this.configuration.QuoteAmpersand)
422             {
423                 addCharToLexer('a');
424                 addCharToLexer('m');
425                 addCharToLexer('p');
426                 addCharToLexer(';');
427             }
428         }
429     }
430 
431     public char parseTagName()
432     {
433         short map;
434         int c;
435 
436         /* fold case of first char in buffer */
437 
438         c = this.lexbuf[this.txtstart];
439         map = MAP((char)c);
440 
441         if (!this.configuration.XmlTags && (map & UPPERCASE) != 0)
442         {
443             c += (int)((int)'a' - (int)'A');
444             this.lexbuf[this.txtstart] = (byte)c;
445         }
446 
447         while (true)
448         {
449             c = this.in.readChar();
450             if (c == StreamIn.EndOfStream) break;
451             map = MAP((char)c);
452 
453             if ((map & NAMECHAR) == 0)
454                 break;
455 
456             /* fold case of subsequent chars */
457 
458             if (!this.configuration.XmlTags && (map & UPPERCASE) != 0)
459                 c += (int)((int)'a' - (int)'A');
460 
461             addCharToLexer(c);
462         }
463 
464         this.txtend = this.lexsize;
465         return (char)c;
466     }
467 
468     public void addStringLiteral(String   str)
469     {
470         for ( int i = 0; i < str.length(); i++ ) {
471             addCharToLexer( (int)str.charAt(i) );
472         }
473     }
474 
475     /* choose what version to use for new doctype */
476     public short HTMLVersion()
477     {
478         short versions;
479 
480         versions = this.versions;
481 
482         if ((versions & Dict.VERS_HTML20) != 0)
483             return Dict.VERS_HTML20;
484 
485         if ((versions & Dict.VERS_HTML32) != 0)
486             return Dict.VERS_HTML32;
487 
488         if ((versions & Dict.VERS_HTML40_STRICT) != 0)
489             return Dict.VERS_HTML40_STRICT;
490 
491         if ((versions & Dict.VERS_HTML40_LOOSE) != 0)
492             return Dict.VERS_HTML40_LOOSE;
493 
494         if ((versions & Dict.VERS_FRAMES) != 0)
495             return Dict.VERS_FRAMES;
496 
497         return Dict.VERS_UNKNOWN;
498     }
499 
500     public String   HTMLVersionName()
501     {
502         short guessed;
503         int j;
504 
505         guessed = apparentVersion();
506 
507         for (j = 0; j < W3CVersion.length; ++j)
508         {
509             if (guessed == W3CVersion[j].code)
510             {
511                 if (this.isvoyager)
512                     return W3CVersion[j].voyagerName;
513 
514                 return W3CVersion[j].name;
515             }
516         }
517 
518         return null;
519     }
520 
521     /* add meta element for Tidy */
522     public boolean addGenerator(Node root)
523     {
524         AttVal attval;
525         Node node;
526         Node head = root.findHEAD(configuration.tt);
527 
528         if (head != null)
529         {
530             for (node = head.content; node != null; node = node.next)
531             {
532                 if (node.tag == configuration.tt.tagMeta)
533                 {
534                     attval = node.getAttrByName("name");
535 
536                     if (attval != null && attval.value != null &&
537                         Lexer.wstrcasecmp(attval.value, "generator") == 0)
538                     {
539                         attval = node.getAttrByName("content");
540 
541                         if (attval != null && attval.value != null &&
542                             attval.value.length() >= 9 &&
543                             Lexer.wstrcasecmp(attval.value.substring(0, 9), "HTML Tidy") == 0)
544                         {
545                             return false;
546                         }
547                     }
548                 }
549             }
550 
551             node = this.inferredTag("meta");
552             node.addAttribute("content", "HTML Tidy, see www.w3.org");
553             node.addAttribute("name", "generator");
554             Node.insertNodeAtStart(head, node);
555             return true;
556         }
557 
558         return false;
559     }
560 
561     /* return true if substring s is in p and isn't all in upper case */
562     /* this is used to check the case of SYSTEM, PUBLIC, DTD and EN */
563     /* len is how many chars to check in p */
564     private static boolean findBadSubString(String   s, String   p, int len)
565     {
566         int n = s.length();
567         int i = 0;
568         String   ps;
569 
570         while (n < len)
571         {
572             ps = p.substring(i, i + n);
573             if (wstrcasecmp(s, ps) == 0)
574                 return (!ps.equals(s.substring(0, n)));
575 
576             ++i;
577             --len;
578         }
579 
580         return false;
581     }
582 
583     public boolean checkDocTypeKeyWords(Node doctype)
584     {
585         int len = doctype.end - doctype.start;
586         String   s = getString(this.lexbuf, doctype.start, len);
587 
588         return !(
589             findBadSubString("SYSTEM", s, len) ||
590             findBadSubString("PUBLIC", s, len) ||
591             findBadSubString("//DTD", s, len) ||
592             findBadSubString("//W3C", s, len) ||
593             findBadSubString("//EN", s, len)
594             );
595     }
596 
597     /* examine <!DOCTYPE> to identify version */
598     public short findGivenVersion(Node doctype)
599     {
600         String   p, s;
601         int i, j;
602         int len;
603         String   str1;
604         String   str2;
605 
606         /* if root tag for doctype isn't html give up now */
607         str1 = getString(this.lexbuf, doctype.start, 5);
608         if (wstrcasecmp(str1, "html ") != 0)
609             return 0;
610 
611         if (!checkDocTypeKeyWords(doctype))
612             Report.warning(this, doctype, null, Report.DTYPE_NOT_UPPER_CASE);
613 
614         /* give up if all we are given is the system id for the doctype */
615         str1 = getString(this.lexbuf, doctype.start + 5, 7);
616         if (wstrcasecmp(str1, "SYSTEM ") == 0)
617         {
618             /* but at least ensure the case is correct */
619             if (!str1.substring(0, 6).equals("SYSTEM"))
620                 System.arraycopy( getBytes("SYSTEM"), 0,
621                                   this.lexbuf, doctype.start + 5, 6 );
622             return 0;  /* unrecognized */
623         }
624 
625         if (wstrcasecmp(str1, "PUBLIC ") == 0)
626         {
627             if (!str1.substring(0, 6).equals("PUBLIC"))
628                 System.arraycopy( getBytes("PUBLIC "), 0,
629                                   this.lexbuf, doctype.start + 5, 6 );
630         }
631         else
632             this.badDoctype = true;
633 
634         for (i = doctype.start; i < doctype.end; ++i)
635         {
636             if (this.lexbuf[i] == (byte)'"')
637             {
638                 str1 = getString( this.lexbuf, i + 1, 12 );
639                 str2 = getString( this.lexbuf, i + 1, 13 );
640                 if (str1.equals("-//W3C//DTD "))
641                 {
642                     /* compute length of identifier e.g. "HTML 4.0 Transitional" */
643                     for (j = i + 13; j < doctype.end && this.lexbuf[j] != (byte)'/'; ++j);
644                     len = j - i - 13;
645                     p = getString( this.lexbuf, i + 13, len );
646 
647                     for (j = 1; j < W3CVersion.length; ++j)
648                     {
649                         s = W3CVersion[j].name;
650                         if (len == s.length() && s.equals(p))
651                             return W3CVersion[j].code;
652                     }
653 
654                     /* else unrecognized version */
655                 }
656                 else if (str2.equals("-//IETF//DTD "))
657                 {
658                     /* compute length of identifier e.g. "HTML 2.0" */
659                     for (j = i + 14; j < doctype.end && this.lexbuf[j] != (byte)'/'; ++j);
660                     len = j - i - 14;
661 
662                     p = getString( this.lexbuf, i + 14, len );
663                     s = W3CVersion[0].name;
664                     if (len == s.length() && s.equals(p))
665                         return W3CVersion[0].code;
666 
667                     /* else unrecognized version */
668                 }
669                 break;
670             }
671         }
672 
673         return 0;
674     }
675 
676     public void fixHTMLNameSpace(Node root, String   profile)
677     {
678         Node node;
679         AttVal prev, attr;
680 
681         for (node = root.content; 
682                 node != null && node.tag != configuration.tt.tagHtml; node = node.next);
683 
684         if (node != null)
685         {
686             prev = null;
687 
688             for (attr = node.attributes; attr != null; attr = attr.next)
689             {
690                 if (attr.attribute.equals("xmlns"))
691                     break;
692 
693                 prev = attr;
694             }
695 
696             if (attr != null)
697             {
698                 if (!attr.value.equals(profile))
699                 {
700                     Report.warning(this, node, null, Report.INCONSISTENT_NAMESPACE);
701                     attr.value = profile;
702                 }
703             }
704             else
705             {
706                 attr = new AttVal( node.attributes, null, (int)'"',
707                                    "xmlns", profile );
708                 attr.dict =
709                     AttributeTable.getDefaultAttributeTable().findAttribute( attr );
710                 node.attributes = attr;
711             }
712         }
713     }
714 
715     public boolean setXHTMLDocType(Node root)
716     {
717         String   fpi = " ";
718         String   sysid = "";
719         String   namespace = XHTML_NAMESPACE;
720         Node doctype;
721 
722         doctype = root.findDocType();
723 
724         if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
725         {
726             if (doctype != null)
727                 Node.discardElement(doctype);
728             return true;
729         }
730 
731         if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
732         {
733             /* see what flavor of XHTML this document matches */
734             if ((this.versions & Dict.VERS_HTML40_STRICT) != 0)
735             {  /* use XHTML strict */
736                 fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
737                 sysid = voyager_strict;
738             }
739             else if ((this.versions & Dict.VERS_LOOSE) != 0)
740             {
741                 fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
742                 sysid = voyager_loose;
743             }
744             else if ((this.versions & Dict.VERS_FRAMES) != 0)
745             {   /* use XHTML frames */
746                 fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN";
747                 sysid = voyager_frameset;
748             }
749             else /* lets assume XHTML transitional */
750             {
751                 fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
752                 sysid = voyager_loose;
753             }
754         }
755         else if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
756         {
757             fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
758             sysid = voyager_strict;
759         }
760         else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
761         {
762             fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
763             sysid = voyager_loose;
764         }
765 
766         fixHTMLNameSpace(root, namespace);
767 
768         if (doctype == null)
769         {
770             doctype = newNode(Node.DocTypeTag, this.lexbuf, 0, 0);
771             doctype.next = root.content;
772             doctype.parent = root;
773             doctype.prev = null;
774             root.content = doctype;
775         }
776 
777         if (configuration.docTypeMode == Configuration.DOCTYPE_USER &&
778             configuration.docTypeStr != null)
779         {
780             fpi = configuration.docTypeStr;
781             sysid = "";
782         }
783 
784         this.txtstart = this.lexsize;
785         this.txtend = this.lexsize;
786 
787         /* add public identifier */
788         addStringLiteral("html PUBLIC ");
789 
790         /* check if the fpi is quoted or not */
791         if (fpi.charAt(0) == '"')
792             addStringLiteral(fpi);
793         else
794         {
795             addStringLiteral("\"");
796             addStringLiteral(fpi);
797             addStringLiteral("\"");
798         }
799 
800         if (sysid.length() + 6 >= this.configuration.wraplen)
801             addStringLiteral("\n\"");
802         else
803             addStringLiteral("\n    \"");
804 
805         /* add system identifier */
806         addStringLiteral(sysid);
807         addStringLiteral("\"");
808 
809         this.txtend = this.lexsize;
810 
811         doctype.start = this.txtstart;
812         doctype.end = this.txtend;
813 
814         return false;
815     }
816 
817     public short apparentVersion()
818     {
819         switch (this.doctype)
820         {
821         case Dict.VERS_UNKNOWN:
822             return HTMLVersion();
823 
824         case Dict.VERS_HTML20:
825             if ((this.versions & Dict.VERS_HTML20) != 0)
826                 return Dict.VERS_HTML20;
827 
828             break;
829 
830         case Dict.VERS_HTML32:
831             if ((this.versions & Dict.VERS_HTML32) != 0)
832                 return Dict.VERS_HTML32;
833 
834             break; /* to replace old version by new */
835 
836         case Dict.VERS_HTML40_STRICT:
837             if ((this.versions & Dict.VERS_HTML40_STRICT) != 0)
838                 return Dict.VERS_HTML40_STRICT;
839 
840             break;
841 
842         case Dict.VERS_HTML40_LOOSE:
843             if ((this.versions & Dict.VERS_HTML40_LOOSE) != 0)
844                 return Dict.VERS_HTML40_LOOSE;
845 
846             break; /* to replace old version by new */
847 
848         case Dict.VERS_FRAMES:
849             if ((this.versions & Dict.VERS_FRAMES) != 0)
850                 return Dict.VERS_FRAMES;
851 
852             break;
853         }
854 
855         Report.warning(this, null, null, Report.INCONSISTENT_VERSION);
856         return this.HTMLVersion();
857     }
858 
859     /* fixup doctype if missing */
860     public boolean fixDocType(Node root)
861     {
862         Node doctype;
863         int guessed = Dict.VERS_HTML40_STRICT, i;
864 
865         if (this.badDoctype)
866             Report.warning(this, null, null, Report.MALFORMED_DOCTYPE);
867 
868         if (configuration.XmlOut)
869             return true;
870 
871         doctype = root.findDocType();
872 
873         if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
874         {
875             if (doctype != null)
876                 Node.discardElement(doctype);
877             return true;
878         }
879 
880         if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
881         {
882             Node.discardElement(doctype);
883             doctype = null;
884             guessed = Dict.VERS_HTML40_STRICT;
885         }
886         else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
887         {
888             Node.discardElement(doctype);
889             doctype = null;
890             guessed = Dict.VERS_HTML40_LOOSE;
891         }
892         else if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
893         {
894             if (doctype != null)
895             {
896                 if (this.doctype == Dict.VERS_UNKNOWN)
897                     return false;
898 
899                 switch (this.doctype)
900                 {
901                 case Dict.VERS_UNKNOWN:
902                     return false;
903 
904                 case Dict.VERS_HTML20:
905                     if ((this.versions & Dict.VERS_HTML20) != 0)
906                         return true;
907 
908                     break; /* to replace old version by new */
909 
910                 case Dict.VERS_HTML32:
911                     if ((this.versions & Dict.VERS_HTML32) != 0)
912                         return true;
913 
914                     break; /* to replace old version by new */
915 
916                 case Dict.VERS_HTML40_STRICT:
917                     if ((this.versions & Dict.VERS_HTML40_STRICT) != 0)
918                         return true;
919 
920                     break; /* to replace old version by new */
921 
922                 case Dict.VERS_HTML40_LOOSE:
923                     if ((this.versions & Dict.VERS_HTML40_LOOSE) != 0)
924                         return true;
925 
926                     break; /* to replace old version by new */
927 
928                 case Dict.VERS_FRAMES:
929                     if ((this.versions & Dict.VERS_FRAMES) != 0)
930                         return true;
931 
932                     break; /* to replace old version by new */
933                 }
934 
935                 /* INCONSISTENT_VERSION warning is now issued by ApparentVersion() */
936             }
937 
938             /* choose new doctype */
939             guessed = HTMLVersion();
940         }
941 
942         if (guessed == Dict.VERS_UNKNOWN)
943             return false;
944 
945         /* for XML use the Voyager system identifier */
946         if (this.configuration.XmlOut || this.configuration.XmlTags || this.isvoyager)
947         {
948             if (doctype != null)
949                 Node.discardElement(doctype);
950 
951             for (i = 0; i < W3CVersion.length; ++i)
952             {
953                 if (guessed == W3CVersion[i].code)
954                 {
955                     fixHTMLNameSpace(root, W3CVersion[i].profile);
956                     break;
957                 }
958             }
959 
960             return true;
961         }
962 
963         if (doctype == null)
964         {
965             doctype = newNode(Node.DocTypeTag, this.lexbuf, 0, 0);
966             doctype.next = root.content;
967             doctype.parent = root;
968             doctype.prev = null;
969             root.content = doctype;
970         }
971 
972         this.txtstart = this.lexsize;
973         this.txtend = this.lexsize;
974 
975         /* use the appropriate public identifier */
976         addStringLiteral("html PUBLIC ");
977 
978         if (configuration.docTypeMode == Configuration.DOCTYPE_USER &&
979             configuration.docTypeStr != null)
980             addStringLiteral(configuration.docTypeStr);
981         else if (guessed == Dict.VERS_HTML20)
982             addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\"");
983         else
984         {
985             addStringLiteral("\"-//W3C//DTD ");
986 
987             for (i = 0; i < W3CVersion.length; ++i)
988             {
989                 if (guessed == W3CVersion[i].code)
990                 {
991                     addStringLiteral(W3CVersion[i].name);
992                     break;
993                 }
994             }
995 
996             addStringLiteral("//EN\"");
997         }
998 
999         this.txtend = this.lexsize;
1000
1001        doctype.start = this.txtstart;
1002        doctype.end = this.txtend;
1003
1004        return true;
1005    }
1006
1007    /* ensure XML document starts with <?XML version="1.0"?> */
1008    public boolean fixXMLPI(Node root)
1009    {
1010        Node xml;
1011        int s;
1012
1013        if( root.content != null && root.content.type == Node.ProcInsTag)
1014        {
1015            s = root.content.start;
1016
1017            if (this.lexbuf[s] == (byte)'x' &&
1018                this.lexbuf[s+1] == (byte)'m' &&
1019                this.lexbuf[s+2] == (byte)'l')
1020                return true;
1021        }
1022
1023        xml = newNode(Node.ProcInsTag, this.lexbuf, 0, 0);
1024        xml.next = root.content;
1025
1026        if (root.content != null)
1027        {
1028            root.content.prev = xml;
1029            xml.next = root.content;
1030        }
1031    
1032        root.content = xml;
1033
1034        this.txtstart = this.lexsize;
1035        this.txtend = this.lexsize;
1036        addStringLiteral("xml version=\"1.0\"");
1037        if (this.configuration.CharEncoding == Configuration.LATIN1)
1038            addStringLiteral(" encoding=\"ISO-8859-1\"");
1039        this.txtend = this.lexsize;
1040
1041        xml.start = this.txtstart;
1042        xml.end = this.txtend;
1043        return false;
1044    }
1045
1046    public Node inferredTag(String   name)
1047    {
1048        Node node;
1049
1050        node = newNode(Node.StartTag,
1051                        this.lexbuf,
1052                        this.txtstart,
1053                        this.txtend,
1054                        name);
1055        node.implicit = true;
1056        return node;
1057    }
1058
1059    public static boolean expectsContent(Node node)
1060    {
1061        if (node.type != Node.StartTag)
1062            return false;
1063
1064        /* unknown element? */
1065        if (node.tag == null)
1066            return true;
1067
1068        if ((node.tag.model & Dict.CM_EMPTY) != 0)
1069            return false;
1070
1071        return true;
1072    }
1073
1074    /*
1075      create a text node for the contents of
1076      a CDATA element like style or script
1077      which ends with </foo> for some foo.
1078    */
1079    public Node getCDATA(Node container)
1080    {
1081        int c, lastc, start, len, i;
1082        String   str;
1083        boolean endtag = false;
1084
1085        this.lines = this.in.curline;
1086        this.columns = this.in.curcol;
1087        this.waswhite = false;
1088        this.txtstart = this.lexsize;
1089        this.txtend = this.lexsize;
1090
1091        lastc = (int)'\0';
1092        start = -1;
1093
1094        while (true)
1095        {
1096            c = this.in.readChar();
1097            if (c == StreamIn.EndOfStream) break;
1098            /* treat \r\n as \n and \r as \n */
1099
1100            if (c == (int)'/' && lastc == (int)'<')
1101            {
1102                if (endtag)
1103                {
1104                    this.lines = this.in.curline;
1105                    this.columns = this.in.curcol - 3;
1106
1107                    Report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
1108                }
1109
1110                start = this.lexsize + 1;  /* to first letter */
1111                endtag = true;
1112            }
1113            else if (c == (int)'>' && start >= 0)
1114            {
1115                len = this.lexsize - start;
1116                if (len == container.element.length())
1117                {
1118                    str = getString( this.lexbuf, start, len );
1119                    if (Lexer.wstrcasecmp(str, container.element) == 0)
1120                    {
1121                        this.txtend = start - 2;
1122                        break;
1123                    }
1124                }
1125
1126                this.lines = this.in.curline;
1127                this.columns = this.in.curcol - 3;
1128
1129                Report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
1130
1131                /* if javascript insert backslash before / */
1132
1133                if (ParserImpl.isJavaScript(container))
1134                {
1135                    for (i = this.lexsize; i > start-1; --i)
1136                        this.lexbuf[i] = this.lexbuf[i-1];
1137
1138                    this.lexbuf[start-1] = (byte)'\\';
1139                    this.lexsize++;
1140                }
1141
1142                start = -1;
1143            }
1144            else if (c == (int)'\r')
1145            {
1146                c = this.in.readChar();
1147
1148                if (c != (int)'\n')
1149                    this.in.ungetChar(c);
1150
1151                c = (int)'\n';
1152            }
1153
1154            addCharToLexer((int)c);
1155            this.txtend = this.lexsize;
1156            lastc = c;
1157        }
1158
1159        if (c == StreamIn.EndOfStream)
1160            Report.warning(this, container, null, Report.MISSING_ENDTAG_FOR);
1161
1162        if (this.txtend > this.txtstart)
1163        {
1164            this.token = newNode(Node.TextNode,
1165                                  this.lexbuf,
1166                                  this.txtstart,
1167                                  this.txtend);
1168            return this.token;
1169        }
1170
1171        return null;
1172    }
1173
1174    public void ungetToken()
1175    {
1176        this.pushed = true;
1177    }
1178
1179    public static final short IgnoreWhitespace    = 0;  /* white space at the start of a text run is discarded; getToken then carries on as MixedContent */
1180    public static final short MixedContent        = 1;  /* each run of white space is compacted to a single space */
1181    public static final short Preformatted        = 2;  /* white space is not compacted */
1182    public static final short IgnoreMarkup        = 3;  /* white space is not compacted and '&' is not parsed as an entity */
1183
1184    /*
1185      modes for getToken(); the mode selects how white space and markup are treated in element content
1186
1187      MixedContent       -- for elements which don't accept PCDATA (NOTE(review): getToken discards leading white space under IgnoreWhitespace, so this line may describe that mode -- confirm)
1188      Preformatted       -- white space preserved as is
1189      IgnoreMarkup       -- for CDATA elements such as script, style
1190    */
1191
1192    public Node getToken(short mode)
1193    {
1194        short map;
1195        int c = 0;
1196        int lastc;
1197        int badcomment = 0;
1198        MutableBoolean isempty = new MutableBoolean();
1199        AttVal attributes;
1200
1201        if (this.pushed)
1202        {
1203            /* duplicate inlines in preference to pushed text nodes when appropriate */
1204            if (this.token.type != Node.TextNode ||
1205                (this.insert == -1 && this.inode == null))
1206            {
1207                this.pushed = false;
1208                return this.token;
1209            }
1210        }
1211
1212        /* at start of block elements, unclosed inline
1213           elements are inserted into the token stream */
1214     
1215        if (this.insert != -1 || this.inode != null)
1216            return insertedToken();
1217
1218        this.lines = this.in.curline;
1219        this.columns = this.in.curcol;
1220        this.waswhite = false;
1221
1222        this.txtstart = this.lexsize;
1223        this.txtend = this.lexsize;
1224
1225        while (true)
1226        {
1227            c = this.in.readChar();
1228            if (c == StreamIn.EndOfStream) break;
1229            if (this.insertspace && mode != IgnoreWhitespace)
1230            {
1231                addCharToLexer(' ');
1232                this.waswhite = true;
1233                this.insertspace = false;
1234            }
1235
1236            /* treat \r\n as \n and \r as \n */
1237
1238            if (c == '\r')
1239            {
1240                c = this.in.readChar();
1241
1242                if (c != '\n')
1243                    this.in.ungetChar(c);
1244
1245                c = '\n';
1246            }
1247
1248            addCharToLexer(c);
1249
1250            switch (this.state)
1251            {
1252            case LEX_CONTENT:  /* element content */
1253                map = MAP((char)c);
1254
1255                /*
1256                 Discard white space if appropriate. Its cheaper
1257                 to do this here rather than in parser methods
1258                 for elements that don't have mixed content.
1259                */
1260                if (((map & WHITE) != 0) && (mode == IgnoreWhitespace) 
1261                      && this.lexsize == this.txtstart + 1)
1262                {
1263                    --this.lexsize;
1264                    this.waswhite = false;
1265                    this.lines = this.in.curline;
1266                    this.columns = this.in.curcol;
1267                    continue;
1268                }
1269
1270                if (c == '<')
1271                {
1272                    this.state = LEX_GT;
1273                    continue;
1274                }
1275
1276                if ((map & WHITE) != 0)
1277                {
1278                    /* was previous char white? */
1279                    if (this.waswhite)
1280                    {
1281                        if (mode != Preformatted && mode != IgnoreMarkup)
1282                        {
1283                            --this.lexsize;
1284                            this.lines = this.in.curline;
1285                            this.columns = this.in.curcol;
1286                        }
1287                    }
1288                    else /* prev char wasn't white */
1289                    {
1290                        this.waswhite = true;
1291                        lastc = c;
1292
1293                        if (mode != Preformatted && mode != IgnoreMarkup && c != ' ')
1294                            changeChar((byte)' ');
1295                    }
1296
1297                    continue;
1298                }
1299                else if (c == '&' && mode != IgnoreMarkup)
1300                    parseEntity(mode);
1301
1302                /* this is needed to avoid trimming trailing whitespace */
1303                if (mode == IgnoreWhitespace)
1304                    mode = MixedContent;
1305
1306                this.waswhite = false;
1307                continue;
1308
1309            case LEX_GT:  /* < */
1310
1311                /* check for endtag */
1312                if (c == '/')
1313                {
1314                    c = this.in.readChar();
1315                    if (c == StreamIn.EndOfStream)
1316                    {
1317                        this.in.ungetChar(c);
1318                        continue;
1319                    }
1320
1321                    addCharToLexer(c);
1322                    map = MAP((char)c);
1323
1324                    if ((map & LETTER) != 0)
1325                    {
1326                        this.lexsize -= 3;
1327                        this.txtend = this.lexsize;
1328                        this.in.ungetChar(c);
1329                        this.state = LEX_ENDTAG;
1330                        this.lexbuf[this.lexsize] = (byte)'\0';  /* debug */
1331                        this.in.curcol -= 2;
1332
1333                        /* if some text before the </ return it now */
1334                        if (this.txtend > this.txtstart)
1335                        {
1336                            /* trim space char before end tag */
1337                            if (mode == IgnoreWhitespace && this.lexbuf[this.lexsize - 1] == (byte)' ')
1338                            {
1339                                this.lexsize -= 1;
1340                                this.txtend = this.lexsize;
1341                            }
1342
1343                            this.token = newNode(Node.TextNode,
1344                                                  this.lexbuf,
1345                                                  this.txtstart,
1346                                                  this.txtend);
1347                            return this.token;
1348                        }
1349
1350                        continue;       /* no text so keep going */
1351                    }
1352
1353                    /* otherwise treat as CDATA */
1354                    this.waswhite = false;
1355                    this.state = LEX_CONTENT;
1356                    continue;
1357                }
1358
1359                if (mode == IgnoreMarkup)
1360                {
1361                    /* otherwise treat as CDATA */
1362                    this.waswhite = false;
1363                    this.state = LEX_CONTENT;
1364                    continue;
1365                }
1366
1367                /*
1368                   look out for comments, doctype or marked sections
1369                   this isn't quite right, but its getting there ...
1370                */
1371                if (c == '!')
1372                {
1373                    c = this.in.readChar();
1374
1375                    if (c == '-')
1376                    {
1377                        c = this.in.readChar();
1378
1379                        if (c == '-')
1380                        {
1381                            this.state = LEX_COMMENT;  /* comment */
1382                            this.lexsize -= 2;
1383                            this.txtend = this.lexsize;
1384
1385                            /* if some text before < return it now */
1386                            if (this.txtend > this.txtstart)
1387                            {
1388                                this.token = newNode(Node.TextNode,
1389                                                      this.lexbuf,
1390                                                      this.txtstart,
1391                                                      this.txtend);
1392                                return this.token;
1393                            }
1394
1395                            this.txtstart = this.lexsize;
1396                            continue;
1397                        }
1398
1399                        Report.warning(this, null, null, Report.MALFORMED_COMMENT);
1400                    }
1401                    else if (c == 'd' || c == 'D')
1402                    {
1403                        this.state = LEX_DOCTYPE; /* doctype */
1404                        this.lexsize -= 2;
1405                        this.txtend = this.lexsize;
1406                        mode = IgnoreWhitespace;
1407
1408                        /* skip until white space or '>' */
1409
1410                        for (;;)
1411                        {
1412                            c = this.in.readChar();
1413
1414                            if (c == StreamIn.EndOfStream || c == '>')
1415                            {
1416                                this.in.ungetChar(c);
1417                                break;
1418                            }
1419
1420                            map = MAP((char)c);
1421
1422                            if ((map & WHITE) == 0)
1423                                continue;
1424
1425                            /* and skip to end of whitespace */
1426
1427                            for (;;)
1428                            {
1429                                c = this.in.readChar();
1430
1431                                if (c == StreamIn.EndOfStream || c == '>')
1432                                {
1433                                    this.in.ungetChar(c);
1434                                    break;
1435                                }
1436
1437                                map = MAP((char)c);
1438
1439                                if ((map & WHITE) != 0)
1440                                    continue;
1441
1442                                this.in.ungetChar(c);
1443                                    break;
1444                            }
1445
1446                            break;
1447                        }
1448
1449                        /* if some text before < return it now */
1450                        if (this.txtend > this.txtstart)
1451                        {
1452                                this.token = newNode(Node.TextNode,
1453                                                      this.lexbuf,
1454                                                      this.txtstart,
1455                                                      this.txtend);
1456                                return this.token;
1457                        }
1458
1459                        this.txtstart = this.lexsize;
1460                        continue;
1461                    }
1462                    else if (c == '[')
1463                    {
1464                        /* Word 2000 embeds <![if ...]> ... <![endif]> sequences */
1465                        this.lexsize -= 2;
1466                        this.state = LEX_SECTION;
1467                        this.txtend = this.lexsize;
1468
1469                        /* if some text before < return it now */
1470                        if (this.txtend > this.txtstart)
1471                        {
1472                                this.token = newNode(Node.TextNode,
1473                                                      this.lexbuf,
1474                                                      this.txtstart,
1475                                                      this.txtend);
1476                                return this.token;
1477                        }
1478
1479                        this.txtstart = this.lexsize;
1480                        continue;
1481                    }
1482
1483                    /* otherwise swallow chars up to and including next '>' */
1484                    while (true)
1485                    {
1486                        c = this.in.readChar();
1487                        if (c == '>') break;
1488                        if (c == -1)
1489                        {
1490                            this.in.ungetChar(c);
1491                            break;
1492                        }
1493                    }
1494
1495                    this.lexsize -= 2;
1496                    this.lexbuf[this.lexsize] = (byte)'\0';
1497                    this.state = LEX_CONTENT;
1498                    continue;
1499                }
1500
1501                /*
1502                   processing instructions
1503                */
1504
1505                if (c == '?')
1506                {
1507                    this.lexsize -= 2;
1508                    this.state = LEX_PROCINSTR;
1509                    this.txtend = this.lexsize;
1510
1511                    /* if some text before < return it now */
1512                    if (this.txtend > this.txtstart)
1513                    {
1514                        this.token = newNode(Node.TextNode,
1515                                              this.lexbuf,
1516                                              this.txtstart,
1517                                              this.txtend);
1518                        return this.token;
1519                    }
1520
1521                    this.txtstart = this.lexsize;
1522                    continue;
1523                }
1524
1525                /* Microsoft ASP's e.g. <% ... server-code ... %> */
1526                if (c == '%')
1527                {
1528                    this.lexsize -= 2;
1529                    this.state = LEX_ASP;
1530                    this.txtend = this.lexsize;
1531
1532                    /* if some text before < return it now */
1533                    if (this.txtend > this.txtstart)
1534                    {
1535                        this.token = newNode(Node.TextNode,
1536                                              this.lexbuf,
1537                                              this.txtstart,
1538                                              this.txtend);
1539                        return this.token;
1540                    }
1541
1542                    this.txtstart = this.lexsize;
1543                    continue;
1544                }
1545
1546                /* Netscapes JSTE e.g. <# ... server-code ... #> */
1547                if (c == '#')
1548                {
1549                    this.lexsize -= 2;
1550                    this.state = LEX_JSTE;
1551                    this.txtend = this.lexsize;
1552
1553                    /* if some text before < return it now */
1554                    if (this.txtend > this.txtstart)
1555                    {
1556                        this.token = newNode(Node.TextNode,
1557                                              this.lexbuf,
1558                                              this.txtstart,
1559                                              this.txtend);
1560                        return this.token;
1561                    }
1562
1563                    this.txtstart = this.lexsize;
1564                    continue;
1565                }
1566
1567                map = MAP((char)c);
1568
1569                /* check for start tag */
1570                if ((map & LETTER) != 0)
1571                {
1572                    this.in.ungetChar(c);     /* push back letter */
1573                    this.lexsize -= 2;      /* discard "<" + letter */
1574                    this.txtend = this.lexsize;
1575                    this.state = LEX_STARTTAG;         /* ready to read tag name */
1576
1577                    /* if some text before < return it now */
1578                    if (this.txtend > this.txtstart)
1579                    {
1580                        this.token = newNode(Node.TextNode,
1581                                              this.lexbuf,
1582                                              this.txtstart,
1583                                              this.txtend);
1584                        return this.token;
1585                    }
1586
1587                    continue;       /* no text so keep going */
1588                }
1589
1590                /* otherwise treat as CDATA */
1591                this.state = LEX_CONTENT;
1592                this.waswhite = false;
1593                continue;
1594
1595            case LEX_ENDTAG:  /* </letter */
1596                this.txtstart = this.lexsize - 1;
1597                this.in.curcol += 2;
1598                c = parseTagName();
1599                this.token = newNode(Node.EndTag, /* create endtag token */
1600                                      this.lexbuf,
1601                                      this.txtstart,
1602                                      this.txtend,
1603                                      getString(this.lexbuf,
1604                                                 this.txtstart,
1605                                                 this.txtend - this.txtstart));
1606                this.lexsize = this.txtstart;
1607                this.txtend = this.txtstart;
1608
1609                /* skip to '>' */
1610                while (c != '>')
1611                {
1612                    c = this.in.readChar();
1613
1614                    if (c == StreamIn.EndOfStream)
1615                        break;
1616                }
1617
1618                if (c == StreamIn.EndOfStream)
1619                {
1620                    this.in.ungetChar(c);
1621                    continue;
1622                }
1623
1624                this.state = LEX_CONTENT;
1625                this.waswhite = false;
1626                return this.token;  /* the endtag token */
1627
1628            case LEX_STARTTAG: /* first letter of tagname */
1629                this.txtstart = this.lexsize - 1; /* set txtstart to first letter */
1630                c = parseTagName();
1631                isempty.value = false;
1632                attributes = null;
1633                this.token = newNode((isempty.value ? Node.StartEndTag : Node.StartTag),
1634                                      this.lexbuf,
1635                                      this.txtstart,
1636                                      this.txtend,
1637                                      getString(this.lexbuf,
1638                                                 this.txtstart,
1639                                                 this.txtend - this.txtstart));
1640
1641                /* parse attributes, consuming closing ">" */
1642                if (c != '>')
1643                {
1644                    if (c == '/')
1645                        this.in.ungetChar(c);
1646
1647                    attributes = parseAttrs(isempty);
1648                }
1649
1650                if (isempty.value)
1651                    this.token.type = Node.StartEndTag;
1652
1653                this.token.attributes = attributes;
1654                this.lexsize = this.txtstart;
1655                this.txtend = this.txtstart;
1656
1657                /* swallow newline following start tag */
1658                /* special check needed for CRLF sequence */
1659                /* this doesn't apply to empty elements */
1660
1661                if (expectsContent(this.token) ||
1662                    this.token.tag == configuration.tt.tagBr)
1663                {
1664
1665                    c = this.in.readChar();
1666
1667                    if (c == '\r')
1668                    {
1669                        c = this.in.readChar();
1670
1671                        if (c != '\n')
1672                            this.in.ungetChar(c);
1673                    }
1674                    else if (c != '\n' && c != '\f')
1675                        this.in.ungetChar(c);
1676
1677                    this.waswhite = true;  /* to swallow leading whitespace */
1678                }
1679                else
1680                    this.waswhite = false;
1681
1682                this.state = LEX_CONTENT;
1683
1684                if (this.token.tag == null)
1685                    Report.error(this, null, this.token, Report.UNKNOWN_ELEMENT);
1686                else if (!this.configuration.XmlTags)
1687                {
1688                    this.versions &= this.token.tag.versions;
1689                    
1690                    if ((this.token.tag.versions & Dict.VERS_PROPRIETARY) != 0)
1691                    {
1692                        if (!this.configuration.MakeClean && (this.token.tag == configuration.tt.tagNobr ||
1693                                                this.token.tag == configuration.tt.tagWbr))
1694                            Report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT);
1695                    }
1696
1697                    if (this.token.tag.chkattrs != null)
1698                    {
1699                        this.token.checkUniqueAttributes(this);
1700                        this.token.tag.chkattrs.check(this, this.token);
1701                    }
1702                    else
1703                        this.token.checkAttributes(this);
1704                }
1705
1706                return this.token;  /* return start tag */
1707
1708            case LEX_COMMENT:  /* seen <!-- so look for --> */
1709
1710                if (c != '-')
1711                    continue;
1712
1713                c = this.in.readChar();
1714                addCharToLexer(c);
1715
1716                if (c != '-')
1717                    continue;
1718
1719                end_comment: while (true) {
1720                    c = this.in.readChar();
1721
1722                    if (c == '>')
1723                    {
1724                        if (badcomment != 0)
1725                            Report.warning(this, null, null, Report.MALFORMED_COMMENT);
1726
1727                        this.txtend = this.lexsize - 2; // AQ 8Jul2000
1728                        this.lexbuf[this.lexsize] = (byte)'\0';
1729                        this.state = LEX_CONTENT;
1730                        this.waswhite = false;
1731                        this.token = newNode(Node.CommentTag,
1732                                              this.lexbuf,
1733                                              this.txtstart,
1734                                              this.txtend);
1735
1736                        /* now look for a line break */
1737
1738                        c = this.in.readChar();
1739
1740                        if (c == '\r')
1741                        {
1742                            c = this.in.readChar();
1743
1744                            if (c != '\n')
1745                                this.token.linebreak = true;
1746                        }
1747
1748                        if (c == '\n')
1749                            this.token.linebreak = true;
1750                        else
1751                            this.in.ungetChar(c);
1752
1753                        return this.token;
1754                    }
1755
1756                    /* note position of first such error in the comment */
1757                    if (badcomment == 0)
1758                    {
1759                        this.lines = this.in.curline;
1760                        this.columns = this.in.curcol - 3;
1761                    }
1762
1763                    badcomment++;
1764                    if (this.configuration.FixComments)
1765                        this.lexbuf[this.lexsize - 2] = (byte)'=';
1766
1767                    addCharToLexer(c);
1768
1769                    /* if '-' then look for '>' to end the comment */
1770                    if (c != '-')
1771                        break end_comment;
1772
1773                }
1774                /* otherwise continue to look for --> */
1775                this.lexbuf[this.lexsize - 2] = (byte)'=';
1776                continue;
1777
1778            case LEX_DOCTYPE:  /* seen <!d so look for '>' munging whitespace */
1779                map = MAP((char)c);
1780
1781                if ((map & WHITE) != 0)
1782                {
1783                    if (this.waswhite)
1784                        this.lexsize -= 1;
1785
1786                    this.waswhite = true;
1787                }
1788                else
1789                    this.waswhite = false;
1790
1791                if (c != '>')
1792                    continue;
1793
1794                this.lexsize -= 1;
1795                this.txtend = this.lexsize;
1796                this.lexbuf[this.lexsize] = (byte)'\0';
1797                this.state = LEX_CONTENT;
1798                this.waswhite = false;
1799                this.token = newNode(Node.DocTypeTag,
1800                                      this.lexbuf,
1801                                      this.txtstart,
1802                                      this.txtend);
1803                /* make a note of the version named by the doctype */
1804                this.doctype = findGivenVersion(this.token);
1805                return this.token;
1806
1807            case LEX_PROCINSTR:  /* seen <? so look for '>' */
1808                /* check for PHP preprocessor instructions <?php ... ?> */
1809
1810                if  (this.lexsize - this.txtstart == 3)
1811                {
1812                    if ((getString(this.lexbuf, this.txtstart, 3)).equals("php"))
1813                    {
1814                        this.state = LEX_PHP;
1815                        continue;
1816                    }
1817                }
1818
1819                if (this.configuration.XmlPIs)  /* insist on ?> as terminator */
1820                {
1821                    if (c != '?')
1822                        continue;
1823
1824                    /* now look for '>' */
1825                    c = this.in.readChar();
1826
1827                    if (c == StreamIn.EndOfStream)
1828                    {
1829                        Report.warning(this, null, null, Report.UNEXPECTED_END_OF_FILE);
1830                        this.in.ungetChar(c);
1831                        continue;
1832                    }
1833
1834                    addCharToLexer(c);
1835                }
1836
1837                if (c != '>')
1838                    continue;
1839
1840                this.lexsize -= 1;
1841                this.txtend = this.lexsize;
1842                this.lexbuf[this.lexsize] = (byte)'\0';
1843                this.state = LEX_CONTENT;
1844                this.waswhite = false;
1845                this.token = newNode(Node.ProcInsTag,
1846                                      this.lexbuf,
1847                                      this.txtstart,
1848                                      this.txtend);
1849                return this.token;
1850
1851            case LEX_ASP:  /* seen <% so look for "%>" */
1852                if (c != '%')
1853                    continue;
1854
1855                /* now look for '>' */
1856                c = this.in.readChar();
1857
1858
1859                if (c != '>')
1860                {
1861                    this.in.ungetChar(c);
1862                    continue;
1863                }
1864
1865                this.lexsize -= 1;
1866                this.txtend = this.lexsize;
1867                this.lexbuf[this.lexsize] = (byte)'\0';
1868                this.state = LEX_CONTENT;
1869                this.waswhite = false;
1870                this.token = newNode(Node.AspTag,
1871                                      this.lexbuf,
1872                                      this.txtstart,
1873                                      this.txtend);
1874                return this.token;
1875
1876            case LEX_JSTE:  /* seen <# so look for "#>" */
1877                if (c != '#')
1878                    continue;
1879
1880                /* now look for '>' */
1881                c = this.in.readChar();
1882
1883
1884                if (c != '>')
1885                {
1886                    this.in.ungetChar(c);
1887                    continue;
1888                }
1889
1890                this.lexsize -= 1;
1891                this.txtend = this.lexsize;
1892                this.lexbuf[this.lexsize] = (byte)'\0';
1893                this.state = LEX_CONTENT;
1894                this.waswhite = false;
1895                this.token = newNode(Node.JsteTag,
1896                                      this.lexbuf,
1897                                      this.txtstart,
1898                                      this.txtend);
1899                return this.token;
1900
1901            case LEX_PHP: /* seen "<?php" so look for "?>" */
1902                if (c != '?')
1903                    continue;
1904
1905                /* now look for '>' */
1906                c = this.in.readChar();
1907
1908                if (c != '>')
1909                {
1910                    this.in.ungetChar(c);
1911                    continue;
1912                }
1913
1914                this.lexsize -= 1;
1915                this.txtend = this.lexsize;
1916                this.lexbuf[this.lexsize] = (byte)'\0';
1917                this.state = LEX_CONTENT;
1918                this.waswhite = false;
1919                this.token = newNode(Node.PhpTag,
1920                                      this.lexbuf,
1921                                      this.txtstart,
1922                                      this.txtend);
1923                return this.token;
1924
1925            case LEX_SECTION: /* seen "<![" so look for "]>" */
1926                if (c == '[')
1927                {
1928                    if (this.lexsize == (this.txtstart + 6) &&
1929                        (getString(this.lexbuf, this.txtstart, 6)).equals("CDATA["))
1930                    {
1931                        this.state = LEX_CDATA;
1932                        this.lexsize -= 6;
1933                        continue;
1934                    }
1935                }
1936
1937                if (c != ']')
1938                    continue;
1939
1940                /* now look for '>' */
1941                c = this.in.readChar();
1942
1943                if (c != '>')
1944                {
1945                    this.in.ungetChar(c);
1946                    continue;
1947                }
1948
1949                this.lexsize -= 1;
1950                this.txtend = this.lexsize;
1951                this.lexbuf[this.lexsize] = (byte)'\0';
1952                this.state = LEX_CONTENT;
1953                this.waswhite = false;
1954                this.token = newNode(Node.SectionTag,
1955                                      this.lexbuf,
1956                                      this.txtstart,
1957                                      this.txtend);
1958                return this.token;
1959
1960            case LEX_CDATA: /* seen "<![CDATA[" so look for "]]>" */
1961                if (c != ']')
1962                    continue;
1963
1964                /* now look for ']' */
1965                c = this.in.readChar();
1966
1967                if (c != ']')
1968                {
1969                    this.in.ungetChar(c);
1970                    continue;
1971                }
1972
1973                /* now look for '>' */
1974                c = this.in.readChar();
1975
1976                if (c != '>')
1977                {
1978                    this.in.ungetChar(c);
1979                    continue;
1980                }
1981
1982                this.lexsize -= 1;
1983                this.txtend = this.lexsize;
1984                this.lexbuf[this.lexsize] = (byte)'\0';
1985                this.state = LEX_CONTENT;
1986                this.waswhite = false;
1987                this.token = newNode(Node.CDATATag,
1988                                      this.lexbuf,
1989                                      this.txtstart,
1990                                      this.txtend);
1991                return this.token;
1992            }
1993        }
1994
1995        if (this.state == LEX_CONTENT)  /* text string */
1996        {
1997            this.txtend = this.lexsize;
1998
1999            if (this.txtend > this.txtstart)
2000            {
2001                this.in.ungetChar(c);
2002
2003                if (this.lexbuf[this.lexsize - 1] == (byte)' ')
2004                {
2005                    this.lexsize -= 1;
2006                    this.txtend = this.lexsize;
2007                }
2008
2009                this.token = newNode(Node.TextNode,
2010                                      this.lexbuf,
2011                                      this.txtstart,
2012                                      this.txtend);
2013                return this.token;
2014            }
2015        }
2016        else if (this.state == LEX_COMMENT) /* comment */
2017        {
2018            if (c == StreamIn.EndOfStream)
2019                Report.warning(this, null, null, Report.MALFORMED_COMMENT);
2020
2021            this.txtend = this.lexsize;
2022            this.lexbuf[this.lexsize] = (byte)'\0';
2023            this.state = LEX_CONTENT;
2024            this.waswhite = false;
2025            this.token = newNode(Node.CommentTag,
2026                                  this.lexbuf,
2027                                  this.txtstart,
2028                                  this.txtend);
2029            return this.token;
2030        }
2031
2032        return null;
2033    }
2034
2035    /*
2036     parser for ASP within start tags
2037
2038     Some people use ASP to customize attributes
2039     Tidy isn't really well suited to dealing with ASP
2040     This is a workaround for attributes, but won't
2041     deal with the case where the ASP is used to tailor
2042     the attribute value. Here is an example of a work
2043     around for using ASP in attribute values:
2044
2045      href="<%=rsSchool.Fields("ID").Value%>"
2046
2047     where the ASP that generates the attribute value
2048     is masked from Tidy by the quotemarks.
2049
2050    */
2051
2052    public Node parseAsp()
2053    {
2054        int c;
2055        Node asp = null;
2056
2057        this.txtstart = this.lexsize;
2058
2059        for (;;)
2060        {
2061            c = this.in.readChar();
2062            addCharToLexer(c);
2063
2064
2065            if (c != '%')
2066                continue;
2067
2068            c = this.in.readChar();
2069            addCharToLexer(c);
2070
2071            if (c == '>')
2072                break;
2073        }
2074
2075        this.lexsize -= 2;
2076        this.txtend = this.lexsize;
2077
2078        if (this.txtend > this.txtstart)
2079            asp = newNode(Node.AspTag,
2080                           this.lexbuf,
2081                           this.txtstart,
2082                           this.txtend);
2083
2084        this.txtstart = this.txtend;
2085        return asp;
2086    }   
2087 
2088    /*
2089     PHP is like ASP but is based upon XML
2090     processing instructions, e.g. <?php ... ?>
2091    */
2092    public Node parsePhp()
2093    {
2094        int c;
2095        Node php = null;
2096
2097        this.txtstart = this.lexsize;
2098
2099        for (;;)
2100        {
2101            c = this.in.readChar();
2102            addCharToLexer(c);
2103
2104
2105            if (c != '?')
2106                continue;
2107
2108            c = this.in.readChar();
2109            addCharToLexer(c);
2110
2111            if (c == '>')
2112                break;
2113        }
2114
2115        this.lexsize -= 2;
2116        this.txtend = this.lexsize;
2117
2118        if (this.txtend > this.txtstart)
2119            php = newNode(Node.PhpTag,
2120                           this.lexbuf,
2121                           this.txtstart,
2122                           this.txtend);
2123
2124        this.txtstart = this.txtend;
2125        return php;
2126    }   
2127
2128    /* consumes the '>' terminating start tags */
2129    public String   parseAttribute(MutableBoolean isempty, MutableObject asp,
2130                                 MutableObject php)
2131    {
2132        int start = 0;
2133        // int len = 0;   Removed by BUGFIX for 126265
2134        short map;
2135        String   attr;
2136        int c = 0;
2137
2138        asp.setObject(null);  /* clear asp pointer */
2139        php.setObject(null);  /* clear php pointer */
2140        /* skip white space before the attribute */
2141
2142        for (;;)
2143        {
2144            c = this.in.readChar();
2145
2146            if (c == '/')
2147            {
2148                c = this.in.readChar();
2149
2150                if (c == '>')
2151                {
2152                    isempty.value = true;
2153                    return null;
2154                }
2155
2156                this.in.ungetChar(c);
2157                c = '/';
2158                break;
2159            }
2160
2161            if (c == '>')
2162                return null;
2163
2164            if (c =='<')
2165            {
2166                c = this.in.readChar();
2167
2168                if (c == '%')
2169                {
2170                    asp.setObject(parseAsp());
2171                    return null;
2172                }
2173                else if (c == '?')
2174                {
2175                    php.setObject(parsePhp());
2176                    return null;
2177                }
2178
2179                this.in.ungetChar(c);
2180                Report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
2181                return null;
2182            }
2183
2184            if (c == '"' || c == '\'')
2185            {
2186                Report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
2187                continue;
2188            }
2189
2190            if (c == StreamIn.EndOfStream)
2191            {
2192                Report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
2193                this.in.ungetChar(c);
2194                return null;
2195            }
2196
2197            map = MAP((char)c);
2198
2199            if ((map & WHITE) == 0)
2200                break;
2201        }
2202
2203        start = this.lexsize;
2204
2205        for (;;)
2206        {
2207         /* but push back '=' for parseValue() */
2208            if (c == '=' || c == '>')
2209            {
2210                this.in.ungetChar(c);
2211                break;
2212            }
2213
2214            if (c == '<' || c == StreamIn.EndOfStream)
2215            {
2216                this.in.ungetChar(c);
2217                break;
2218            }
2219
2220            map = MAP((char)c);
2221
2222            if ((map & WHITE) != 0)
2223                break;
2224
2225         /* what should be done about non-namechar characters? */
2226         /* currently these are incorporated into the attr name */
2227
2228            if (!this.configuration.XmlTags && (map & UPPERCASE) != 0)
2229                c += (int)('a' - 'A');
2230
2231            //  ++len;    Removed by BUGFIX for 126265 
2232            addCharToLexer(c);
2233
2234            c = this.in.readChar();
2235        }
2236
2237        // Following line added by GLP to fix BUG 126265.  This is a temporary comment
2238        // and should be removed when Tidy is fixed.
2239        int len = this.lexsize - start;
2240        attr = (len > 0 ? getString(this.lexbuf, start, len) : null);
2241        this.lexsize = start;
2242
2243        return attr;
2244    }
2245
2246    /*
2247     invoked when < is seen in place of attribute value
2248     but terminates on whitespace if not ASP, PHP or Tango
2249     this routine recognizes ' and " quoted strings
2250    */
2251    public int parseServerInstruction()
2252    {
2253        int c, map, delim = '"';
2254        boolean isrule = false;
2255
2256        c = this.in.readChar();
2257        addCharToLexer(c);
2258
2259        /* check for ASP, PHP or Tango */
2260        if (c == '%' || c == '?' || c == '@')
2261            isrule = true;
2262
2263        for (;;)
2264        {
2265            c = this.in.readChar();
2266
2267            if (c == StreamIn.EndOfStream)
2268                break;
2269
2270            if (c == '>')
2271            {
2272                if (isrule)
2273                    addCharToLexer(c);
2274                else
2275                    this.in.ungetChar(c);
2276
2277                break;
2278            }
2279
2280            /* if not recognized as ASP, PHP or Tango */
2281            /* then also finish value on whitespace */
2282            if (!isrule)
2283            {
2284                map = MAP((char)c);
2285
2286                if ((map & WHITE) != 0)
2287                    break;
2288            }
2289
2290            addCharToLexer(c);
2291
2292            if (c == '"')
2293            {
2294                do
2295                {
2296                    c = this.in.readChar();
2297                    addCharToLexer(c);
2298                }
2299                while (c != '"');
2300                delim = '\'';
2301                continue;
2302            }
2303
2304            if (c == '\'')
2305            {
2306                do
2307                {
2308                    c = this.in.readChar();
2309                    addCharToLexer(c);
2310                }
2311                while (c != '\'');
2312            }
2313        }
2314
2315        return delim;
2316    }
2317
2318    /* values start with "=" or " = " etc. */
2319    /* doesn't consume the ">" at end of start tag */
2320
2321    public String   parseValue(String   name, boolean foldCase,
2322                             MutableBoolean isempty, MutableInteger pdelim)
2323    {
2324        int len = 0;
2325        int start;
2326        short map;
2327        boolean seen_gt = false;
2328        boolean munge = true;
2329        int c = 0;
2330        int lastc, delim, quotewarning;
2331        String   value;
2332
2333        delim = 0;
2334        pdelim.value = (int)'"';
2335
2336        /*
2337         Henry Zrepa reports that some folk are using the
2338         embed element with script attributes where newlines
2339         are significant and must be preserved
2340        */
2341        if (configuration.LiteralAttribs)
2342            munge = false;
2343
2344        /* skip white space before the '=' */
2345
2346        for (;;)
2347        {
2348            c = this.in.readChar();
2349
2350            if (c == StreamIn.EndOfStream)
2351            {
2352                this.in.ungetChar(c);
2353                break;
2354            }
2355
2356            map = MAP((char)c);
2357
2358            if ((map & WHITE) == 0)
2359               break;
2360        }
2361
2362    /*
2363      c should be '=' if there is a value
2364      other legal possibilities are white
2365      space, '/' and '>'
2366    */
2367
2368        if (c != '=')
2369        {
2370            this.in.ungetChar(c);
2371            return null;
2372        }
2373
2374     /* skip white space after '=' */
2375
2376        for (;;)
2377        {
2378            c = this.in.readChar();
2379
2380            if (c == StreamIn.EndOfStream)
2381            {
2382                this.in.ungetChar(c);
2383                break;
2384            }
2385
2386            map = MAP((char)c);
2387
2388            if ((map & WHITE) == 0)
2389               break;
2390        }
2391
2392     /* check for quote marks */
2393
2394        if (c == '"' || c == '\'')
2395            delim = c;
2396        else if (c == '<')
2397        {
2398            start = this.lexsize;
2399            addCharToLexer(c);
2400            pdelim.value = parseServerInstruction();
2401            len = this.lexsize - start;
2402            this.lexsize = start;
2403            return (len > 0 ? getString(this.lexbuf, start, len) : null);
2404        }
2405        else
2406            this.in.ungetChar(c);
2407
2408     /*
2409       and read the value string
2410       check for quote mark if needed
2411     */
2412
2413        quotewarning = 0;
2414        start = this.lexsize;
2415        c = '\0';
2416
2417        for (;;)
2418        {
2419            lastc = c;  /* track last character */
2420            c = this.in.readChar();
2421
2422            if (c == StreamIn.EndOfStream)
2423            {
2424                Report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
2425                this.in.ungetChar(c);
2426                break;
2427            }
2428
2429            if (delim == (char)0)
2430            {
2431                if (c == '>')
2432                {
2433                    this.in.ungetChar(c);
2434                    break;
2435                }
2436
2437                if (c == '"' || c == '\'')
2438                {
2439                    Report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
2440                    break;
2441                }
2442
2443                if (c == '<')
2444                {
2445                    /* this.in.ungetChar(c); */
2446                    Report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
2447                    /* break; */
2448                }
2449
2450                /*
2451                 For cases like <br clear=all/> need to avoid treating /> as
2452                 part of the attribute value, however care is needed to avoid
2453                 so treating <a HREF=http://www.acme.com/> in this way, which
2454                 would map the <a> tag to <a HREF="http://www.acme.com"/>
2455                */
2456                if (c == '/')
2457                {
2458                    /* peek ahead in case of /> */
2459                    c = this.in.readChar();
2460
2461                    if (c == '>' &&
2462                        !AttributeTable.getDefaultAttributeTable().isUrl(name))
2463                    {
2464                        isempty.value = true;
2465                        this.in.ungetChar(c);
2466                        break;
2467                    }
2468
2469                    /* unget peeked char */
2470                    this.in.ungetChar(c);
2471                    c = '/';
2472                }
2473            }
2474            else  /* delim is '\'' or '"' */
2475            {
2476                if (c == delim)
2477                    break;
2478
2479                /* treat CRLF, CR and LF as single line break */
2480
2481                if (c == '\r')
2482                {
2483                    c = this.in.readChar();
2484                    if (c != '\n')
2485                        this.in.ungetChar(c);
2486
2487                    c = '\n';
2488                }
2489
2490                if (c == '\n' || c == '<' || c == '>')
2491                    ++quotewarning;
2492
2493                if (c == '>')
2494                    seen_gt = true;
2495            }
2496
2497            if (c == '&')
2498            {
2499                addCharToLexer(c);
2500                parseEntity((short)0);
2501                continue;
2502            }
2503
2504            /*
2505             kludge for JavaScript attribute values
2506             with line continuations in string literals
2507            */
2508            if (c == '\\')
2509            {
2510                c = this.in.readChar();
2511
2512                if (c != '\n')
2513                {
2514                    this.in.ungetChar(c);
2515                    c = '\\';
2516                }
2517            }
2518
2519            map = MAP((char)c);
2520
2521            if ((map & WHITE) != 0)
2522            {
2523                if (delim == (char)0)
2524                    break;
2525
2526                if (munge)
2527                {
2528                    c = ' ';
2529
2530                    if (lastc == ' ')
2531                        continue;
2532                }
2533            }
2534            else if (foldCase && (map & UPPERCASE) != 0)
2535                c += (int)('a' - 'A');
2536
2537            addCharToLexer(c);
2538        }
2539
2540        if (quotewarning > 10 && seen_gt && munge)
2541        {
2542            /*
2543               there is almost certainly a missing trailling quote mark
2544               as we have see too many newlines, < or > characters.
2545
2546               an exception is made for Javascript attributes and the
2547               javascript URL scheme which may legitimately include < and >
2548            */
2549            if (!AttributeTable.getDefaultAttributeTable().isScript(name) &&
2550                !(AttributeTable.getDefaultAttributeTable().isUrl(name) &&
2551                  (getString(this.lexbuf, start, 11)).equals("javascript:")))
2552                    Report.error(this, null, null, Report.SUSPECTED_MISSING_QUOTE);
2553        }
2554
2555        len = this.lexsize - start;
2556        this.lexsize = start;
2557
2558        if (len > 0 || delim != 0)
2559            value = getString(this.lexbuf, start, len);
2560        else
2561            value = null;
2562
2563        /* note delimiter if given */
2564        if (delim != 0)
2565            pdelim.value = delim;
2566        else
2567            pdelim.value = (int)'"';
2568
2569        return value;
2570    }
2571
2572    /* attr must be non-null */
2573    public static boolean isValidAttrName(String   attr)
2574    {
2575        short map;
2576        char c;
2577        int i;
2578
2579        /* first character should be a letter */
2580        c = attr.charAt(0);
2581        map = MAP(c);
2582
2583        if (!((map & LETTER) != 0))
2584            return false;
2585
2586        /* remaining characters should be namechars */
2587        for( i = 1; i < attr.length(); i++)
2588        {
2589            c = attr.charAt(i);
2590            map = MAP(c);
2591
2592            if((map & NAMECHAR) != 0)
2593                continue;
2594
2595            return false;
2596        }
2597
2598        return true;
2599    }
2600
2601    /* swallows closing '>' */
2602
2603    public AttVal parseAttrs(MutableBoolean isempty)
2604    {
2605        AttVal av, list;
2606        String   attribute, value;
2607        MutableInteger delim = new MutableInteger();
2608        MutableObject asp = new MutableObject();
2609        MutableObject php = new MutableObject();
2610
2611        list = null;
2612
2613        for (; !endOfInput();)
2614        {
2615            attribute = parseAttribute(isempty, asp, php);
2616
2617            if (attribute == null)
2618            {
2619                /* check if attributes are created by ASP markup */
2620                if (asp.getObject() != null)
2621                {
2622                    av = new AttVal(list, null, (Node)asp.getObject(), null,
2623                                    '\0', null, null );
2624                    list = av;
2625                    continue;
2626                }
2627
2628                /* check if attributes are created by PHP markup */
2629                if (php.getObject() != null)
2630                {
2631                    av = new AttVal(list, null, null, (Node)php.getObject(),
2632                                    '\0', null, null );
2633                    list = av;
2634                    continue;
2635                }
2636
2637                break;
2638            }
2639
2640            value = parseValue(attribute, false, isempty, delim);
2641
2642            if (attribute != null && isValidAttrName(attribute))
2643            {
2644                av = new AttVal( list, null, null, null,
2645                                 delim.value, attribute, value );
2646                av.dict =
2647                    AttributeTable.getDefaultAttributeTable().findAttribute(av);
2648                list = av;
2649            }
2650            else
2651            {
2652                av = new AttVal( null, null, null, null,
2653                                 0, attribute, value );
2654                Report.attrError(this, this.token, value, Report.BAD_ATTRIBUTE_VALUE);
2655            }
2656        }
2657
2658        return list;
2659    }
2660
2661    /*
2662      push a copy of an inline node onto stack
2663      but don't push if implicit or OBJECT or APPLET
2664      (implicit tags are ones generated from the istack)
2665
2666      One issue arises with pushing inlines when
2667      the tag is already pushed. For instance:
2668
2669          <p><em>text
2670          <p><em>more text
2671
2672      Shouldn't be mapped to
2673
2674          <p><em>text</em></p>
2675          <p><em><em>more text</em></em>
2676    */
2677    public void pushInline( Node node )
2678    {
2679        IStack is;
2680
2681        if (node.implicit)
2682            return;
2683
2684        if (node.tag == null)
2685            return;
2686
2687        if ((node.tag.model & Dict.CM_INLINE) == 0 )
2688            return;
2689
2690        if ((node.tag.model & Dict.CM_OBJECT) != 0)
2691            return;
2692
2693        if (node.tag != configuration.tt.tagFont && isPushed(node))
2694            return;
2695
2696        // make sure there is enough space for the stack
2697        is = new IStack();
2698        is.tag = node.tag;
2699        is.element = node.element;
2700        if (node.attributes != null)
2701            is.attributes = cloneAttributes(node.attributes);
2702        this.istack.push( is );
2703    }
2704
2705    /* pop inline stack */
2706    public void popInline( Node node )
2707    {
2708        AttVal av;
2709        IStack is;
2710
2711        if (node != null) {
2712
2713            if (node.tag == null)
2714                return;
2715
2716            if ((node.tag.model & Dict.CM_INLINE) == 0)
2717                return;
2718
2719            if ((node.tag.model & Dict.CM_OBJECT) != 0)
2720                return;
2721
2722            // if node is </a> then pop until we find an <a>
2723            if (node.tag == configuration.tt.tagA) {
2724
2725                while (this.istack.size() > 0) {
2726                    is = (IStack)this.istack.pop();
2727                    if (is.tag == configuration.tt.tagA) {
2728                        break;
2729                    }
2730                }
2731
2732                if (this.insert >= this.istack.size())
2733                    this.insert = -1;
2734                return;
2735            }
2736        }
2737
2738        if (this.istack.size() > 0) {
2739            is = (IStack)this.istack.pop();
2740            if (this.insert >= this.istack.size())
2741                this.insert = -1;
2742        }
2743    }
2744
2745    public boolean isPushed( Node node )
2746    {
2747        int i;
2748        IStack is;
2749
2750        for (i = this.istack.size() - 1; i >= 0; --i) {
2751            is = (IStack)this.istack.elementAt(i);
2752            if (is.tag == node.tag)
2753                return true;
2754        }
2755
2756        return false;
2757    }
2758
2759    /*
2760      This has the effect of inserting "missing" inline
2761      elements around the contents of blocklevel elements
2762      such as P, TD, TH, DIV, PRE etc. This procedure is
2763      called at the start of ParseBlock. when the inline
2764      stack is not empty, as will be the case in:
2765
2766        <i><h1>italic heading</h1></i>
2767
2768      which is then treated as equivalent to
2769
2770        <h1><i>italic heading</i></h1>
2771
2772      This is implemented by setting the lexer into a mode
2773      where it gets tokens from the inline stack rather than
2774      from the input stream.
2775    */
2776    public int inlineDup( Node node )
2777    {
2778        int n;
2779
2780        n = this.istack.size() - this.istackbase;
2781        if ( n > 0 ) {
2782            this.insert = this.istackbase;
2783            this.inode = node;
2784        }
2785
2786        return n;
2787    }
2788
2789    public Node insertedToken()
2790    {
2791        Node node;
2792        IStack is;
2793        int n;
2794
2795        // this will only be null if inode != null
2796        if (this.insert == -1) {
2797            node = this.inode;
2798            this.inode = null;
2799            return node;
2800        }
2801
2802        // is this is the "latest" node then update
2803        // the position, otherwise use current values
2804
2805        if (this.inode == null) {
2806            this.lines = this.in.curline;
2807            this.columns = this.in.curcol;
2808        }
2809
2810        node = newNode(Node.StartTag,
2811                        this.lexbuf,
2812                        this.txtstart,
2813                        this.txtend);   // GLP:  Bugfix 126261.  Remove when this change
2814                                        //       is fixed in istack.c in the original Tidy
2815        node.implicit = true;
2816        is = (IStack)this.istack.elementAt( this.insert );
2817        node.element = is.element;
2818        node.tag = is.tag;
2819        if (is.attributes != null)
2820            node.attributes = cloneAttributes(is.attributes);
2821
2822        // advance lexer to next item on the stack
2823        n = this.insert;
2824
2825        // and recover state if we have reached the end
2826        if (++n < this.istack.size() ) {
2827            this.insert = n;
2828        } else {
2829            this.insert = -1;
2830        }
2831
2832        return node;
2833    }
2834
2835    /* AQ: Try this for speed optimization */
2836    public static int wstrcasecmp(String   s1, String   s2)
2837    {
2838        return (s1.equalsIgnoreCase(s2) ? 0 : 1);
2839    }
2840
2841    public static int wstrcaselexcmp(String   s1, String   s2)
2842    {
2843        char c;
2844        int i = 0;
2845
2846        while ( i < s1.length() && i < s2.length() ) {
2847            c = s1.charAt(i);
2848            if ( toLower(c) != toLower( s2.charAt(i) ) ) {
2849                break;
2850            }
2851            i += 1;
2852        }
2853        if ( i == s1.length() && i == s2.length() ) {
2854            return 0;
2855        } else if ( i == s1.length() ) {
2856            return -1;
2857        } else if ( i == s2.length() ) {
2858            return 1;
2859        } else {
2860            return ( s1.charAt(i) > s2.charAt(i) ? 1 : -1 );
2861        }
2862    }
2863
2864    public static boolean wsubstr(String   s1, String   s2)
2865    {
2866        int i;
2867        int len1 = s1.length();
2868        int len2 = s2.length();
2869
2870        for (i = 0; i <= len1 - len2; ++i)
2871        {
2872            if (s2.equalsIgnoreCase(s1.substring(i)))
2873                return true;
2874        }
2875
2876        return false;
2877    }
2878
2879    public boolean canPrune(Node element)
2880    {
2881        if (element.type == Node.TextNode)
2882            return true;
2883
2884        if (element.content != null)
2885            return false;
2886
2887        if (element.tag == configuration.tt.tagA && element.attributes != null)
2888            return false;
2889
2890        if (element.tag == configuration.tt.tagP && !this.configuration.DropEmptyParas)
2891            return false;
2892
2893        if (element.tag == null)
2894            return false;
2895
2896        if ((element.tag.model & Dict.CM_ROW) != 0)
2897            return false;
2898
2899        if (element.tag == configuration.tt.tagApplet)
2900            return false;
2901
2902        if (element.tag == configuration.tt.tagObject)
2903            return false;
2904
2905        if (element.attributes != null &&
2906            (element.getAttrByName("id") != null ||
2907               element.getAttrByName("name") != null) )
2908            return false;
2909
2910        return true;
2911    }
2912
2913    /* duplicate name attribute as an id */
2914    public void fixId(Node node)
2915    {
2916        AttVal name = node.getAttrByName("name");
2917        AttVal id = node.getAttrByName("id");
2918
2919        if (name != null)
2920        {
2921            if (id != null)
2922            {
2923                if (!id.value.equals(name.value))
2924                    Report.attrError(this, node, "name", Report.ID_NAME_MISMATCH);
2925            }
2926            else if (this.configuration.XmlOut)
2927                node.addAttribute("id", name.value);
2928        }
2929    }
2930
2931    /*
2932     defer duplicates when entering a table or other
2933     element where the inlines shouldn't be duplicated
2934    */
2935    public void deferDup()
2936    {
2937        this.insert = -1;
2938        this.inode = null;
2939    }
2940
2941    /* Private methods and fields */
2942
2943    /* lexer char types: bit flags, several of which may be
           OR-ed together in a single lexmap entry */
2944    private static final short DIGIT       = 1;
2945    private static final short LETTER      = 2;
2946    private static final short NAMECHAR    = 4;
2947    private static final short WHITE       = 8;
2948    private static final short NEWLINE     = 16;
2949    private static final short LOWERCASE   = 32;
2950    private static final short UPPERCASE   = 64;
2951
2952    /* lexer GetToken states */
2953
2954    private static final short LEX_CONTENT     = 0;
2955    private static final short LEX_GT          = 1;
2956    private static final short LEX_ENDTAG      = 2;
2957    private static final short LEX_STARTTAG    = 3;
2958    private static final short LEX_COMMENT     = 4;
2959    private static final short LEX_DOCTYPE     = 5;
2960    private static final short LEX_PROCINSTR   = 6;
2961    private static final short LEX_ENDCOMMENT  = 7;
2962    private static final short LEX_CDATA       = 8;
2963    private static final short LEX_SECTION     = 9;
2964    private static final short LEX_ASP         = 10;
2965    private static final short LEX_JSTE        = 11;
2966    private static final short LEX_PHP         = 12;
2967
2968    /* used to classify chars for lexical purposes: indexed by
           character code, one entry for each of the codes 0..127,
           each entry holding the char type flags of that character */
2969    private static short[] lexmap = new short[128];
2970
2971    private static void mapStr(String   str, short code)
2972    {
2973        int j;
2974
2975        for ( int i = 0; i < str.length(); i++ ) {
2976            j = (int)str.charAt(i);
2977            lexmap[j] |= code;
2978        }
2979    }
2980
2981    static {
2982        mapStr("\r\n\f", (short)(NEWLINE|WHITE));
2983        mapStr(" \t", WHITE);
2984        mapStr("-.:_", NAMECHAR);
2985        mapStr("0123456789", (short)(DIGIT|NAMECHAR));
2986        mapStr("abcdefghijklmnopqrstuvwxyz", (short)(LOWERCASE|LETTER|NAMECHAR));
2987        mapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", (short)(UPPERCASE|LETTER|NAMECHAR));
2988    }
2989
2990    private static short MAP( char c )
2991    {
2992        return ((int)c < 128 ? lexmap[(int)c] : 0);
2993    }
2994
2995    private static boolean isWhite(char c)
2996    {
2997        short m = MAP(c);
2998
2999        return (m & WHITE) != 0;
3000    }
3001
3002    private static boolean isDigit(char c)
3003    {
3004        short m;
3005
3006        m = MAP(c);
3007
3008        return (m & DIGIT) != 0;
3009    }
3010
3011    private static boolean isLetter(char c)
3012    {
3013        short m;
3014
3015        m = MAP(c);
3016
3017        return (m & LETTER) != 0;
3018    }
3019
3020    private static char toLower(char c)
3021    {
3022        short m = MAP(c);
3023
3024        if ((m & UPPERCASE) != 0)
3025            c = (char)( (int)c + (int)'a' - (int)'A' );
3026
3027        return c;
3028    }
3029
3030    private static char toUpper(char c)
3031    {
3032        short m = MAP(c);
3033
3034        if ((m & LOWERCASE) != 0)
3035            c = (char)( (int)c + (int)'A' - (int)'a' );
3036
3037        return c;
3038    }
3039
3040    public static char foldCase(char c, boolean tocaps, boolean xmlTags)
3041    {
3042        short m;
3043
3044        if (!xmlTags)
3045        {
3046            m = MAP(c);
3047
3048            if (tocaps)
3049            {
3050                if ((m & LOWERCASE) != 0)
3051                    c = (char)( (int)c + (int)'A' - (int)'a' );
3052            }
3053            else /* force to lower case */
3054            {
3055                if ((m & UPPERCASE) != 0)
3056                    c = (char)( (int)c + (int)'a' - (int)'A' );
3057            }
3058        }
3059
3060        return c;
3061    }
3062
3063
3064    private static class W3CVersionInfo
3065    {
3066        String   name;
3067        String   voyagerName;
3068        String   profile;
3069        short code;
3070
3071        public W3CVersionInfo( String   name,
3072                               String   voyagerName,
3073                               String   profile,
3074                               short code )
3075        {
3076            this.name = name;
3077            this.voyagerName = voyagerName;
3078            this.profile = profile;
3079            this.code = code;
3080        }
3081    }
3082
3083    /* the 3 URIs for the XHTML 1.0 DTDs (transitional, strict
           and frameset) */
3084    private static final String   voyager_loose    = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";
3085    private static final String   voyager_strict   = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
3086    private static final String   voyager_frameset = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd";
3087
        /* the XHTML namespace URI */
3088    private static final String   XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
3089
        /*
          Table of the known HTML versions. Each row gives the HTML
          version name, the name of the corresponding XHTML 1.0
          flavour, the DTD URI of that flavour and the Dict version
          code. Several rows share the same XHTML flavour and code.
        */
3090    private static Lexer.W3CVersionInfo[] W3CVersion =
3091    {
3092        new W3CVersionInfo("HTML 4.01",
3093                           "XHTML 1.0 Strict",
3094                           voyager_strict,
3095                           Dict.VERS_HTML40_STRICT),
3096        new W3CVersionInfo("HTML 4.01 Transitional",
3097                           "XHTML 1.0 Transitional",
3098                           voyager_loose,
3099                           Dict.VERS_HTML40_LOOSE),
3100        new W3CVersionInfo("HTML 4.01 Frameset",
3101                           "XHTML 1.0 Frameset",
3102                           voyager_frameset,
3103                           Dict.VERS_FRAMES),
3104        new W3CVersionInfo("HTML 4.0",
3105                           "XHTML 1.0 Strict",
3106                           voyager_strict,
3107                           Dict.VERS_HTML40_STRICT),
3108        new W3CVersionInfo("HTML 4.0 Transitional",
3109                           "XHTML 1.0 Transitional",
3110                           voyager_loose,
3111                           Dict.VERS_HTML40_LOOSE),
3112        new W3CVersionInfo("HTML 4.0 Frameset",
3113                           "XHTML 1.0 Frameset",
3114                           voyager_frameset,
3115                           Dict.VERS_FRAMES),
3116        new W3CVersionInfo("HTML 3.2",
3117                           "XHTML 1.0 Transitional",
3118                           voyager_loose,
3119                           Dict.VERS_HTML32),
3120        new W3CVersionInfo("HTML 2.0",
3121                           "XHTML 1.0 Strict",
3122                           voyager_strict,
3123                           Dict.VERS_HTML20)
3124    };
3125
3126}
3127
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags