Parser


1   /*
2    * @(#)Parser.java  1.43 05/05/27
3    *
4    * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
5    * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
6    */
7   
8   package javax.swing.text.html.parser;
9   
10  import javax.swing.text.SimpleAttributeSet  ;
11  import javax.swing.text.html.HTML  ;
12  import javax.swing.text.ChangedCharSetException  ;
13  import java.io.*;
14  import java.util.Hashtable  ;
15  import java.util.Properties  ;
16  import java.util.Vector  ;
17  import java.util.Enumeration  ;
18  import java.net.URL  ;
19  
20  import sun.misc.MessageUtils;
21  
22  /**
23   * A simple DTD-driven HTML parser. The parser reads an
24   * HTML file from an InputStream and calls various methods
25   * (which should be overridden in a subclass) when tags and
26   * data are encountered.
27   * <p>
28   * Unfortunately there are many badly implemented HTML parsers
29   * out there, and as a result there are many badly formatted
30   * HTML files. This parser attempts to parse most HTML files.
31   * This means that the implementation sometimes deviates from
32   * the SGML specification in favor of HTML.
33   * <p>
34   * The parser treats \r and \r\n as \n. Newlines after starttags
35   * and before end tags are ignored just as specified in the SGML/HTML
36   * specification.
37   * <p>
38   * The html spec does not specify how spaces are to be coalesced very well.
39   * Specifically, the following scenarios are not discussed (note that a
40   * space should be used here, but I am using &amp;nbsp to force the space to
41   * be displayed):
42   * <p>
43   * '&lt;b>blah&nbsp;&lt;i>&nbsp;&lt;strike>&nbsp;foo' which can be treated as:
44   * '&lt;b>blah&nbsp;&lt;i>&lt;strike>foo' 
45   * <p>as well as:
46   * '&lt;p>&lt;a HREF="xx">&nbsp;&lt;em>Using&lt;/em>&lt;/a>&lt;/p>'
47   * which appears to be treated as:
48   * '&lt;p>&lt;a HREF="xx">&lt;em>Using&lt;/em>&lt;/a>&lt;/p>'
49   * <p>
 * If <code>strict</code> is false, when a tag that breaks flow
 * (<code>TagElement.breaksFlow</code>) or trailing whitespace is
 * encountered, all whitespace will be ignored until a non-whitespace
 * character is encountered. This appears to give behavior closer to
 * the popular browsers.
55   *
56   * @see DTD
57   * @see TagElement
58   * @see SimpleAttributeSet
59   * @version 1.43, 05/27/05
60   * @author Arthur van Hoff
61   * @author Sunita Mani
62   */
63  public
64  class Parser implements DTDConstants   {
65  
    // Buffer of pending text. handleText(TagElement) copies text[0..textpos)
    // out of it and hands the copy to handleText(char[]) or handleTitle.
    private char text[] = new char[1024];
    // Number of valid characters currently held in text.
    private int textpos = 0;
    // The tag most recently recorded by handleText(TagElement) or startTag.
    private TagElement   last;
    // True while a single space is pending and has not yet been written
    // into the text buffer.
    private boolean space;

    // Scratch buffer filled by addString and drained by getString/getChars.
    private char str[] = new char[128];
    // Number of valid characters currently held in str.
    private int strpos = 0;

    // The DTD driving this parser; supplied to the constructor.
    protected DTD   dtd = null;

    // The current input character (normally the last value returned by readCh()).
    private int ch;
    // Current line number; reported by getCurrentLine() and passed to handleError.
    private int ln;
    // Source of input characters; replaced by handleEOFInComment when it
    // re-parses the tail of an unterminated comment.
    private Reader in;

    // Element of the most recently pushed tag; endTag resets it to the
    // element on top of the stack, or null when the stack becomes empty.
    private Element   recent;
    // Stack of currently open tags; null when no tag is open.
    private TagStack   stack;
    // Result of the last ignoreElement() check made by legalElementContext().
    private boolean skipTag = false;
    // NOTE(review): not referenced in the visible part of this class;
    // presumably remembers the last form tag reported -- confirm.
    private TagElement   lastFormSent = null;
    // Attributes of the tag being processed; startTag checks the element's
    // required attributes against this set.
    private SimpleAttributeSet   attributes = new SimpleAttributeSet  ();

    // State for <html>, <head> and <body>.  Since people like to slap
    // together HTML documents without thinking, occasionally they
    // have multiple instances of these tags.  These booleans track
    // the first sightings of these tags so they can be safely ignored
    // by the parser if repeated.
    private boolean seenHtml = false;
    private boolean seenHead = false;
    private boolean seenBody = false;
94  
    /**
     * The html spec does not specify how spaces are coalesced very well.
     * If strict == false, ignoreSpace is used to try and mimic the behavior
     * of the popular browsers.
     * <p>
     * The problematic scenarios are:
     * '&lt;b>blah &lt;i> &lt;strike> foo' which can be treated as:
     * '&lt;b>blah &lt;i>&lt;strike>foo'
     * as well as:
     * '&lt;p>&lt;a HREF="xx"> &lt;em>Using&lt;/em>&lt;/a>&lt;/p>'
     * which appears to be treated as:
     * '&lt;p>&lt;a HREF="xx">&lt;em>Using&lt;/em>&lt;/a>&lt;/p>'
     * <p>
     * When a tag that breaks flow, or trailing whitespace, is encountered,
     * ignoreSpace is set to true. From then on, all whitespace will be
     * ignored.
     * ignoreSpace will be set back to false the first time a
     * non-whitespace character is encountered. This appears to give
     * behavior closer to the popular browsers.
     */
    private boolean ignoreSpace;

    /**
     * This flag determines whether or not the Parser will be strict
     * in enforcing SGML compatibility.  If false, it will be lenient
     * with certain common classes of erroneous HTML constructs.
     * Strict or not, in either case an error will be recorded.
     */
    protected boolean strict = false;


    /** Number of \r\n's encountered. */
    private int crlfCount;
    /** Number of \r's encountered. A \r\n will not increment this. */
    private int crCount;
    /** Number of \n's encountered. A \r\n will not increment this. */
    private int lfCount;

    //
    // To correctly identify the start of a tag/comment/text we need two
    // instance variables. Two are needed as handleText isn't invoked until
    // the tag after the text has been parsed, that is the parser parses the
    // text, then a tag, then invokes handleText followed by handleStart.
    //
    /** The start position of the current block. Block is overloaded here,
     * it really means the current start position for the current comment,
     * tag, text. Use getBlockStartPosition to access this. */
    private int currentBlockStartPos;
    /** Start position of the last block; getBlockStartPosition derives
     * its result from this value. */
    private int lastBlockStartPos;
146 
    /**
     * Array for mapping numeric references in the range
     * 130-159 to displayable Unicode characters.
     * Entry <code>i</code> holds the replacement for the reference
     * <code>&amp;#(130 + i);</code>. Entries 141-144, 157 and 158 map to
     * their own code, i.e. they are left unchanged.
     * <p>
     * NOTE(review): the Windows-1252 code page assigns 142 to U+017D and
     * 158 to U+017E; confirm whether leaving them unmapped is intended.
     */
    private static final char[] cp1252Map = {
        8218,  // &#130;
        402,   // &#131;
        8222,  // &#132;
        8230,  // &#133;
        8224,  // &#134;
        8225,  // &#135;
        710,   // &#136;
        8240,  // &#137;
        352,   // &#138;
        8249,  // &#139;
        338,   // &#140;
        141,   // &#141; (unchanged)
        142,   // &#142; (unchanged)
        143,   // &#143; (unchanged)
        144,   // &#144; (unchanged)
        8216,  // &#145;
        8217,  // &#146;
        8220,  // &#147;
        8221,  // &#148;
        8226,  // &#149;
        8211,  // &#150;
        8212,  // &#151;
        732,   // &#152;
        8482,  // &#153;
        353,   // &#154;
        8250,  // &#155;
        339,   // &#156;
        157,   // &#157; (unchanged)
        158,   // &#158; (unchanged)
        376    // &#159;
    };
183 
184     public Parser(DTD   dtd) {
185     this.dtd = dtd;
186     }
187 
188 
189     /**
190      * @return the line number of the line currently being parsed
191      */
192     protected int getCurrentLine() {
193     return ln;
194     }
195 
196     /**
197      * Returns the start position of the current block. Block is
198      * overloaded here, it really means the current start position for
199      * the current comment tag, text, block.... This is provided for
200      * subclassers that wish to know the start of the current block when
201      * called with one of the handleXXX methods.
202      */
203     int getBlockStartPosition() {
204     return Math.max(0, lastBlockStartPos - 1);
205     }
206 
207     /**
208      * Makes a TagElement.
209      */
210     protected TagElement   makeTag(Element   elem, boolean fictional) {
211     return new TagElement  (elem, fictional);
212     }
213 
214     protected TagElement   makeTag(Element   elem) {
215     return makeTag(elem, false);
216     }
217 
218     protected SimpleAttributeSet   getAttributes() {
219     return attributes;
220     }
221 
222     protected void flushAttributes() {
223     attributes.removeAttributes(attributes);
224     }
225 
226     /**
227      * Called when PCDATA is encountered.
228      */
229     protected void handleText(char text[]) {
230     }
231 
232     /**
233      * Called when an HTML title tag is encountered.
234      */
235     protected void handleTitle(char text[]) {
236     // default behavior is to call handleText. Subclasses
237     // can override if necessary.
238     handleText(text);
239     }
240 
241     /**
242      * Called when an HTML comment is encountered.
243      */
244     protected void handleComment(char text[]) {
245     }
246 
247     protected void handleEOFInComment() {
248     // We've reached EOF.  Our recovery strategy is to
249     // see if we have more than one line in the comment;
250     // if so, we pretend that the comment was an unterminated
251     // single line comment, and reparse the lines after the
252     // first line as normal HTML content.
253 
254     int commentEndPos = strIndexOf('\n');
255     if (commentEndPos >= 0) {
256         handleComment(getChars(0, commentEndPos));
257         try {
258         in.close();
259         in = new CharArrayReader(getChars(commentEndPos + 1));
260         ch = '>';
261         } catch (IOException e) {
262         error("ioexception");
263         }
264 
265         resetStrBuffer();
266     } else {
267         // no newline, so signal an error
268         error("eof.comment");
269     }
270     }
271 
272     /**
273      * Called when an empty tag is encountered.
274      */
275     protected void handleEmptyTag(TagElement   tag) throws ChangedCharSetException   {
276     }
277 
278     /**
279      * Called when a start tag is encountered.
280      */
281     protected void handleStartTag(TagElement   tag) {
282     }
283 
284     /**
285      * Called when an end tag is encountered.
286      */
287     protected void handleEndTag(TagElement   tag) {
288     }
289 
290     /**
291      * An error has occurred.
292      */
293     protected void handleError(int ln, String   msg) {
294     /*
295     Thread.dumpStack();
296     System.out.println("**** " + stack);
297     System.out.println("line " + ln + ": error: " + msg);
298     System.out.println();
299     */
300     }
301 
302     /**
303      * Output text.
304      */
305     void handleText(TagElement   tag) {
306     if (tag.breaksFlow()) {
307         space = false;
308             if (!strict) {
309                 ignoreSpace = true;
310             }
311     }
312     if (textpos == 0) {
313         if ((!space) || (stack == null) || last.breaksFlow() ||
314         !stack.advance(dtd.pcdata)) {
315         last = tag;
316         space = false;
317         lastBlockStartPos = currentBlockStartPos;
318         return;
319         }
320     }
321     if (space) {
322             if (!ignoreSpace) {
323                 // enlarge buffer if needed
324                 if (textpos + 1 > text.length) {
325                     char newtext[] = new char[text.length + 200];
326                     System.arraycopy(text, 0, newtext, 0, text.length);
327                     text = newtext;
328                 }
329 
330                 // output pending space
331                 text[textpos++] = ' ';
332                 if (!strict && !tag.getElement().isEmpty()) {
333                     ignoreSpace = true;
334                 }
335             }
336             space = false;
337     }
338     char newtext[] = new char[textpos];
339     System.arraycopy(text, 0, newtext, 0, textpos);
340     // Handles cases of bad html where the title tag
341     // was getting lost when we did error recovery.
342     if (tag.getElement().getName().equals("title")) {
343         handleTitle(newtext);
344         } else {
345         handleText(newtext);
346     }
347     lastBlockStartPos = currentBlockStartPos;
348     textpos = 0;
349     last = tag;
350     space = false;
351     }
352 
353     /**
354      * Invoke the error handler.
355      */
356     protected void error(String   err, String   arg1, String   arg2,
357     String   arg3) {
358     // big hack, but this should never get used...
359     handleError (ln, err + arg1 + arg2 + arg3);
360     }
361 
362     protected void error(String   err, String   arg1, String   arg2) {
363     error(err, arg1, arg2, "?");
364     }
365     protected void error(String   err, String   arg1) {
366     error(err, arg1, "?", "?");
367     }
368     protected void error(String   err) {
369     error(err, "?", "?", "?");
370     }
371 
372 
373     /**
374      * Handle a start tag. The new tag is pushed
375      * onto the tag stack. The attribute list is
376      * checked for required attributes.
377      */
378     protected void startTag(TagElement   tag) throws ChangedCharSetException   {
379     Element   elem = tag.getElement();
380 
381     // If the tag is an empty tag and texpos != 0
382     // this implies that there is text before the
383     // start tag that needs to be processed before
384     // handling the tag.
385     //
386     if (!elem.isEmpty() || textpos != 0) {
387         handleText(tag);
388     } else {
389         // this variable gets updated in handleText().
390         // Since in this case we do not call handleText()
391         // we need to update it here.
392         //
393         last = tag;
394         // Note that we should really check last.breakFlows before
395         // assuming this should be false.
396         space = false;
397     }
398     lastBlockStartPos = currentBlockStartPos;
399 
400     // check required attributes
401     for (AttributeList   a = elem.atts ; a != null ; a = a.next) {
402 
403         if ((a.modifier == REQUIRED) && ((attributes.isEmpty()) || (!attributes.isDefined(a.name)))) {
404         error("req.att ", a.getName(), elem.getName());
405         }
406     }
407 
408     if (elem.isEmpty()) {
409         handleEmptyTag(tag);
410             /*
411     } else if (elem.getName().equals("form")) {
412         handleStartTag(tag);
413             */
414     } else {
415         recent = elem;
416         stack = new TagStack  (tag, stack);
417         handleStartTag(tag);
418     }
419     }
420 
421     /**
422      * Handle an end tag. The end tag is popped
423      * from the tag stack.
424      */
425     protected void endTag(boolean omitted) {
426     handleText(stack.tag);
427 
428     if (omitted && !stack.elem.omitEnd()) {
429         error("end.missing", stack.elem.getName());
430     } else if (!stack.terminate()) {
431         error("end.unexpected", stack.elem.getName());
432     }
433 
434     // handle the tag
435     handleEndTag(stack.tag);
436     stack = stack.next;
437     recent = (stack != null) ? stack.elem : null;
438     }
439 
440 
441     boolean ignoreElement(Element   elem) {
442 
443         String   stackElement = stack.elem.getName();
444     String   elemName = elem.getName();
445     /* We ignore all elements that are not valid in the context of
446        a table except <td>, <th> (these we handle in
447        legalElementContext()) and #pcdata.  We also ignore the
448        <font> tag in the context of <ul> and <ol> We additonally
449        ignore the <meta> and the <style> tag if the body tag has
450        been seen. **/
451     if ((elemName.equals("html") && seenHtml) ||
452         (elemName.equals("head") && seenHead) ||
453         (elemName.equals("body") && seenBody)) {
454         return true;
455     }
456     if (elemName.equals("dt") || elemName.equals("dd")) {
457         TagStack   s = stack;
458         while (s != null && !s.elem.getName().equals("dl")) {
459         s = s.next;
460         }
461         if (s == null) {
462         return true;
463         }
464     }
465 
466     if (((stackElement.equals("table")) &&
467          (!elemName.equals("#pcdata")) && (!elemName.equals("input"))) ||
468         ((elemName.equals("font")) &&
469          (stackElement.equals("ul") || stackElement.equals("ol"))) ||
470         (elemName.equals("meta") && stack != null) ||
471         (elemName.equals("style") && seenBody) ||
472         (stackElement.equals("table") && elemName.equals("a"))) {
473         return true;
474     }
475     return false;
476     }
477 
478 
479     /**
480      * Marks the first time a tag has been seen in a document
481      */
482 
483     protected void markFirstTime(Element   elem) {
484     String   elemName = elem.getName();
485     if (elemName.equals("html")) {
486         seenHtml = true;
487     } else if (elemName.equals("head")) {
488         seenHead = true;
489     } else if (elemName.equals("body")) {
490             if (buf.length == 1) {
491                 // Refer to note in definition of buf for details on this.
492                 char[] newBuf = new char[256];
493 
494                 newBuf[0] = buf[0];
495                 buf = newBuf;
496             }
497         seenBody = true;
498     }
499     }
500 
501     /**
502      * Create a legal content for an element.
503      */
504     boolean legalElementContext(Element   elem) throws ChangedCharSetException   {
505 
506     // System.out.println("-- legalContext -- " + elem);
507 
508     // Deal with the empty stack
509     if (stack == null) {
510         // System.out.println("-- stack is empty");
511         if (elem != dtd.html) {
512         // System.out.println("-- pushing html");
513         startTag(makeTag(dtd.html, true));
514         return legalElementContext(elem);
515         }
516         return true;
517     }
518 
519     // Is it allowed in the current context
520     if (stack.advance(elem)) {
521         // System.out.println("-- legal context");
522         markFirstTime(elem);
523         return true;
524     }
525     boolean insertTag = false;
526 
527     // The use of all error recovery strategies are contingent
528     // on the value of the strict property.
529     //
530     // These are commonly occuring errors.  if insertTag is true,
531     // then we want to adopt an error recovery strategy that
532     // involves attempting to insert an additional tag to
533     // legalize the context.  The two errors addressed here
534     // are:
535     // 1) when a <td> or <th> is seen soon after a <table> tag.
536     //    In this case we insert a <tr>.
537     // 2) when any other tag apart from a <tr> is seen
538     //    in the context of a <tr>.  In this case we would
539     //    like to add a <td>.  If a <tr> is seen within a
540     //    <tr> context, then we will close out the current
541     //    <tr>.
542     //
543     // This insertion strategy is handled later in the method.
544     // The reason for checking this now, is that in other cases
545     // we would like to apply other error recovery strategies for example
546     // ignoring tags.
547     //
548     // In certain cases it is better to ignore a tag than try to
549     // fix the situation.  So the first test is to see if this
550     // is what we need to do.
551     //
552     String   stackElemName = stack.elem.getName();
553     String   elemName = elem.getName();
554 
555 
556     if (!strict &&
557         ((stackElemName.equals("table") && elemName.equals("td")) ||
558          (stackElemName.equals("table") && elemName.equals("th")) ||
559          (stackElemName.equals("tr") && !elemName.equals("tr")))){
560          insertTag = true;
561     }
562 
563 
564     if (!strict && !insertTag && (stack.elem.getName() != elem.getName() ||
565                       elem.getName().equals("body"))) {
566         if (skipTag = ignoreElement(elem)) {
567             error("tag.ignore", elem.getName());
568         return skipTag;
569         }
570     }
571 
572     // Check for anything after the start of the table besides tr, td, th
573     // or caption, and if those aren't there, insert the <tr> and call
574     // legalElementContext again.
575     if (!strict && stackElemName.equals("table") &&
576         !elemName.equals("tr") && !elemName.equals("td") &&
577         !elemName.equals("th") && !elemName.equals("caption")) {
578         Element   e = dtd.getElement("tr");
579         TagElement   t = makeTag(e, true);
580         legalTagContext(t);
581         startTag(t);
582         error("start.missing", elem.getName());
583         return legalElementContext(elem);
584     }
585 
586     // They try to find a legal context by checking if the current
587     // tag is valid in an enclosing context.  If so
588     // close out the tags by outputing end tags and then
589     // insert the curent tag.  If the tags that are
590     // being closed out do not have an optional end tag
591     // specification in the DTD then an html error is
592     // reported.
593     //
594     if (!insertTag && stack.terminate() && (!strict || stack.elem.omitEnd())) {
595         for (TagStack   s = stack.next ; s != null ; s = s.next) {
596         if (s.advance(elem)) {
597             while (stack != s) {
598             endTag(true);
599             }
600             return true;
601         }
602         if (!s.terminate() || (strict && !s.elem.omitEnd())) {
603             break;
604         }
605         }
606     }
607 
608     // Check if we know what tag is expected next.
609     // If so insert the tag.  Report an error if the
610     // tag does not have its start tag spec in the DTD as optional.
611     //
612     Element   next = stack.first();
613     if (next != null && (!strict || next.omitStart()) &&
614        !(next==dtd.head && elem==dtd.pcdata) ) {
615         // System.out.println("-- omitting start tag: " + next);
616         TagElement   t = makeTag(next, true);
617         legalTagContext(t);
618         startTag(t);
619         if (!next.omitStart()) {
620         error("start.missing", elem.getName());
621         }
622         return legalElementContext(elem);
623     }
624 
625 
626     // Traverse the list of expected elements and determine if adding
627     // any of these elements would make for a legal context.
628     //
629 
630     if (!strict) {
631         ContentModel   content = stack.contentModel();
632         Vector   elemVec = new Vector  ();
633         if (content != null) {
634         content.getElements(elemVec);
635         for (Enumeration   v = elemVec.elements(); v.hasMoreElements();) {
636             Element   e = (Element  )v.nextElement();
637 
638             // Ensure that this element has not been included as
639             // part of the exclusions in the DTD.
640             //
641             if (stack.excluded(e.getIndex())) {
642             continue;
643             }
644 
645             boolean reqAtts = false;
646 
647             for (AttributeList   a = e.getAttributes(); a != null ; a = a.next) {
648                 if (a.modifier == REQUIRED) {
649                 reqAtts = true;
650                 break;
651                 }
652             }
653             // Ensure that no tag that has required attributes
654             // gets inserted.
655                 //
656             if (reqAtts) {
657                 continue;
658             }
659 
660             ContentModel   m = e.getContent();
661             if (m != null && m.first(elem)) {
662             // System.out.println("-- adding a legal tag: " + e);
663             TagElement   t = makeTag(e, true);
664             legalTagContext(t);
665             startTag(t);
666             error("start.missing", e.getName());
667             return legalElementContext(elem);
668             }
669         }
670         }
671     }
672 
673     // Check if the stack can be terminated.  If so add the appropriate
674     // end tag.  Report an error if the tag being ended does not have its
675     // end tag spec in the DTD as optional.
676     //
677     if (stack.terminate() && (stack.elem != dtd.body) && (!strict || stack.elem.omitEnd())) {
678         // System.out.println("-- omitting end tag: " + stack.elem);
679         if (!stack.elem.omitEnd()) {
680         error("end.missing", elem.getName());
681         }
682 
683         endTag(true);
684         return legalElementContext(elem);
685     }
686 
687     // At this point we know that something is screwed up.
688     return false;
689     }
690 
691     /**
692      * Create a legal context for a tag.
693      */
694     void legalTagContext(TagElement   tag) throws ChangedCharSetException   {
695     if (legalElementContext(tag.getElement())) {
696         markFirstTime(tag.getElement());
697         return;
698     }
699 
700     // Avoid putting a block tag in a flow tag.
701     if (tag.breaksFlow() && (stack != null) && !stack.tag.breaksFlow()) {
702         endTag(true);
703         legalTagContext(tag);
704         return;
705     }
706 
707     // Avoid putting something wierd in the head of the document.
708     for (TagStack   s = stack ; s != null ; s = s.next) {
709         if (s.tag.getElement() == dtd.head) {
710         while (stack != s) {
711             endTag(true);
712         }
713         endTag(true);
714         legalTagContext(tag);
715         return;
716         }
717     }
718 
719     // Everything failed
720     error("tag.unexpected", tag.getElement().getName());
721     }
722 
723     /**
724      * Error context. Something went wrong, make sure we are in
725      * the document's body context
726      */
727     void errorContext() throws ChangedCharSetException   {
728     for (; (stack != null) && (stack.tag.getElement() != dtd.body) ; stack = stack.next) {
729         handleEndTag(stack.tag);
730     }
731     if (stack == null) {
732         legalElementContext(dtd.body);
733         startTag(makeTag(dtd.body, true));
734     }
735     }
736 
737     /**
738      * Add a char to the string buffer.
739      */
740     void addString(int c) {
741     if (strpos  == str.length) {
742         char newstr[] = new char[str.length + 128];
743         System.arraycopy(str, 0, newstr, 0, str.length);
744         str = newstr;
745     }
746     str[strpos++] = (char)c;
747     }
748 
749     /**
750      * Get the string that's been accumulated.
751      */
752     String   getString(int pos) {
753     char newStr[] = new char[strpos - pos];
754     System.arraycopy(str, pos, newStr, 0, strpos - pos);
755     strpos = pos;
756     return new String  (newStr);
757     }
758 
759     char[] getChars(int pos) {
760     char newStr[] = new char[strpos - pos];
761     System.arraycopy(str, pos, newStr, 0, strpos - pos);
762     strpos = pos;
763     return newStr;
764     }
765 
766     char[] getChars(int pos, int endPos) {
767     char newStr[] = new char[endPos - pos];
768     System.arraycopy(str, pos, newStr, 0, endPos - pos);
769     // REMIND: it's not clear whether this version should set strpos or not
770     // strpos = pos;
771     return newStr;
772     }
773 
774     void resetStrBuffer() {
775     strpos = 0;
776     }
777 
778     int strIndexOf(char target) {
779     for (int i = 0; i < strpos; i++) {
780         if (str[i] == target) {
781         return i;
782         }
783     }
784 
785     return -1;
786     }
787 
788     /**
789      * Skip space.
790      * [5] 297:5
791      */
792     void skipSpace() throws IOException {
793     while (true) {
794         switch (ch) {
795           case '\n':
796         ln++;
797         ch = readCh();
798         lfCount++;
799         break;
800 
801           case '\r':
802         ln++;
803         if ((ch = readCh()) == '\n') {
804             ch = readCh();
805             crlfCount++;
806         }
807         else {
808             crCount++;
809         }
810         break;
811           case ' ':
812           case '\t':
813         ch = readCh();
814         break;
815 
816           default:
817         return;
818         }
819     }
820     }
821 
822     /**
823      * Parse identifier. Uppercase characters are folded
824      * to lowercase when lower is true. Returns falsed if
825      * no identifier is found. [55] 346:17
826      */
827     boolean parseIdentifier(boolean lower) throws IOException {
828     switch (ch) {
829       case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
830       case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
831       case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
832       case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
833       case 'Y': case 'Z':
834         if (lower) {
835         ch = 'a' + (ch - 'A');
836         }
837 
838       case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
839       case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
840       case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
841       case 's': case 't': case 'u': case 'v': case 'w': case 'x':
842       case 'y': case 'z':
843         break;
844 
845       default:
846         return false;
847     }
848 
849     while (true) {
850         addString(ch);
851 
852         switch (ch = readCh()) {
853           case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
854           case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
855           case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
856           case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
857           case 'Y': case 'Z':
858         if (lower) {
859             ch = 'a' + (ch - 'A');
860         }
861 
862           case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
863           case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
864           case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
865           case 's': case 't': case 'u': case 'v': case 'w': case 'x':
866           case 'y': case 'z':
867 
868           case '0': case '1': case '2': case '3': case '4':
869           case '5': case '6': case '7': case '8': case '9':
870 
871           case '.': case '-':
872 
873           case '_': // not officially allowed
874         break;
875 
876           default:
877         return true;
878         }
879     }
880     }
881 
882     /**
883      * Parse an entity reference. [59] 350:17
884      */
885     private char[] parseEntityReference() throws IOException {
886     int pos = strpos;
887 
888     if ((ch = readCh()) == '#') {
889             int n = 0;
890             ch = readCh();
891             if ((ch >= '0') && (ch <= '9') ||
892                     ch == 'x' || ch == 'X') {
893 
894                 if ((ch >= '0') && (ch <= '9')) {
895                     // parse decimal reference
896                     while ((ch >= '0') && (ch <= '9')) {
897                         n = (n * 10) + ch - '0';
898                         ch = readCh();
899                     }
900                 } else {
901                     // parse hexadecimal reference
902                     ch = readCh();
903                     char lch = (char) Character.toLowerCase(ch);
904                     while ((lch >= '0') && (lch <= '9') ||
905                             (lch >= 'a') && (lch <= 'f')) {
906                         if (lch >= '0' && lch <= '9') {
907                             n = (n * 16) + lch - '0';
908                         } else {
909                             n = (n * 16) + lch - 'a' + 10;
910                         }
911                         ch = readCh();
912                         lch = (char) Character.toLowerCase(ch);
913                     }
914                 }
915                 switch (ch) {
916                     case '\n':
917                         ln++;
918                         ch = readCh();
919                         lfCount++;
920                         break;
921 
922                     case '\r':
923                         ln++;
924                         if ((ch = readCh()) == '\n') {
925                             ch = readCh();
926                             crlfCount++;
927                         } 
928                         else {
929                             crCount++;
930                         }
931                         break;
932 
933                     case ';':
934                         ch = readCh();
935                         break;
936                 }
937                 char data[] = {mapNumericReference((char) n)};
938                 return data;
939             }
940             addString('#');
941             if (!parseIdentifier(false)) {
942                 error("ident.expected");
943                 strpos = pos;
944                 char data[] = {'&', '#'};
945                 return data;
946             }
947     } else if (!parseIdentifier(false)) {
948         char data[] = {'&'};
949         return data;
950     }
951     switch (ch) {
952       case '\n':
953         ln++;
954         ch = readCh();
955         lfCount++;
956         break;
957 
958       case '\r':
959         ln++;
960         if ((ch = readCh()) == '\n') {
961         ch = readCh();
962         crlfCount++;
963         }
964         else {
965         crCount++;
966         }
967         break;
968 
969       case ';':
970         ch = readCh();
971         break;
972     }
973 
974     String   nm = getString(pos);
975     Entity   ent = dtd.getEntity(nm);
976 
977     // entities are case sensitive - however if strict
978     // is false then we will try to make a match by
979     // converting the string to all lowercase.
980     //
981     if (!strict && (ent == null)) {
982         ent = dtd.getEntity(nm.toLowerCase());
983     }
984     if ((ent == null) || !ent.isGeneral()) {
985 
986         if (nm.length() == 0) {
987         error("invalid.entref", nm);
988         return new char[0];
989         }
990         /* given that there is not a match restore the entity reference */
991         String   str = "&" + nm;
992 
993         char b[] = new char[str.length()];
994         str.getChars(0, b.length, b, 0);
995         return b;
996     }
997     return ent.getData();
998     }
999 
1000    /**
1001     * Converts numeric character reference to Unicode character.
1002     *
1003     * Normally the code in a reference should be always converted
1004     * to the Unicode character with the same code, but due to
1005     * wide usage of Cp1252 charset most browsers map numeric references
1006     * in the range 130-159 (which are control chars in Unicode set)
1007     * to displayable characters with other codes.
1008     *
1009     * @param c the code of numeric character reference.
1010     * @return the character corresponding to the reference code.
1011     */
1012    private char mapNumericReference(char c) {
1013        if (c < 130 || c > 159) {
1014            return c;
1015        }
1016        return cp1252Map[c - 130];
1017    }
1018
1019    /**
1020     * Parse a comment. [92] 391:7
         * <p>
         * Each loop iteration looks at the current character <code>ch</code>
         * and, unless the method returns or restarts the loop, hands one
         * character to <code>addString</code>. A CR or CRLF is stored as a
         * single '\n'; every line break updates <code>ln</code> and the
         * matching <code>lfCount</code>/<code>crCount</code>/<code>crlfCount</code>
         * counter. At end of input <code>handleEOFInComment</code> is called.
         * <p>
         * In strict mode the method returns as soon as "--" has been read.
         * In non-strict mode it returns on "-->" or "--!>"; in that case the
         * closing '>' has been read into <code>ch</code> and nothing after it.
         *
         * @exception IOException propagated from reading the input
1021     */
1022    void parseComment() throws IOException {
1023
1024    while (true) {
1025        int c = ch;
1026        switch (c) {
1027          case '-':
1028          /** Presuming that the start string of a comment "<!--" has
1029              already been parsed, the '-' character is valid only as
1030              part of a comment termination and further more it must
1031              be present in even numbers. Hence if strict is true, we
1032              presume the comment has been terminated and return.
1033              However if strict is false, then there is no even number
1034              requirement and this character can appear anywhere in the
1035              comment.  The parser reads on until it sees the following
1036              pattern: "-->" or "--!>".
1037           **/
            // Non-strict only: the character stored just before this one was
            // already a '-' (presumably str/strpos is the buffer addString
            // appends to - confirm), so this '-' may complete a terminator.
1038        if (!strict && (strpos != 0) && (str[strpos - 1] == '-')) {
1039            if ((ch = readCh()) == '>') {
1040            return;
1041            }
1042            if (ch == '!') {
1043            if ((ch = readCh()) == '>') {
1044                return;
1045            } else {
1046                /* to account for extra read()'s that happened */
1047                addString('-');
1048                addString('!');
1049                continue;
1050            }
1051            }
1052            break;
1053        }
1054
            // First '-' of a possible terminator: peek at what follows.
1055        if ((ch = readCh()) == '-') {
1056            ch = readCh();
1057            if (strict || ch == '>') {
1058            return;
1059            }
1060            if (ch == '!') {
1061            if ((ch = readCh()) == '>') {
1062                return;
1063            } else {
1064                /* to account for extra read()'s that happened */
1065                addString('-');
1066                addString('!');
1067                continue;
1068            }
1069            }
1070            /* to account for the extra read() */
1071            addString('-');
1072        }
1073        break;
1074
1075          case -1:
1076          handleEOFInComment();
1077          return;
1078
1079          case '\n':
1080        ln++;
1081        ch = readCh();
1082        lfCount++;
1083        break;
1084
            // A '>' that is not part of a terminator is ordinary comment text.
1085          case '>':
1086        ch = readCh();
1087        break;
1088
1089          case '\r':
1090        ln++;
1091        if ((ch = readCh()) == '\n') {
1092            ch = readCh();
1093            crlfCount++;
1094        }
1095        else {
1096            crCount++;
1097        }
            // CR and CRLF are both stored as a single newline
1098        c = '\n';
1099        break;
1100          default:
1101        ch = readCh();
1102        break;
1103        }
1104
            // store the character that was current at the top of the loop
1105        addString(c);
1106    }
1107    }
1108
1109    /**
1110     * Parse literal content. [46] 343:1 and [47] 344:1
1111     */
1112    void parseLiteral(boolean replace) throws IOException {
1113    while (true) {
1114        int c = ch;
1115        switch (c) {
1116          case -1:
1117        error("eof.literal", stack.elem.getName());
1118        endTag(true);
1119        return;
1120
1121          case '>':
1122        ch = readCh();
1123        int i = textpos - (stack.elem.name.length() + 2), j = 0;
1124
1125        // match end tag
1126        if ((i >= 0) && (text[i++] == '<') && (text[i] == '/')) {
1127            while ((++i < textpos) &&
1128               (Character.toLowerCase(text[i]) == stack.elem.name.charAt(j++)));
1129            if (i == textpos) {
1130            textpos -= (stack.elem.name.length() + 2);
1131            if ((textpos > 0) && (text[textpos-1] == '\n')) {
1132                textpos--;
1133            }
1134            endTag(false);
1135            return;
1136            }
1137        }
1138        break;
1139
1140          case '&':
1141        char data[] = parseEntityReference();
1142        if (textpos + data.length > text.length) {
1143            char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
1144            System.arraycopy(text, 0, newtext, 0, text.length);
1145            text = newtext;
1146        }
1147        System.arraycopy(data, 0, text, textpos, data.length);
1148        textpos += data.length;
1149        continue;
1150
1151          case '\n':
1152        ln++;
1153        ch = readCh();
1154        lfCount++;
1155        break;
1156
1157          case '\r':
1158        ln++;
1159        if ((ch = readCh()) == '\n') {
1160            ch = readCh();
1161            crlfCount++;
1162        }
1163        else {
1164            crCount++;
1165        }
1166                c = '\n';
1167        break;
1168          default:
1169        ch = readCh();
1170        break;
1171        }
1172
1173        // output character
1174        if (textpos == text.length) {
1175        char newtext[] = new char[text.length + 128];
1176        System.arraycopy(text, 0, newtext, 0, text.length);
1177        text = newtext;
1178        }
1179        text[textpos++] = (char)c;
1180    }
1181    }
1182
1183    /**
1184     * Parse attribute value. [33] 331:1
1185     */
1186    String   parseAttributeValue(boolean lower) throws IOException {
1187    int delim = -1;
1188
1189    // Check for a delimiter
1190    switch(ch) {
1191      case '\'':
1192      case '"':
1193        delim = ch;
1194        ch = readCh();
1195        break;
1196    }
1197
1198    // Parse the rest of the value
1199    while (true) {
1200        int c = ch;
1201
1202        switch (c) {
1203          case '\n':
1204        ln++;
1205        ch = readCh();
1206        lfCount++;
1207        if (delim < 0) {
1208            return getString(0);
1209        }
1210        break;
1211
1212          case '\r':
1213        ln++;
1214
1215        if ((ch = readCh()) == '\n') {
1216            ch = readCh();
1217            crlfCount++;
1218        }
1219        else {
1220            crCount++;
1221        }
1222        if (delim < 0) {
1223            return getString(0);
1224        }
1225        break;
1226
1227          case '\t':
1228          if (delim < 0)
1229              c = ' ';
1230          case ' ':
1231        ch = readCh();
1232        if (delim < 0) {
1233            return getString(0);
1234        }
1235        break;
1236
1237          case '>':
1238          case '<':
1239        if (delim < 0) {
1240            return getString(0);
1241        }
1242        ch = readCh();
1243        break;
1244
1245          case '\'':
1246          case '"':
1247        ch = readCh();
1248        if (c == delim) {
1249            return getString(0);
1250        } else if (delim == -1) {
1251            error("attvalerr");
1252            if (strict || ch == ' ') {
1253            return getString(0);
1254            } else {
1255            continue;
1256            }
1257        }
1258        break;
1259
1260        case '=':
1261        if (delim < 0) {
1262            /* In SGML a construct like <img SRC=/cgi-bin/foo?x=1>
1263               is considered invalid since an = sign can only be contained
1264               in an attributes value if the string is quoted.
1265               */
1266            error("attvalerr");
1267            /* If strict is true then we return with the string we have thus far.
1268               Otherwise we accept the = sign as part of the attribute's value and
1269               process the rest of the img tag. */
1270            if (strict) {
1271            return getString(0);
1272            }
1273        }
1274        ch = readCh();
1275        break;
1276
1277          case '&':
1278        if (strict && delim < 0) {
1279            ch = readCh();
1280            break;
1281        }
1282
1283        char data[] = parseEntityReference();
1284        for (int i = 0 ; i < data.length ; i++) {
1285            c = data[i];
1286            addString((lower && (c >= 'A') && (c <= 'Z')) ? 'a' + c - 'A' : c);
1287        }
1288        continue;
1289
1290          case -1:
1291        return getString(0);
1292
1293          default:
1294        if (lower && (c >= 'A') && (c <= 'Z')) {
1295            c = 'a' + c - 'A';
1296        }
1297        ch = readCh();
1298        break;
1299        }
1300        addString(c);
1301    }
1302    }
1303
1304
1305    /**
1306     * Parse attribute specification List. [31] 327:17
         * <p>
         * Reads attributes until the current character is '/', '&gt;', '&lt;'
         * or end of input, and stores each one in <code>attributes</code>.
         * A "--" inside the tag is parsed as a comment and discarded.
         * <p>
         * Accepted forms are <code>name=value</code> and a bare
         * <code>name</code>. When <code>strict</code> is false the method
         * additionally skips commas, accepts a double quote in front of an
         * attribute name, treats a leading '=' as a value for an attribute
         * named like the element (only while no attribute has been stored
         * yet), and gives up with "attvalerr" on any other stray '='.
         * <p>
         * Problems are reported through <code>error</code> with the keys
         * "invalid.tagchar", "invalid.tagatt", "multi.tagatt",
         * "invalid.tagattval" and "attvalerr".
         *
         * @param elem the element whose attribute declarations are consulted
         * @exception IOException propagated from reading the input
1307     */
1308    void parseAttributeSpecificationList(Element   elem) throws IOException {
1309
1310    while (true) {
1311        skipSpace();
1312
1313        switch (ch) {
1314          case '/':
1315          case '>':
1316          case '<':
1317          case -1:
1318        return;
1319
1320          case '-':
1321        if ((ch = readCh()) == '-') {
1322            ch = readCh();
1323            parseComment();
                // throw away the comment text collected in the string buffer
1324            strpos = 0;
1325        } else {
1326            error("invalid.tagchar", "-", elem.getName());
1327            ch = readCh();
1328        }
1329        continue;
1330        }
1331
1332        AttributeList   att = null;
1333        String   attname = null;
1334        String   attvalue = null;
1335
1336        if (parseIdentifier(true)) {
1337        attname = getString(0);
1338        skipSpace();
1339        if (ch == '=') {
1340            ch = readCh();
1341            skipSpace();
1342            att = elem.getAttribute(attname);
1343//  Bug ID 4102750
1344//  Load the NAME of an Attribute Case Sensitive
1345//  The case of the NAME  must be intact
1346//  MG 021898
                // NOTE(review): only this call exempts NAME typed attributes
                // from lower casing; the two calls further down that build the
                // same condition do not - confirm whether that is intended.
1347            attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION) && (att.type != NAME));
1348//          attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION));
1349        } else {
                // Bare word: try it as a value of some declared attribute
                // first, then as an attribute name.
1350            attvalue = attname;
1351            att = elem.getAttributeByValue(attvalue);
1352            if (att == null) {
1353            att = elem.getAttribute(attname);
1354            if (att != null) {
1355                attvalue = att.getValue();
1356            }
1357            else {
1358                // Make it null so that NULL_ATTRIBUTE_VALUE is
1359                // used
1360                attvalue = null;
1361            }
1362            }
1363        }
1364        } else if (!strict && ch == ',') { // allows for comma separated attribute-value pairs
1365        ch = readCh();
1366        continue;
1367        } else if (!strict && ch == '"') { // allows for quoted attributes
1368        ch = readCh();
1369        skipSpace();
1370        if (parseIdentifier(true)) {
1371            attname = getString(0);
1372            if (ch == '"') {
1373            ch = readCh();
1374            }
1375            skipSpace();
1376            if (ch == '=') {
1377            ch = readCh();
1378            skipSpace();
1379            att = elem.getAttribute(attname);
1380            attvalue = parseAttributeValue((att != null) &&
1381                        (att.type != CDATA) &&
1382                        (att.type != NOTATION));
1383            } else {
1384            attvalue = attname;
1385            att = elem.getAttributeByValue(attvalue);
1386            if (att == null) {
1387                att = elem.getAttribute(attname);
1388                if (att != null) {
1389                attvalue = att.getValue();
1390                }
1391            }
1392            }
1393        } else {
1394            char str[] = {(char)ch};
1395            error("invalid.tagchar", new String  (str), elem.getName());
1396            ch = readCh();
1397            continue;
1398        }
1399        } else if (!strict && (attributes.isEmpty()) && (ch == '=')) {
            // '=' before any attribute: use the element name as attribute name
1400        ch = readCh();
1401        skipSpace();
1402        attname = elem.getName();
1403        att = elem.getAttribute(attname);
1404        attvalue = parseAttributeValue((att != null) &&
1405                           (att.type != CDATA) &&
1406                           (att.type != NOTATION));
1407        } else if (!strict && (ch == '=')) {
            // stray '=': consume the value that follows, report it and stop
1408        ch = readCh();
1409        skipSpace();
1410        attvalue = parseAttributeValue(true);
1411        error("attvalerr");
1412        return;
1413        } else {
1414        char str[] = {(char)ch};
1415        error("invalid.tagchar", new String  (str), elem.getName());
1416        if (!strict) {
1417            ch = readCh();
1418            continue;
1419        } else {
1420            return;
1421        }
1422        }
1423
            // prefer the declared spelling of the attribute name
1424        if (att != null) {
1425        attname = att.getName();
1426        } else {
1427        error("invalid.tagatt", attname, elem.getName());
1428        }
1429
1430        // Check out the value
1431        if (attributes.isDefined(attname)) {
1432        error("multi.tagatt", attname, elem.getName());
1433        }
1434        if (attvalue == null) {
1435        attvalue = ((att != null) && (att.value != null)) ? att.value :
1436            HTML.NULL_ATTRIBUTE_VALUE;
1437        } else if ((att != null) && (att.values != null) && !att.values.contains(attvalue)) {
1438        error("invalid.tagattval", attname, elem.getName());
1439        }
            // store under the HTML.Attribute key when there is one, else under
            // the plain name; CLASS values are lower cased first
1440        HTML.Attribute   attkey = HTML.getAttributeKey(attname);
1441        if (attkey == HTML.Attribute.CLASS) {
1442        attvalue = attvalue.toLowerCase();
1443        }
1444        if (attkey == null) {
1445        attributes.addAttribute(attname, attvalue);
1446        } else {
1447        attributes.addAttribute(attkey, attvalue);
1448        }
1449    }
1450    }
1451
1452    /**
1453     * Parses th Document Declaration Type markup declaration.
1454     * Currently ignores it.
1455     */
1456    public String   parseDTDMarkup() throws IOException {
1457
1458    StringBuffer   strBuff = new StringBuffer  ();
1459    ch = readCh();
1460    while(true) {
1461        switch (ch) {
1462        case '>':
1463        ch = readCh();
1464        return strBuff.toString();
1465        case -1:
1466        error("invalid.markup");
1467        return strBuff.toString();
1468        case '\n':
1469        ln++;
1470        ch = readCh();
1471        lfCount++;
1472        break;
1473        case '"':
1474        ch = readCh();
1475        break;
1476        case '\r':
1477        ln++;
1478        if ((ch = readCh()) == '\n') {
1479            ch = readCh();
1480            crlfCount++;
1481        }
1482        else {
1483            crCount++;
1484        }
1485        break;
1486        default:
1487        strBuff.append((char)(ch & 0xFF));
1488        ch = readCh();
1489        break;
1490        }
1491    }
1492    }
1493
1494    /**
1495     * Parse markup declarations.
1496     * Currently only handles the Document Type Declaration markup.
1497     * Returns true if it is a markup declaration false otherwise.
1498     */
1499    protected boolean parseMarkupDeclarations(StringBuffer   strBuff) throws IOException {
1500
1501    /* Currently handles only the DOCTYPE */
1502    if ((strBuff.length() == "DOCTYPE".length()) &&
1503        (strBuff.toString().toUpperCase().equals("DOCTYPE"))) {
1504        parseDTDMarkup();
1505        return true;
1506    }
1507    return false;
1508    }
1509
1510    /**
1511     * Parse an invalid tag.
1512     */
1513    void parseInvalidTag() throws IOException {
1514    // ignore all data upto the close bracket '>'
1515    while (true) {
1516        skipSpace();
1517        switch (ch) {
1518          case '>':
1519          case -1:
1520          ch = readCh();
1521            return;
1522          case '<':
1523          return;
1524          default:
1525          ch = readCh();
1526
1527        }
1528    }
1529    }
1530
1531    /**
1532     * Parse a start or end tag.
         * <p>
         * Reads the character after the current one and dispatches on it:
         * <ul>
         * <li>'!' - a comment ("&lt;!--") or another markup declaration;
         *     comments are passed to <code>handleComment</code>, DOCTYPE is
         *     handled by <code>parseMarkupDeclarations</code>, anything else
         *     is reported as "invalid.markup";
         * <li>'/' - an end tag, including the empty forms "&lt;/&gt;" and
         *     "&lt;/&lt;" which refer to <code>recent</code>;
         * <li>end of input - reported as "eof";
         * <li>anything else - a start tag with its attribute list.
         * </ul>
         * Several branches of the switch statements below fall through on
         * purpose.
         *
         * @exception IOException propagated from reading the input
1533     */
1534    void parseTag() throws IOException {
1535    Element   elem = null;
1536    boolean net = false;
1537    boolean warned = false;
1538    boolean unknown = false;
1539
1540    switch (ch = readCh()) {
1541      case '!':
1542        switch (ch = readCh()) {
1543          case '-':
1544        // Parse comment. [92] 391:7
1545        while (true) {
1546            if (ch == '-') {
1547            if (!strict || ((ch = readCh()) == '-')) {
1548                ch = readCh();
1549                if (!strict && ch == '-') {
1550                ch = readCh();
1551                }
1552                // send over any text you might see
1553                // before parsing and sending the
1554                // comment
1555                if (textpos != 0) {
1556                char newtext[] = new char[textpos];
1557                System.arraycopy(text, 0, newtext, 0, textpos);
1558                handleText(newtext);
1559                lastBlockStartPos = currentBlockStartPos;
1560                textpos = 0;
1561                }
1562                parseComment();
1563                last = makeTag(dtd.getElement("comment"), true);
1564                handleComment(getChars(0));
1565                continue;
1566            } else if (!warned) {
1567                warned = true;
1568                error("invalid.commentchar", "-");
1569            }
1570            }
1571            skipSpace();
1572            switch (ch) {
1573              case '-':
1574            continue;
1575              case '>':
1576            ch = readCh();
                // falls through
1577              case -1:
1578            return;
1579              default:
1580            ch = readCh();
1581            if (!warned) {
1582                warned = true;
                    // NOTE(review): ch was advanced just above, so the
                    // character reported here is the one following the
                    // offending character - confirm this is intended.
1583                error("invalid.commentchar",
1584                  String.valueOf((char)ch));
1585            }
1586            break;
1587            }
1588        }
1589
1590          default:
1591        // deal with marked sections
1592        StringBuffer   strBuff = new StringBuffer  ();
1593        while (true) {
1594            strBuff.append((char)ch);
1595            if (parseMarkupDeclarations(strBuff)) {
1596            return;
1597            }
1598            switch(ch) {
1599              case '>':
1600            ch = readCh();
                // falls through
1601              case -1:
1602            error("invalid.markup");
1603            return;
1604              case '\n':
1605            ln++;
1606            ch = readCh();
1607            lfCount++;
1608            break;
1609              case '\r':
1610            ln++;
1611            if ((ch = readCh()) == '\n') {
1612                ch = readCh();
1613                crlfCount++;
1614            }
1615            else {
1616                crCount++;
1617            }
1618            break;
1619
1620              default:
1621            ch = readCh();
1622            break;
1623            }
1624        }
1625        }
1626
1627      case '/':
1628        // parse end tag [19] 317:4
1629        switch (ch = readCh()) {
1630          case '>':
1631        ch = readCh();
            // falls through
1632          case '<':
1633        // empty end tag. either </> or </<
1634        if (recent == null) {
1635            error("invalid.shortend");
1636            return;
1637        }
1638        elem = recent;
1639        break;
1640
1641          default:
1642        if (!parseIdentifier(true)) {
1643            error("expected.endtagname");
1644            return;
1645        }
1646        skipSpace();
1647        switch (ch) {
1648          case '>':
1649            ch = readCh();
                // falls through
1650          case '<':
1651            break;
1652
1653          default:
                // skip the rest of the line or up to and including '>'
1654            error("expected", "'>'");
1655            while ((ch != -1) && (ch != '\n') && (ch != '>')) {
1656            ch = readCh();
1657            }
1658            if (ch == '>') {
1659            ch = readCh();
1660            }
1661            break;
1662        }
1663        String   elemStr = getString(0);
1664        if (!dtd.elementExists(elemStr)) {
1665            error("end.unrecognized", elemStr);
1666            // Ignore RE before end tag
1667            if ((textpos > 0) && (text[textpos-1] == '\n')) {
1668            textpos--;
1669            }
                // the DTD's "unknown" element is renamed to the tag just seen
1670            elem = dtd.getElement("unknown");
1671            elem.name = elemStr;
1672            unknown = true;
1673        } else {
1674            elem = dtd.getElement(elemStr);
1675        }
1676        break;
1677        }
1678
1679
1680        // If the stack is null, we're seeing end tags without any begin
1681        // tags.  Ignore them.
1682
1683        if (stack == null) {
1684        error("end.extra.tag", elem.getName());
1685        return;
1686        }
1687
1688        // Ignore RE before end tag
1689            if ((textpos > 0) && (text[textpos-1] == '\n')) {
1690        // In a pre tag, if there are blank lines
1691        // we do not want to remove the newline
1692        // before the end tag.  Hence this code.
1693        //
1694        if (stack.pre) {
1695            if ((textpos > 1) && (text[textpos-2] != '\n')) {
1696            textpos--;
1697            }
1698        } else {
1699            textpos--;
1700        }
1701        }
1702
1703        // If the end tag is a form, since we did not put it
1704        // on the tag stack, there is no corresponding
1705        // start tag to find. Hence do not touch the tag stack.
1706        //
1707
1708            /*
1709        if (!strict && elem.getName().equals("form")) {
1710        if (lastFormSent != null) {
1711            handleEndTag(lastFormSent);
1712            return;
1713        } else {
1714            // do nothing.
1715            return;
1716        }
1717        }
1718            */
1719
1720        if (unknown) {
1721        // we will not see a corresponding start tag
1722        // on the stack.  If we are seeing an
1723        // end tag, lets send this on as an empty
1724        // tag with the end tag attribute set to
1725        // true.
1726        TagElement   t = makeTag(elem);
1727        handleText(t);
1728        attributes.addAttribute(HTML.Attribute.ENDTAG, "true");
1729        handleEmptyTag(makeTag(elem));
1730        unknown = false;
1731        return;
1732        }
1733
1734        // find the corresponding start tag
1735
1736        // A commonly occurring error appears to be the insertion
1737        // of extra end tags in a table.  The intent here is ignore
1738        // such extra end tags.
1739        //
1740        if (!strict) {
1741        String   stackElem = stack.elem.getName();
1742
1743        if (stackElem.equals("table")) {
1744            // If it isn't a valid end tag ignore it and return
1745            //
1746            if (!elem.getName().equals(stackElem)) {
1747            error("tag.ignore", elem.getName());
1748            return;
1749            }
1750        }
1751
1752
1753
1754        if (stackElem.equals("tr") ||
1755            stackElem.equals("td")) {
1756            if ((!elem.getName().equals("table")) &&
1757            (!elem.getName().equals(stackElem))) {
1758            error("tag.ignore", elem.getName());
1759            return;
1760            }
1761        }
1762        }
            // walk down the tag stack looking for the element being closed
1763        TagStack   sp = stack;
1764
1765        while ((sp != null) && (elem != sp.elem)) {
1766        sp = sp.next;
1767        }
1768        if (sp == null) {
1769        error("unmatched.endtag", elem.getName());
1770        return;
1771        }
1772
1773        // People put font ending tags in the darndest places.
1774        // Don't close other contexts based on them being between
1775        // a font tag and the corresponding end tag.  Instead,
1776        // ignore the end tag like it doesn't exist and allow the end
1777        // of the document to close us out.
1778        String   elemName = elem.getName();
1779        if (stack != sp &&
1780        (elemName.equals("font") ||
1781         elemName.equals("center"))) {
1782
1783        // Since closing out a center tag can have real weird
1784        // effects on the formatting,  make sure that tags
1785        // for which omitting an end tag is legitimate
1786        // get closed out.
1787        //
1788        if (elemName.equals("center")) {
1789            while(stack.elem.omitEnd() && stack != sp) {
1790            endTag(true);
1791            }
1792            if (stack.elem == elem) {
1793            endTag(false);
1794            }
1795        }
1796        return;
1797        }
1798        // People do the same thing with center tags.  In this
1799        // case we would like to close off the center tag but
1800        // not necessarily all enclosing tags.
1801
1802
1803
1804        // end tags
1805        while (stack != sp) {
1806        endTag(true);
1807        }
1808
1809        endTag(false);
1810        return;
1811
1812      case -1:
1813        error("eof");
1814        return;
1815    }
1816
1817    // start tag [14] 314:1
1818    if (!parseIdentifier(true)) {
            // "<>" repeats the most recent element, if there is one
1819        elem = recent;
1820        if ((ch != '>') || (elem == null)) {
1821        error("expected.tagname");
1822        return;
1823        }
1824    } else {
1825        String   elemStr = getString(0);
1826
            // "image" is accepted as another spelling of "img"
1827        if (elemStr.equals("image")) {
1828        elemStr = new String  ("img");
1829        }
1830
1831        /* determine if this element is part of the dtd. */
1832
1833        if (!dtd.elementExists(elemStr)) {
1834        //      parseInvalidTag();
            // NOTE(review): this error key ends in a blank, unlike the other
            // keys used in this method - confirm against the message resource.
1835        error("tag.unrecognized ", elemStr);
1836        elem = dtd.getElement("unknown");
1837        elem.name = elemStr;
1838        unknown = true;
1839        } else {
1840        elem = dtd.getElement(elemStr);
1841        }
1842    }
1843
1844    // Parse attributes
1845    parseAttributeSpecificationList(elem);
1846
1847    switch (ch) {
1848      case '/':
1849        net = true;
            // falls through
1850      case '>':
1851        ch = readCh();
            // falls through
1852      case '<':
1853        break;
1854
1855      default:
1856        error("expected", "'>'");
1857        break;
1858    }
1859
1860    if (!strict) {
1861      if (elem.getName().equals("script")) {
1862        error("javascript.unsupported");
1863      }
1864    }
1865
1866    // ignore RE after start tag
1867    //
1868    if (!elem.isEmpty())  {
1869        if (ch == '\n') {
1870        ln++;
1871        lfCount++;
1872        ch = readCh();
1873        } else if (ch == '\r') {
1874        ln++;
1875        if ((ch = readCh()) == '\n') {
1876            ch = readCh();
1877            crlfCount++;
1878        }
1879        else {
1880            crCount++;
1881        }
1882        }
1883    }
1884
1885    // ensure a legal context for the tag
1886    TagElement   tag = makeTag(elem, false);
1887
1888
1889    /** In dealing with forms, we have decided to treat
1890        them as legal in any context.  Also, even though
1891        they do have a start and an end tag, we will
1892        not put this tag on the stack.  This is to deal
1893        several pages in the web oasis that choose to
1894        start and end forms in any possible location. **/
1895
1896        /*
1897    if (!strict && elem.getName().equals("form")) {
1898        if (lastFormSent == null) {
1899        lastFormSent = tag;
1900        } else {
1901        handleEndTag(lastFormSent);
1902        lastFormSent = tag;
1903        }
1904    } else {
1905        */
1906        // Similarly, if a tag is unknown, we will apply
1907        // no legalTagContext logic to it.
1908        //
1909        if (!unknown) {
1910        legalTagContext(tag);
1911
1912        // If skip tag is true,  this implies that
1913        // the tag was illegal and that the error
1914        // recovery strategy adopted is to ignore
1915        // the tag.
1916        if (!strict && skipTag) {
1917            skipTag = false;
1918            return;
1919        }
1920        }
1921            /*
1922    }
1923            */
1924
1925    startTag(tag);
1926
        // CDATA and RCDATA content is read right here as literal text; for
        // other content the null-end-tag flag is recorded on the tag stack
1927    if (!elem.isEmpty()) {
1928        switch (elem.getType()) {
1929          case CDATA:
1930        parseLiteral(false);
1931        break;
1932          case RCDATA:
1933        parseLiteral(true);
1934        break;
1935          default:
1936        if (stack != null) {
1937            stack.net = net;
1938        }
1939        break;
1940        }
1941    }
1942    }
1943
1944    /**
1945     * Parse Content. [24] 320:1
1946     */
1947    void parseContent() throws IOException {
1948    Thread   curThread = Thread.currentThread();
1949
1950    for (;;) {
1951        if (curThread.isInterrupted()) {
1952                curThread.interrupt(); // resignal the interrupt
1953                break;
1954            }
1955
1956        int c = ch;
1957        currentBlockStartPos = currentPosition;
1958        switch (c) {
1959          case '<':
1960        parseTag();
1961        lastBlockStartPos = currentPosition;
1962        continue;
1963
1964          case '/':
1965        ch = readCh();
1966        if ((stack != null) && stack.net) {
1967            // null end tag.
1968            endTag(false);
1969            continue;
1970        }
1971        break;
1972
1973          case -1:
1974        return;
1975
1976          case '&':
1977        if (textpos == 0) {
1978            if (!legalElementContext(dtd.pcdata)) {
1979            error("unexpected.pcdata");
1980            }
1981            if (last.breaksFlow()) {
1982            space = false;
1983            }
1984        }
1985        char data[] = parseEntityReference();
1986        if (textpos + data.length + 1 > text.length) {
1987            char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
1988            System.arraycopy(text, 0, newtext, 0, text.length);
1989            text = newtext;
1990        }
1991        if (space) {
1992            space = false;
1993            text[textpos++] = ' ';
1994        }
1995        System.arraycopy(data, 0, text, textpos, data.length);
1996        textpos += data.length;
1997                ignoreSpace = false;
1998        continue;
1999
2000          case '\n':
2001        ln++;
2002        lfCount++;
2003        ch = readCh();
2004        if ((stack != null) && stack.pre) {
2005            break;
2006        }
2007        if (textpos == 0) {
2008            lastBlockStartPos = currentPosition;
2009        }
2010                if (!ignoreSpace) {
2011                    space = true;
2012                }
2013        continue;
2014
2015          case '\r':
2016        ln++;
2017        c = '\n';
2018        if ((ch = readCh()) == '\n') {
2019            ch = readCh();
2020            crlfCount++;
2021        }
2022        else {
2023            crCount++;
2024        }
2025        if ((stack != null) && stack.pre) {
2026            break;
2027        }
2028        if (textpos == 0) {
2029            lastBlockStartPos = currentPosition;
2030        }
2031                if (!ignoreSpace) {
2032                    space = true;
2033                }
2034        continue;
2035
2036
2037          case '\t':
2038          case ' ':
2039        ch = readCh();
2040        if ((stack != null) && stack.pre) {
2041            break;
2042        }
2043        if (textpos == 0) {
2044            lastBlockStartPos = currentPosition;
2045        }
2046                if (!ignoreSpace) {
2047                    space = true;
2048                }
2049        continue;
2050
2051          default:
2052        if (textpos == 0) {
2053            if (!legalElementContext(dtd.pcdata)) {
2054            error("unexpected.pcdata");
2055            }
2056            if (last.breaksFlow()) {
2057            space = false;
2058            }
2059        }
2060        ch = readCh();
2061        break;
2062        }
2063
2064        // enlarge buffer if needed
2065        if (textpos + 2 > text.length) {
2066        char newtext[] = new char[text.length + 128];
2067        System.arraycopy(text, 0, newtext, 0, text.length);
2068        text = newtext;
2069        }
2070
2071        // output pending space
2072        if (space) {
2073        if (textpos == 0) {
2074            lastBlockStartPos--;
2075        }
2076        text[textpos++] = ' ';
2077        space = false;
2078        }
2079        text[textpos++] = (char)c;
2080            ignoreSpace = false;
2081    }
2082    }
2083
2084    /**
2085     * Returns the end of line string. This will return the end of line
2086     * string that has been encountered the most, one of \r, \n or \r\n.
2087     */
2088    String   getEndOfLineString() {
2089    if (crlfCount >= crCount) {
2090        if (lfCount >= crlfCount) {
2091        return "\n";
2092        }
2093        else {
2094        return "\r\n";
2095        }
2096    }
2097    else {
2098        if (crCount > lfCount) {
2099        return "\r";
2100        }
2101        else {
2102        return "\n";
2103        }
2104    }
2105    }
2106
2107    /**
2108     * Parse an HTML stream, given a DTD.
2109     */
2110    public synchronized void parse(Reader in) throws IOException {
2111    this.in = in;
2112
2113    this.ln = 1;
2114
2115    seenHtml = false;
2116    seenHead = false;
2117    seenBody = false;
2118
2119    crCount = lfCount = crlfCount = 0;
2120
2121    try {
2122        try {
2123        ch = readCh();
2124        text = new char[1024];
2125        str = new char[128];
2126
2127        parseContent();
2128        // NOTE: interruption may have occurred.  Control flows out
2129        // of here normally.
2130        while (stack != null) {
2131            endTag(true);
2132        }
2133        } finally {
2134        in.close();
2135        }
2136
2137    } catch (IOException e) {
2138        errorContext();
2139        error("ioexception");
2140        throw e;
2141    } catch (Exception   e) {
2142        errorContext();
2143        error("exception", e.getClass().getName(), e.getMessage());
2144        e.printStackTrace();
2145    } catch (ThreadDeath   e) {
2146        errorContext();
2147        error("terminated");
2148        e.printStackTrace();
2149        throw e;
2150    } finally {
2151        for (; stack != null ; stack = stack.next) {
2152        handleEndTag(stack.tag);
2153        }
2154
2155        text = null;
2156        str = null;
2157    }
2158
2159    }
2160
2161
2162    /*
2163     * Input cache.  This is much faster than calling down to a synchronized
2164     * method of BufferedReader for each byte.  Measurements done 5/30/97
2165     * show that there's no point in having a bigger buffer:  Increasing
2166     * the buffer to 8192 had no measurable impact for a program discarding
2167     * one character at a time (reading from an http URL to a local machine).
2168     * NOTE: If the current encoding is bogus, and we read too much
2169     * (past the content-type) we may suffer a MalformedInputException. For
2170     * this reason the initial size is 1 and when the body is encountered the
2171     * size is adjusted to 256.
2172     */
2173    private char buf[] = new char[1];
2174    private int pos;
2175    private int len;
2176    /*
2177    tracks position relative to the beginning of the
2178    document.
2179    */
2180    private int currentPosition;
2181
2182
2183    private final int readCh() throws IOException {
2184
2185    if (pos >= len) {
2186
2187        // This loop allows us to ignore interrupts if the flag
2188        // says so
2189        for (;;) {
2190        try {
2191            len = in.read(buf);
2192            break;
2193        } catch (InterruptedIOException ex) {
2194            throw ex;
2195        }
2196        }
2197
2198        if (len <= 0) {
2199        return -1;  // eof
2200        }
2201        pos = 0;
2202    }
2203    ++currentPosition;
2204
2205    return buf[pos++];
2206    }
2207
2208
2209    protected int getCurrentPos() {
2210    return currentPosition;
2211    }
2212}
2213
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags