Lexer


1   // HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
2   // http://sourceforge.org/projects/htmlparser
3   // Copyright (C) 2004 Derrick Oswald
4   //
5   // Revision Control Information
6   //
7   // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v $
8   // $Author: derrickoswald $
9   // $Date: 2005/03/13 14:51:43 $
10  // $Revision: 1.37 $
11  //
12  // This library is free software; you can redistribute it and/or
13  // modify it under the terms of the GNU Lesser General Public
14  // License as published by the Free Software Foundation; either
15  // version 2.1 of the License, or (at your option) any later version.
16  //
17  // This library is distributed in the hope that it will be useful,
18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  // Lesser General Public License for more details.
21  //
22  // You should have received a copy of the GNU Lesser General Public
23  // License along with this library; if not, write to the Free Software
24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25  //
26  
27  package org.htmlparser.lexer;
28  
29  import java.io.IOException  ;
30  import java.io.Serializable  ;
31  import java.net.MalformedURLException  ;
32  import java.net.URLConnection  ;
33  import java.util.Vector  ;
34  
35  import org.htmlparser.Node;
36  import org.htmlparser.NodeFactory;
37  import org.htmlparser.Remark;
38  import org.htmlparser.Text;
39  import org.htmlparser.Tag;
40  import org.htmlparser.http.ConnectionManager;
41  import org.htmlparser.nodes.RemarkNode;
42  import org.htmlparser.nodes.TextNode;
43  import org.htmlparser.nodes.TagNode;
44  import org.htmlparser.util.ParserException;
45  
46  /**
47   * This class parses the HTML stream into nodes.
48   * There are three major types of nodes (lexemes):
49   * <li>Remark</li>
50   * <li>Text</li>
51   * <li>Tag</li>
52   * Each time <code>nextNode()</code> is called, another node is returned until
53   * the stream is exhausted, and <code>null</code> is returned.
54   */
55  public class Lexer
56      implements
57          Serializable  ,
58          NodeFactory
59  {
60      /**
61       * The page lexemes are retrieved from.
62       */
63      protected Page mPage;
64  
65      /**
66       * The current position on the page.
67       */
68      protected Cursor mCursor;
69  
70      /**
71       * The factory for new nodes.
72       */
73      protected NodeFactory mFactory;
74  
75      /**
76       * Line number to trigger on.
77       * This is tested on each <code>nextNode()</code> call, as an aid to debugging.
78       * Alter this value and set a breakpoint on the line after the test.
79       * Remember, these line numbers are zero based, while most editors are one based.
80       * @see #nextNode
81       */ 
82      static protected int mDebugLineTrigger = -1;
83  
84      /**
85       * Creates a new instance of a Lexer.
86       */
87      public Lexer ()
88      {
89          this (new Page (""));
90      }
91  
92      /**
93       * Creates a new instance of a Lexer.
94       * @param page The page with HTML text.
95       */
96      public Lexer (Page page)
97      {
98          setPage (page);
99          setCursor (new Cursor (page, 0));
100         setNodeFactory (this);
101     }
102 
103     /**
104      * Creates a new instance of a Lexer.
105      * @param text The text to parse.
106      */
107     public Lexer (String   text)
108     {
109         this (new Page (text));
110     }
111 
112     /**
113      * Creates a new instance of a Lexer.
114      * @param connection The url to parse.
115      */
116     public Lexer (URLConnection   connection) throws ParserException
117     {
118         this (new Page (connection));
119     }
120 
121     /**
122      * Reset the lexer to start parsing from the beginning again.
123      * The underlying components are reset such that the next call to
124      * <code>nextNode()</code> will return the first lexeme on the page.
125      */
126     public void reset ()
127     {
128         getPage ().reset ();
129         setCursor (new Cursor (getPage (), 0));
130     }
131 
132     /**
133      * Get the page this lexer is working on.
134      * @return The page that nodes are being read from.
135      */
136     public Page getPage ()
137     {
138         return (mPage);
139     }
140 
141     /**
142      * Set the page this lexer is working on.
143      * @param page The page that nodes will be read from.
144      */
145     public void setPage (Page page)
146     {
147         if (null == page)
148             throw new IllegalArgumentException   ("page cannot be null");
149         // todo: sanity checks
150         mPage = page;
151     }
152 
153     /**
154      * Get the current scanning position.
155      * @return The lexer's cursor position.
156      */
157     public Cursor getCursor ()
158     {
159         return (mCursor);
160     }
161 
162     /**
163      * Set the current scanning position.
164      * @param cursor The lexer's new cursor position.
165      */
166     public void setCursor (Cursor cursor)
167     {
168         if (null == cursor)
169             throw new IllegalArgumentException   ("cursor cannot be null");
170         // todo: sanity checks
171         mCursor = cursor;
172     }
173 
174     /**
175      * Get the current node factory.
176      * @return The lexer's node factory.
177      */
178     public NodeFactory getNodeFactory ()
179     {
180         return (mFactory);
181     }
182 
183     /**
184      * Set the current node factory.
185      * @param factory The node factory to be used by the lexer.
186      */
187     public void setNodeFactory (NodeFactory factory)
188     {
189         if (null == factory)
190             throw new IllegalArgumentException   ("node factory cannot be null");
191         mFactory = factory;
192     }
193 
194     public int getPosition ()
195     {
196         return (getCursor ().getPosition ());
197     }
198 
199     public void setPosition (int position)
200     {
201         // todo: sanity checks
202         getCursor ().setPosition (position);
203     }
204 
205     /**
206      * Get the current line number.
207      * @return The line number the lexer's working on.
208      */
209     public int getCurrentLineNumber ()
210     {
211         return (getPage ().row (getCursor ()));
212     }
213 
214     /**
215      * Get the current line.
216      * @return The string the lexer's working on.
217      */
218     public String   getCurrentLine ()
219     {
220         return (getPage ().getLine (getCursor ()));
221     }
222 
223     /**
224      * Get the next node from the source.
225      * @return A Remark, Text or Tag, or <code>null</code> if no
226      * more lexemes are present.
227      * @exception ParserException If there is a problem with the underlying page.
228      */
229     public Node nextNode ()
230         throws
231             ParserException
232     {
233         return nextNode (false);
234     }
235 
236     /**
237      * Get the next node from the source.
238      * @param quotesmart If <code>true</code>, strings ignore quoted contents.
239      * @return A Remark, Text or Tag, or <code>null</code> if no
240      * more lexemes are present.
241      * @exception ParserException If there is a problem with the underlying page.
242      */
243     public Node nextNode (boolean quotesmart)
244         throws
245             ParserException
246     {
247         int start;
248         char ch;
249         Node ret;
250 
251         // debugging suppport
252         if (-1 != mDebugLineTrigger)
253         {
254             Page page = getPage ();
255             int lineno = page.row (mCursor);
256             if (mDebugLineTrigger < lineno)
257                 mDebugLineTrigger = lineno + 1; // trigger on subsequent lines too
258         }
259         start = mCursor.getPosition ();
260         ch = mPage.getCharacter (mCursor);
261         switch (ch)
262         {
263             case Page.EOF:
264                 ret = null;
265                 break;
266             case '<':
267                 ch = mPage.getCharacter (mCursor);
268                 if (Page.EOF == ch)
269                     ret = makeString (start, mCursor.getPosition ());
270                 else if ('%' == ch)
271                 {
272                     mCursor.retreat ();
273                     ret = parseJsp (start);
274                 }
275                 else if ('/' == ch || '%' == ch || Character.isLetter (ch))
276                 {
277                     mCursor.retreat ();
278                     ret = parseTag (start);
279                 }
280                 else if ('!' == ch)
281                 {
282                     ch = mPage.getCharacter (mCursor);
283                     if (Page.EOF == ch)
284                         ret = makeString (start, mCursor.getPosition ());
285                     else
286                     {
287                         if ('>' == ch) // handle <!>
288                             ret = makeRemark (start, mCursor.getPosition ());
289                         else
290                         {
291                             mCursor.retreat (); // remark and tag need this character
292                             if ('-' == ch)
293                                 ret = parseRemark (start, quotesmart);
294                             else
295                             {
296                                 mCursor.retreat (); // tag needs the previous one too
297                                 ret = parseTag (start);
298                             }
299                         }
300                     }
301                 }
302                 else
303                     ret = parseString (start, quotesmart);
304                 break;
305             default:
306                 mCursor.retreat (); // string needs to see leading foreslash
307                 ret = parseString (start, quotesmart);
308                 break;
309         }
310 
311         return (ret);
312     }
313 
314     /**
315      * Advance the cursor through a JIS escape sequence.
316      * @param cursor A cursor positioned within the escape sequence.
317      */
318     protected void scanJIS (Cursor cursor)
319         throws
320             ParserException
321     {
322         boolean done;
323         char ch;
324         int state;
325 
326         done = false;
327         state = 0;
328         while (!done)
329         {
330             ch = mPage.getCharacter (cursor);
331             if (Page.EOF == ch)
332                 done = true;
333             else
334                 switch (state)
335                 {
336                     case 0:
337                         if (0x1b == ch) // escape
338                             state = 1;
339                         break;
340                     case 1:
341                         if ('(' == ch)
342                             state = 2;
343                         else
344                             state = 0;
345                         break;
346                     case 2:
347                         if ('J' == ch)
348                             done = true;
349                         else
350                             state = 0;
351                         break;
352                     default:
353                         throw new IllegalStateException   ("how the fuck did we get in state " + state);
354                 }
355         }
356     }
357 
358     /**
359      * Parse a string node.
360      * Scan characters until "&lt;/", "&lt;%", "&lt;!" or &lt; followed by a
361      * letter is encountered, or the input stream is exhausted, in which
362      * case <code>null</code> is returned.
363      * @param start The position at which to start scanning.
364      * @param quotesmart If <code>true</code>, strings ignore quoted contents.
365      */
366     protected Node parseString (int start, boolean quotesmart)
367         throws
368             ParserException
369     {
370         boolean done;
371         char ch;
372         char quote;
373 
374         done = false;
375         quote = 0;
376         while (!done)
377         {
378             ch = mPage.getCharacter (mCursor);
379             if (Page.EOF == ch)
380                 done = true;
381             else if (0x1b == ch) // escape
382             {
383                 ch = mPage.getCharacter (mCursor);
384                 if (Page.EOF == ch)
385                     done = true;
386                 else if ('$' == ch)
387                 {
388                     ch = mPage.getCharacter (mCursor);
389                     if (Page.EOF == ch)
390                         done = true;
391                     else if ('B' == ch)
392                         scanJIS (mCursor);
393                     else
394                     {
395                         mCursor.retreat ();
396                         mCursor.retreat ();
397                     }
398                 }
399                 else
400                     mCursor.retreat ();
401             }
402             else if (quotesmart && (0 == quote) && (('\'' == ch) || ('"' == ch)))
403                 quote = ch; // enter quoted state
404             // patch contributed by Gernot Fricke to handle escaped closing quote
405             else if (quotesmart && (0 != quote) && ('\\' == ch))
406             {
407                 ch = mPage.getCharacter (mCursor); //try to consume escaped character
408                 if ((Page.EOF != ch)
409                     && ('\\' != ch) // escaped backslash
410                     && (ch != quote)) // escaped quote character 
411                        // ( reflects ["] or [']  whichever opened the quotation)
412                     mCursor.retreat(); // unconsume char if character was not an escapable char.
413             }
414             else if (quotesmart && (ch == quote))
415                 quote = 0; // exit quoted state
416             else if (quotesmart && (0 == quote) && (ch == '/'))
417             {
418                 // handle multiline and double slash comments (with a quote) in script like:
419                 // I can't handle single quotations.
420                 ch = mPage.getCharacter (mCursor);
421                 if (Page.EOF == ch)
422                     done = true;
423                 else if ('/' == ch)
424                 {
425                     do
426                         ch = mPage.getCharacter (mCursor);
427                     while ((Page.EOF != ch) && ('\n' != ch));
428                 }
429                 else if ('*' == ch)
430                 {
431                     do
432                     {
433                         do
434                             ch = mPage.getCharacter (mCursor);
435                         while ((Page.EOF != ch) && ('*' != ch));
436                         ch = mPage.getCharacter (mCursor);
437                         if (ch == '*')
438                             mCursor.retreat ();
439                     }
440                     while ((Page.EOF != ch) && ('/' != ch));
441                 }
442                 else
443                     mCursor.retreat ();
444             }
445             else if ((0 == quote) && ('<' == ch))
446             {
447                 ch = mPage.getCharacter (mCursor);
448                 if (Page.EOF == ch)
449                     done = true;
450                 // the order of these tests might be optimized for speed:
451                 else if ('/' == ch || Character.isLetter (ch) || '!' == ch || '%' == ch)
452                 {
453                     done = true;
454                     mCursor.retreat ();
455                     mCursor.retreat ();
456                 }
457                 else
458                 {
459                     // it's not a tag, so keep going, but check for quotes
460                     mCursor.retreat ();
461                 }
462             }
463         }
464 
465         return (makeString (start, mCursor.getPosition ()));
466     }
467 
468     /**
469      * Create a string node based on the current cursor and the one provided.
470      */
471     protected Node makeString (int start, int end)
472         throws
473             ParserException
474     {
475         int length;
476         Node ret;
477 
478         length = end - start;
479         if (0 != length)
480         {   // got some characters
481             ret = getNodeFactory ().createStringNode (this.getPage (), start, end);
482         }
483         else
484             ret = null;
485         
486         return (ret);
487     }
488 
489     private void whitespace (Vector   attributes, int[] bookmarks)
490     {
491         if (bookmarks[1] > bookmarks[0])
492             attributes.addElement (new PageAttribute (mPage, -1, -1, bookmarks[0], bookmarks[1], (char)0));
493     }
494 
495     private void standalone (Vector   attributes, int[] bookmarks)
496     {
497         attributes.addElement (new PageAttribute (mPage, bookmarks[1], bookmarks[2], -1, -1, (char)0));
498     }
499 
500     private void empty (Vector   attributes, int[] bookmarks)
501     {
502         attributes.addElement (new PageAttribute (mPage, bookmarks[1], bookmarks[2], bookmarks[2] + 1, -1, (char)0));
503     }
504 
505     private void naked (Vector   attributes, int[] bookmarks)
506     {
507         attributes.addElement (new PageAttribute (mPage, bookmarks[1], bookmarks[2], bookmarks[3], bookmarks[4], (char)0));
508     }
509 
510     private void single_quote (Vector   attributes, int[] bookmarks)
511     {
512         attributes.addElement (new PageAttribute (mPage, bookmarks[1], bookmarks[2], bookmarks[4] + 1, bookmarks[5], '\''));
513     }
514 
515     private void double_quote (Vector   attributes, int[] bookmarks)
516     {
517         attributes.addElement (new PageAttribute (mPage, bookmarks[1], bookmarks[2], bookmarks[5] + 1, bookmarks[6], '"'));
518     }
519 
520     /**
521      * Parse a tag.
522      * Parse the name and attributes from a start tag.<p>
523      * From the <a HREF="http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2">
524      * HTML 4.01 Specification, W3C Recommendation 24 December 1999</a>
525      * http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2<p>
526      * <cite>
527      * 3.2.2 Attributes<p>
528      * Elements may have associated properties, called attributes, which may
529      * have values (by default, or set by authors or scripts). Attribute/value
530      * pairs appear before the final ">" of an element's start tag. Any number
531      * of (legal) attribute value pairs, separated by spaces, may appear in an
532      * element's start tag. They may appear in any order.<p>
533      * In this example, the id attribute is set for an H1 element:
534      * <code>
535      * &lt;H1 id="section1"&gt;
536      * </code>
537      * This is an identified heading thanks to the id attribute
538      * <code>
539      * &lt;/H1&gt;
540      * </code>
541      * By default, SGML requires that all attribute values be delimited using
542      * either double quotation marks (ASCII decimal 34) or single quotation
543      * marks (ASCII decimal 39). Single quote marks can be included within the
544      * attribute value when the value is delimited by double quote marks, and
545      * vice versa. Authors may also use numeric character references to
546      * represent double quotes (&amp;#34;) and single quotes (&amp;#39;).
547      * For doublequotes authors can also use the character entity reference &amp;quot;.<p>
548      * In certain cases, authors may specify the value of an attribute without
549      * any quotation marks. The attribute value may only contain letters
550      * (a-z and A-Z), digits (0-9), hyphens (ASCII decimal 45),
551      * periods (ASCII decimal 46), underscores (ASCII decimal 95),
552      * and colons (ASCII decimal 58). We recommend using quotation marks even
553      * when it is possible to eliminate them.<p>
554      * Attribute names are always case-insensitive.<p>
555      * Attribute values are generally case-insensitive. The definition of each
556      * attribute in the reference manual indicates whether its value is case-insensitive.<p>
557      * All the attributes defined by this specification are listed in the attribute index.<p>
558      * </cite>
559      * <p>
560      * This method uses a state machine with the following states:
561      * <ol>
562      * <li>state 0 - outside of any attribute</li>
563      * <li>state 1 - within attributre name</li>
564      * <li>state 2 - equals hit</li>
565      * <li>state 3 - within naked attribute value.</li>
566      * <li>state 4 - within single quoted attribute value</li>
567      * <li>state 5 - within double quoted attribute value</li>
568      * <li>state 6 - whitespaces after attribute name could lead to state 2 (=)or state 0</li>
569      * </ol>
570      * <p>
571      * The starting point for the various components is stored in an array
572      * of integers that match the initiation point for the states one-for-one,
573      * i.e. bookmarks[0] is where state 0 began, bookmarks[1] is where state 1
574      * began, etc.
575      * Attributes are stored in a <code>Vector</code> having
576      * one slot for each whitespace or attribute/value pair.
577      * The first slot is for attribute name (kind of like a standalone attribute).
578      * @param start The position at which to start scanning.
579      * @return The parsed tag.
580      */
581     protected Node parseTag (int start)
582         throws
583             ParserException
584     {
585         boolean done;
586         char ch;
587         int state;
588         int[] bookmarks;
589         Vector   attributes;
590 
591         done = false;
592         attributes = new Vector   ();
593         state = 0;
594         bookmarks = new int[8];
595         bookmarks[0] = mCursor.getPosition ();
596         while (!done)
597         {
598             bookmarks[state + 1] = mCursor.getPosition ();
599             ch = mPage.getCharacter (mCursor);
600             switch (state)
601             {
602                 case 0: // outside of any attribute
603                     if ((Page.EOF == ch) || ('>' == ch) || ('<' == ch))
604                     {
605                         if ('<' == ch)
606                         {
607                             // don't consume the opening angle
608                             mCursor.retreat ();
609                             bookmarks[state + 1] = mCursor.getPosition ();
610                         }
611                         whitespace (attributes, bookmarks);
612                         done = true;
613                     }
614                     else if (!Character.isWhitespace (ch))
615                     {
616                         whitespace (attributes, bookmarks);
617                         state = 1;
618                     }
619                     break;
620                 case 1: // within attribute name
621                     if ((Page.EOF == ch) || ('>' == ch) || ('<' == ch))
622                     {
623                         if ('<' == ch)
624                         {
625                             // don't consume the opening angle
626                             mCursor.retreat ();
627                             bookmarks[state + 1] = mCursor.getPosition ();
628                         }
629                         standalone (attributes, bookmarks);
630                         done = true;
631                     }
632                     else if (Character.isWhitespace (ch))
633                     {
634                         // whitespaces might be followed by next attribute or an equal sign
635                         // see Bug #891058 Bug in lexer.
636                         bookmarks[6] = bookmarks[2]; // setting the bookmark[0] is done in state 6 if applicable
637                         state = 6;
638                     }
639                     else if ('=' == ch)
640                         state = 2;
641                     break;
642                 case 2: // equals hit
643                     if ((Page.EOF == ch) || ('>' == ch))
644                     {
645                         empty (attributes, bookmarks);
646                         done = true;
647                     }
648                     else if ('\'' == ch)
649                     {
650                         state = 4;
651                         bookmarks[4] = bookmarks[3];
652                     }
653                     else if ('"' == ch)
654                     {
655                         state = 5;
656                         bookmarks[5] = bookmarks[3];
657                     }
658                     else if (Character.isWhitespace (ch))
659                     { 
660                         // collect white spaces after "=" into the assignment string;
661                         // do nothing
662                         // see Bug #891058 Bug in lexer.
663                     }
664                     else
665                         state = 3;
666                     break;
667                 case 3: // within naked attribute value
668                     if ((Page.EOF == ch) || ('>' == ch))
669                     {
670                         naked (attributes, bookmarks);
671                         done = true;
672                     }
673                     else if (Character.isWhitespace (ch))
674                     {
675                         naked (attributes, bookmarks);
676                         bookmarks[0] = bookmarks[4];
677                         state = 0;
678                     }
679                     break;
680                 case 4: // within single quoted attribute value
681                     if (Page.EOF == ch)
682                     {
683                         single_quote (attributes, bookmarks);
684                         done = true; // complain?
685                     }
686                     else if ('\'' == ch)
687                     {
688                         single_quote (attributes, bookmarks);
689                         bookmarks[0] = bookmarks[5] + 1;
690                         state = 0;
691                     }
692                     break;
693                 case 5: // within double quoted attribute value
694                     if (Page.EOF == ch)
695                     {
696                         double_quote (attributes, bookmarks);
697                         done = true; // complain?
698                     }
699                     else if ('"' == ch)
700                     {
701                         double_quote (attributes, bookmarks);
702                         bookmarks[0] = bookmarks[6] + 1;
703                         state = 0;
704                     }
705                     break;
706                 // patch for lexer state correction by
707                 // Gernot Fricke
708                 // See Bug # 891058 Bug in lexer.
709                 case 6: // undecided for state 0 or 2
710                         // we have read white spaces after an attributte name
711                     if (Page.EOF == ch)
712                     {
713                         // same as last else clause
714                         standalone (attributes, bookmarks);
715                         bookmarks[0]=bookmarks[6];
716                         mCursor.retreat();
717                         state=0;
718                     }
719                     else if (Character.isWhitespace (ch))
720                     { 
721                         // proceed
722                     } 
723                     else if ('=' == ch) // yepp. the white spaces belonged to the equal.
724                     {
725                         bookmarks[2] = bookmarks[6];
726                         bookmarks[3] = bookmarks[7];
727                         state=2;
728                     }
729                     else
730                     {
731                         // white spaces were not ended by equal
732                         // meaning the attribute was a stand alone attribute
733                         // now: create the stand alone attribute and rewind 
734                         // the cursor to the end of the white spaces
735                         // and restart scanning as whitespace attribute.
736                         standalone (attributes, bookmarks);
737                         bookmarks[0]=bookmarks[6];
738                         mCursor.retreat();
739                         state=0;
740                     }
741                     break;
742                 default:
743                     throw new IllegalStateException   ("how the fuck did we get in state " + state);
744             }
745         }
746 
747         return (makeTag (start, mCursor.getPosition (), attributes));
748     }
749 
750     /**
751      * Create a tag node based on the current cursor and the one provided.
752      */
753     protected Node makeTag (int start, int end, Vector   attributes)
754         throws
755             ParserException
756     {
757         int length;
758         Node ret;
759 
760         length = end - start;
761         if (0 != length)
762         {   // return tag based on second character, '/', '%', Letter (ch), '!'
763             if (2 > length)
764                 // this is an error
765                 return (makeString (start, end));
766             ret = getNodeFactory ().createTagNode (this.getPage (), start, end, attributes);
767         }
768         else
769             ret = null;
770 
771         return (ret);
772     }
773 
774     /**
775      * Parse a comment.
776      * Parse a remark markup.<p>
777      * From the <a HREF="http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.4">
778      * HTML 4.01 Specification, W3C Recommendation 24 December 1999</a>
779      * http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.4<p>
780      * <cite>
781      * 3.2.4 Comments<p>
782      * HTML comments have the following syntax:<p>
783      * <code>
784      * &lt;!-- this is a comment --&gt;<p>
785      * &lt;!-- and so is this one,<p>
786      *     which occupies more than one line --&gt;<p>
787      * </code>
788      * White space is not permitted between the markup declaration
789      * open delimiter("&lt;!") and the comment open delimiter ("--"),
790      * but is permitted between the comment close delimiter ("--") and
791      * the markup declaration close delimiter ("&gt;").
792      * A common error is to include a string of hyphens ("---") within a comment.
793      * Authors should avoid putting two or more adjacent hyphens inside comments.
794      * Information that appears between comments has no special meaning
795      * (e.g., character references are not interpreted as such).
796      * Note that comments are markup.<p>
797      * </cite>
798      * <p>
799      * This method uses a state machine with the following states:
800      * <ol>
801      * <li>state 0 - prior to the first open delimiter</li>
802      * <li>state 1 - prior to the second open delimiter</li>
803      * <li>state 2 - prior to the first closing delimiter</li>
804      * <li>state 3 - prior to the second closing delimiter</li>
805      * <li>state 4 - prior to the terminating &gt;</li>
806      * </ol>
807      * <p>
808      * All comment text (everything excluding the &lt; and &gt;), is included
809      * in the remark text.
810      * We allow terminators like --!&gt; even though this isn't part of the spec.
811      * @param start The position at which to start scanning.
812      * @param quotesmart If <code>true</code>, strings ignore quoted contents.
813      */
814     protected Node parseRemark (int start, boolean quotesmart)
815         throws
816             ParserException
817     {
818         boolean done;
819         char ch;
820         int state;
821 
822         done = false;
823         state = 0;
824         while (!done)
825         {
826             ch = mPage.getCharacter (mCursor);
827             if (Page.EOF == ch)
828                 done = true;
829             else
830                 switch (state)
831                 {
832                     case 0: // prior to the first open delimiter
833                         if ('>' == ch)
834                             done = true;
835                         if ('-' == ch)
836                             state = 1;
837                         else
838                             return (parseString (start, quotesmart));
839                         break;
840                     case 1: // prior to the second open delimiter
841                         if ('-' == ch)
842                         {
843                             // handle <!--> because netscape does
844                             ch = mPage.getCharacter (mCursor);
845                             if (Page.EOF == ch)
846                                 done = true;
847                             else if ('>' == ch)
848                                 done = true;
849                             else
850                             {
851                                 mCursor.retreat ();
852                                 state = 2;
853                             }                        
854                         }
855                         else
856                             return (parseString (start, quotesmart));
857                         break;
858                     case 2: // prior to the first closing delimiter
859                         if ('-' == ch)
860                             state = 3;
861                         else if (Page.EOF == ch)
862                             return (parseString (start, quotesmart)); // no terminator
863                         break;
864                     case 3: // prior to the second closing delimiter
865                         if ('-' == ch)
866                             state = 4;
867                         else
868                             state = 2;
869                         break;
870                     case 4: // prior to the terminating >
871                         if ('>' == ch)
872                             done = true;
873                         else if (('!' == ch) || ('-' == ch) || Character.isWhitespace (ch))
874                         {
875                             // stay in state 4
876                         }
877                         else
878                             state = 2;
879                         break;
880                     default:
881                         throw new IllegalStateException   ("how the fuck did we get in state " + state);
882                 }
883         }
884 
885         return (makeRemark (start, mCursor.getPosition ()));
886     }
887 
888     /**
889      * Create a remark node based on the current cursor and the one provided.
890      */
891     protected Node makeRemark (int start, int end)
892         throws
893             ParserException
894     {
895         int length;
896         Node ret;
897 
898         length = end - start;
899         if (0 != length)
900         {   // return tag based on second character, '/', '%', Letter (ch), '!'
901             if (2 > length)
902                 // this is an error
903                 return (makeString (start, end));
904             ret = getNodeFactory ().createRemarkNode (this.getPage (), start, end);
905         }
906         else
907             ret = null;
908         
909         return (ret);
910     }
911 
912     /**
913      * Parse a java server page node.
914      * Scan characters until "%&gt;" is encountered, or the input stream is
915      * exhausted, in which case <code>null</code> is returned.
916      * @param start The position at which to start scanning.
917      */
918     protected Node parseJsp (int start)
919         throws
920             ParserException
921     {
922         boolean done;
923         char ch;
924         int state;
925         Vector   attributes;
926         int code;
927 
928         done = false;
929         state = 0;
930         code = 0;
931         attributes = new Vector   ();
932         // <%xyz%>
933         // 012223d
934         // <%=xyz%>
935         // 0122223d
936         // <%@xyz%d
937         // 0122223d
938         while (!done)
939         {
940             ch = mPage.getCharacter (mCursor);
941             switch (state)
942             {
943                 case 0: // prior to the percent
944                     switch (ch)
945                     {
946                         case '%': // <%
947                             state = 1;
948                             break;
949                         // case Page.EOF: // <\0
950                         // case '>': // <>
951                         default:
952                             done = true;
953                             break;
954                     }
955                     break;
956                 case 1: // prior to the optional qualifier
957                     switch (ch)
958                     {
959                         case Page.EOF:   // <%\0
960                         case '>': // <%>
961                             done = true;
962                             break;
963                         case '=': // <%=
964                         case '@': // <%@
965                             code = mCursor.getPosition ();
966                             attributes.addElement (new PageAttribute (mPage, start + 1, code, -1, -1, (char)0));
967                             state = 2;
968                             break;
969                         default:  // <%x
970                             code = mCursor.getPosition () - 1;
971                             attributes.addElement (new PageAttribute (mPage, start + 1, code, -1, -1, (char)0));
972                             state = 2;
973                             break;
974                     }
975                     break;
976                 case 2: // prior to the closing percent
977                     switch (ch)
978                     {
979                         case Page.EOF: // <%x\0
980                         case '>': // <%x>
981                             done = true;
982                             break;
983                         case '\'':
984                         case '"':// <%???"
985                             state = ch;
986                             break;
987                         case '%': // <%???%
988                             state = 3;
989                             break;
990                         default:  // <%???x
991                             break;
992                     }
993                     break;
994                 case 3:
995                     switch (ch)
996                     {
997                         case Page.EOF: // <%x??%\0
998                             done = true;
999                             break;
1000                        case '>':
1001                            state = 4;
1002                            done = true;
1003                            break;
1004                        default:  // <%???%x
1005                            state = 2;
1006                            break;
1007                    }
1008                    break;
1009                case '"':
1010                    switch (ch)
1011                    {
1012                        case Page.EOF: // <%x??"\0
1013                            done = true;
1014                            break;
1015                        case '"':
1016                            state = 2;
1017                            break;
1018                        default:  // <%???'??x
1019                            break;
1020                    }
1021                    break;
1022                case '\'':
1023                    switch (ch)
1024                    {
1025                        case Page.EOF: // <%x??'\0
1026                            done = true;
1027                            break;
1028                        case '\'':
1029                            state = 2;
1030                            break;
1031                        default:  // <%???"??x
1032                            break;
1033                    }
1034                    break;
1035                default:
1036                    throw new IllegalStateException   ("how the fuck did we get in state " + state);
1037            }
1038        }
1039
1040        if (4 == state) // normal exit
1041        {
1042            if (0 != code)
1043            {
1044                state = mCursor.getPosition () - 2; // reuse state
1045                attributes.addElement (new PageAttribute (mPage, code, state, -1, -1, (char)0));
1046                attributes.addElement (new PageAttribute (mPage, state, state + 1, -1, -1, (char)0));
1047            }
1048            else
1049                throw new IllegalStateException   ("jsp with no code!");
1050        }
1051        else
1052            return (parseString (start, true)); // hmmm, true?
1053
1054        return (makeTag (start, mCursor.getPosition (), attributes));
1055    }
1056
1057    /**
1058     * Return CDATA as a text node.
1059     * According to appendix <a HREF="http://www.w3.org/TR/html4/appendix/notes.html#notes-specifying-data">
1060     * B.3.2 Specifying non-HTML data</a> of the
1061     * <a HREF="http://www.w3.org/TR/html4/">HTML 4.01 Specification</a>:<br>
1062     * <quote>
1063     * <b>Element content</b><br>
1064     * When script or style data is the content of an element (SCRIPT and STYLE),
1065     * the data begins immediately after the element start tag and ends at the
1066     * first ETAGO ("&lt;/") delimiter followed by a name start character ([a-zA-Z]);
1067     * note that this may not be the element's end tag.
1068     * Authors should therefore escape "&lt;/" within the content. Escape mechanisms
1069     * are specific to each scripting or style sheet language.
1070     * </quote>
1071     * @return The <code>TextNode</code> of the CDATA or <code>null</code> if none.
1072     */
1073    public Node parseCDATA ()
1074        throws
1075            ParserException
1076    {
1077        return (parseCDATA (false));
1078    }
1079
1080    /**
1081     * Return CDATA as a text node.
1082     * Slightly less rigid than {@link #parseCDATA()} this method provides for
1083     * parsing CDATA that may contain quoted strings that have embedded
1084     * ETAGO ("&lt;/") delimiters and skips single and multiline comments.
1085     * @param quotesmart If <code>true</code> the strict definition of CDATA is
1086     * extended to allow for single or double quoted ETAGO ("&lt;/") sequences.
1087     * @return The <code>TextNode</code> of the CDATA or <code>null</code> if none.
1088     * @see #parseCDATA()
1089     */
1090    public Node parseCDATA (boolean quotesmart)
1091        throws
1092            ParserException
1093    {
1094        int start;
1095        int state;
1096        boolean done;
1097        char quote;
1098        char ch;
1099        int end;
1100
1101        start = mCursor.getPosition ();
1102        state = 0;
1103        done = false;
1104        quote = 0;
1105        while (!done)
1106        {
1107            ch = mPage.getCharacter (mCursor);
1108            switch (state)
1109            {
1110                case 0: // prior to ETAGO
1111                    switch (ch)
1112                    {
1113                        case Page.EOF:
1114                            done = true;
1115                            break;
1116                        case '\'':
1117                            if (quotesmart)
1118                                if (0 == quote)
1119                                    quote = '\''; // enter quoted state
1120                                else if ('\'' == quote)
1121                                    quote = 0; // exit quoted state
1122                            break;
1123                        case '"':
1124                            if (quotesmart)
1125                                if (0 == quote)
1126                                    quote = '"'; // enter quoted state
1127                                else if ('"' == quote)
1128                                    quote = 0; // exit quoted state
1129                            break;
1130                        case '\\':
1131                            if (quotesmart)
1132                                if (0 != quote)
1133                                {
1134                                    ch = mPage.getCharacter (mCursor); // try to consume escaped character
1135                                    if (Page.EOF == ch)
1136                                        done = true;
1137                                    else if (  (ch != '\\') && (ch != quote))
1138                                        mCursor.retreat (); // unconsume char if character was not an escapable char.
1139                                }
1140                            break;
1141                        case '/':
1142                            if (quotesmart)
1143                                if (0 == quote)
1144                                {
1145                                    // handle multiline and double slash comments (with a quote)
1146                                    ch = mPage.getCharacter (mCursor);
1147                                    if (Page.EOF == ch)
1148                                        done = true;
1149                                    else if ('/' == ch)
1150                                    {
1151                                        do
1152                                            ch = mPage.getCharacter (mCursor);
1153                                        while ((Page.EOF != ch) && ('\n' != ch));
1154                                    }
1155                                    else if ('*' == ch)
1156                                    {
1157                                        do
1158                                        {
1159                                            do
1160                                                ch = mPage.getCharacter (mCursor);
1161                                            while ((Page.EOF != ch) && ('*' != ch));
1162                                            ch = mPage.getCharacter (mCursor);
1163                                            if (ch == '*')
1164                                                mCursor.retreat ();
1165                                        }
1166                                        while ((Page.EOF != ch) && ('/' != ch));
1167                                    }
1168                                    else
1169                                        mCursor.retreat ();
1170                                }
1171                            break;
1172                        case '<':
1173                            if (quotesmart)
1174                            {
1175                                if (0 == quote)
1176                                    state = 1;
1177                            }
1178                            else
1179                                state = 1;
1180                            break;
1181                        default:
1182                            break;
1183                    }
1184                    break;
1185                case 1: // <
1186                    switch (ch)
1187                    {
1188                        case Page.EOF:
1189                            done = true;
1190                            break;
1191                        case '/':
1192                            state = 2;
1193                            break;
1194                        default:
1195                            state = 0;
1196                            break;
1197                    }
1198                    break;
1199                case 2: // </
1200                    if (Page.EOF == ch)
1201                        done = true;
1202                    else if (Character.isLetter (ch))
1203                    {
1204                        done = true;
1205                        // back up to the start of ETAGO
1206                        mCursor.retreat ();
1207                        mCursor.retreat ();
1208                        mCursor.retreat ();
1209                    }
1210                    else
1211                        state = 0;
1212                    break;
1213                default:
1214                    throw new IllegalStateException   ("how the fuck did we get in state " + state);
1215            }
1216        }
1217        end = mCursor.getPosition ();
1218
1219        return (makeString (start, end));
1220    }
1221
1222    //
1223    // NodeFactory interface
1224    //
1225
1226    /**
1227     * Create a new string node.
1228     * @param page The page the node is on.
1229     * @param start The beginning position of the string.
1230     * @param end The ending positiong of the string.
1231     */
1232    public Text createStringNode (Page page,  int start, int end)
1233    {
1234        return (new TextNode (page, start, end));
1235    }
1236
1237    /**
1238     * Create a new remark node.
1239     * @param page The page the node is on.
1240     * @param start The beginning position of the remark.
1241     * @param end The ending positiong of the remark.
1242     */
1243    public Remark createRemarkNode (Page page,  int start, int end)
1244    {
1245        return (new RemarkNode (page, start, end));
1246    }
1247
1248    /**
1249     * Create a new tag node.
1250     * Note that the attributes vector contains at least one element,
1251     * which is the tag name (standalone attribute) at position zero.
1252     * This can be used to decide which type of node to create, or
1253     * gate other processing that may be appropriate.
1254     * @param page The page the node is on.
1255     * @param start The beginning position of the tag.
1256     * @param end The ending positiong of the tag.
1257     * @param attributes The attributes contained in this tag.
1258     */
1259    public Tag createTagNode (Page page, int start, int end, Vector   attributes)
1260    {
1261        return (new TagNode (page, start, end, attributes));
1262    }
1263
1264    /**
1265     * Mainline for command line operation
1266     */
1267    public static void main (String  [] args)
1268        throws
1269            MalformedURLException  ,
1270            IOException  ,
1271            ParserException
1272    {
1273        Lexer lexer;
1274        Node node;
1275
1276        if (0 >= args.length)
1277            System.out.println ("usage: java -jar htmllexer.jar <url>");
1278        else
1279        {
1280            try
1281            {
1282                ConnectionManager manager = Page.getConnectionManager ();
1283                lexer = new Lexer (manager.openConnection (args[0]));
1284                while (null != (node = lexer.nextNode ()))
1285                    System.out.println (node.toString ());
1286            }
1287            catch (ParserException pe)
1288            {
1289                System.out.println (pe.getMessage ());
1290                if (null != pe.getThrowable ())
1291                    System.out.println (pe.getThrowable ().getMessage ());
1292            }
1293        }
1294    }
1295}
1296
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags