KitTest


1   // HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
2   // Copyright (C) August 26, 2003 Derrick Oswald
3   //
4   // Revision Control Information
5   //
6   //    $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/KitTest.java,v $
7   //    $Author: derrickoswald $
8   //    $Date: 2004/07/31 16:42:31 $
9   //    $Revision: 1.8 $
10  //
11  // This library is free software; you can redistribute it and/or
12  // modify it under the terms of the GNU Lesser General Public
13  // License as published by the Free Software Foundation; either
14  // version 2.1 of the License, or (at your option) any later version.
15  //
16  // This library is distributed in the hope that it will be useful,
17  // but WITHOUT ANY WARRANTY; without even the implied warranty of
18  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  // Lesser General Public License for more details.
20  //
21  // You should have received a copy of the GNU Lesser General Public
22  // License along with this library; if not, write to the Free Software
23  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  //
25  
26  package org.htmlparser.tests.lexerTests;
27  
28  import java.io.IOException  ;
29  import java.net.URL  ;
30  import java.util.Vector  ;
31  import javax.swing.text.BadLocationException  ;
32  import javax.swing.text.MutableAttributeSet  ;
33  import javax.swing.text.html.HTML  ;
34  import javax.swing.text.html.HTMLEditorKit  ;
35  import javax.swing.text.html.HTMLEditorKit.Parser;
36  import javax.swing.text.html.HTMLEditorKit.ParserCallback;
37  
38  import org.htmlparser.Attribute;
39  import org.htmlparser.Node;
40  import org.htmlparser.Tag;
41  import org.htmlparser.nodes.AbstractNode;
42  import org.htmlparser.lexer.Cursor;
43  import org.htmlparser.lexer.Lexer;
44  import org.htmlparser.util.ParserException;
45  import org.htmlparser.util.Translate;
46  
47  /**
48   * Compare output from javax.swing.text.html.HTMLEditorKit with Lexer.
49   * This test provides a means of comparing the lexemes from
50   * javax.swing.text.html.HTMLEditorKit.Parser class with the lexemes
51   * produced by the org.htmlparser.lexer.Lexer class.
52   * <blockquote>
53   * The differences have eluded automation since the HTMLEditorKit parser
54   * adds spurious nodes where it thinks elements need closing or it gets
55   * confused.  The intent is to eventually incorporate this into the
56   * 'fit test' and run it against lots of HTML pages, but so far you must
57   * analyse the differences by hand.
58   * </blockquote>
59   */
60  public class KitTest extends ParserCallback
61  {
62      Vector   mNodes;
63      int mIndex;
64  
65      /**
66       * Creates a new instance of KitTest
67       * @param nodes The list of lexemes from Lexer to compare with the kit lexemes.
68       */
69      public KitTest (Vector   nodes)
70      {
71          mNodes = nodes;
72          mIndex = 0;
73      }
74  
75      /**
76       * Remove whitespace from a string.
77       * @param s The string to crunch.
78       * @return The string with whitespace characters removed.
79       */
80      String   snowhite (String   s)
81      {
82          int length;
83          char ch;
84          StringBuffer   ret;
85  
86          length = s.length ();
87          ret = new StringBuffer   (length);
88          for (int i = 0; i < length; i++)
89          {
90              ch = s.charAt (i);
91              if (!Character.isWhitespace (ch) && !(160 == ch))
92                  ret.append (ch);
93          }
94  
95          return (ret.toString ());
96      }
97  
98      /**
99       * Check if two strings match.
100      * @param s1 One string.
101      * @param s2 The other string.
102      * @return <code>true</code> if the strings are equivalent ignoring whitespace.
103      */
104     boolean match (String   s1, String   s2)
105     {
106         s1 = snowhite (Translate.decode (s1));
107         s2 = snowhite (Translate.decode (s2));
108         return (s1.equalsIgnoreCase (s2));
109     }
110 
111     /**
112      * Callback for a text lexeme.
113      * @param data The text extracted from the page.
114      * @param pos The position in the page.
115      * <em>Note: This differs from the Lexer concept of position which is an
116      * absolute location in the HTML input stream. This position is the character
117      * position if the text from the page were displayed in a browser.</em>
118      */
119     public void handleText (char[] data, int pos)
120     {
121         StringBuffer   sb;
122         String   theirs;
123         Node node;
124         int match;
125         String   ours;
126 
127         sb = new StringBuffer   (data.length);
128         for (int i = 0; i < data.length; i++)
129         {
130             if (160 == data[i])
131                 sb.append ("&nbsp;");
132             else
133                 sb.append (data[i]);
134         }
135         theirs = sb.toString ();
136         match = -1;
137         for (int i = mIndex; i < Math.min (mIndex + 25, mNodes.size ()); i++)
138         {
139             node = (Node)mNodes.elementAt (i);
140             ours = node.getText ();
141             if (match (theirs, ours))
142             {
143                 match = i;
144                 break;
145             }
146         }
147         if (-1 == match)
148         {
149             node = (Node)mNodes.elementAt (mIndex);
150             ours = node.getText ();
151             System.out.println ("theirs: " + theirs);
152             Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.elementBegin ());
153             System.out.println ("ours " + cursor + ": " + ours);
154         }
155         else
156         {
157             boolean skipped = false;
158             for (int i = mIndex; i < match; i++)
159             {
160                 ours = ((Node)mNodes.elementAt (i)).toHtml ();
161                 if (0 != ours.trim ().length ())
162                 {
163                     if (!skipped)
164                         System.out.println ("skipping:");
165                     System.out.println (ours);
166                     skipped = true;
167                 }
168             }
169             if (skipped)
170             {
171                 System.out.println ("to match:");
172                 node = (Node)mNodes.elementAt (match);
173                 Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.elementBegin ());
174                 System.out.println ("@" + cursor + ": " + node.toHtml ());
175             }
176 //            System.out.println (" match: " + theirs);
177             mIndex = match + 1;
178         }
179     }
180 
181     /**
182      * Callback for a remark lexeme.
183      * @param data The text extracted from the page.
184      * @param pos The position in the page.
185      * <em>Note: This differs from the Lexer concept of position which is an
186      * absolute location in the HTML input stream. This position is the character
187      * position if the text from the page were displayed in a browser.</em>
188      */
189     public void handleComment (char[] data, int pos)
190     {
191         StringBuffer   sb;
192         String   theirs;
193         Node node;
194         int match;
195         String   ours;
196 
197         sb = new StringBuffer   (data.length);
198         sb.append (data);
199         theirs = sb.toString ();
200         match = -1;
201         for (int i = mIndex; i < Math.min (mIndex + 25, mNodes.size ()); i++)
202         {
203             node = (Node)mNodes.elementAt (i);
204             ours = node.getText ();
205             if (match (theirs, ours))
206             {
207                 match = i;
208                 break;
209             }
210         }
211         if (-1 == match)
212         {
213             node = (Node)mNodes.elementAt (mIndex);
214             ours = node.getText ();
215             System.out.println ("theirs: " + theirs);
216             Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.elementBegin ());
217             System.out.println ("ours " + cursor + ": " + ours);
218         }
219         else
220         {
221             boolean skipped = false;
222             for (int i = mIndex; i < match; i++)
223             {
224                 ours = ((Node)mNodes.elementAt (i)).toHtml ();
225                 if (0 != ours.trim ().length ())
226                 {
227                     if (!skipped)
228                         System.out.println ("skipping:");
229                     System.out.println (ours);
230                     skipped = true;
231                 }
232             }
233             if (skipped)
234             {
235                 System.out.println ("to match:");
236                 node = (Node)mNodes.elementAt (match);
237                 Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.elementBegin ());
238                 System.out.println ("@" + cursor + ": " + node.toHtml ());
239             }
240 //            System.out.println (" match: " + theirs);
241             mIndex = match + 1;
242         }
243     }
244 
245     /**
246      * Callback for a start tag lexeme.
247      * @param t The tag extracted from the page.
248      * @param a The attributes parsed out of the tag.
249      * @param pos The position in the page.
250      * <em>Note: This differs from the Lexer concept of position which is an
251      * absolute location in the HTML input stream. This position is the character
252      * position if the text from the page were displayed in a browser.</em>
253      */
254     public void handleStartTag (HTML.Tag   t, MutableAttributeSet   a, int pos)
255     {
256         String   theirs;
257         Node node;
258         int match;
259         String   ours;
260 
261         theirs = t.toString ();
262         match = -1;
263         for (int i = mIndex; i < Math.min (mIndex + 25, mNodes.size ()); i++)
264         {
265             node = (Node)mNodes.elementAt (i);
266             if (node instanceof Tag)
267             {
268                 ours = ((Attribute)(((Tag)node).getAttributesEx ().elementAt (0))).getName ();
269                 if (match (theirs, ours))
270                 {
271                     match = i;
272                     break;
273                 }
274             }
275         }
276         if (-1 == match)
277         {
278             node = (Node)mNodes.elementAt (mIndex);
279             ours = node.getText ();
280             System.out.println ("theirs: " + theirs);
281             Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.elementBegin ());
282             System.out.println ("ours " + cursor + ": " + ours);
283         }
284         else
285         {
286             boolean skipped = false;
287             for (int i = mIndex; i < match; i++)
288             {
289                 ours = ((Node)mNodes.elementAt (i)).toHtml ();
290                 if (0 != ours.trim ().length ())
291                 {
292                     if (!skipped)
293                         System.out.println ("skipping:");
294                     System.out.println (ours);
295                     skipped = true;
296                 }
297             }
298             if (skipped)
299             {
300                 System.out.println ("to match:");
301                 node = (Node)mNodes.elementAt (match);
302                 Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.elementBegin ());
303                 System.out.println ("@" + cursor + ": " + node.toHtml ());
304             }
305 //            System.out.println (" match: " + theirs);
306             mIndex = match + 1;
307         }
308     }
309 
310     /**
311      * Callback for an end tag lexeme.
312      * @param t The tag extracted from the page.
313      * @param pos The position in the page.
314      * <em>Note: This differs from the Lexer concept of position which is an
315      * absolute location in the HTML input stream. This position is the character
316      * position if the text from the page were displayed in a browser.</em>
317      */
318     public void handleEndTag (HTML.Tag   t, int pos)
319     {
320         String   theirs;
321         Node node;
322         int match;
323         String   ours;
324 
325         theirs = t.toString ();
326         match = -1;
327         for (int i = mIndex; i < Math.min (mIndex + 25, mNodes.size ()); i++)
328         {
329             node = (Node)mNodes.elementAt (i);
330             if (node instanceof Tag)
331             {
332                 ours = ((Attribute)(((Tag)node).getAttributesEx ().elementAt (0))).getName ().substring (1);
333                 if (match (theirs, ours))
334                 {
335                     match = i;
336                     break;
337                 }
338             }
339         }
340         if (-1 == match)
341         {
342             node = (Node)mNodes.elementAt (mIndex);
343             ours = node.getText ();
344             System.out.println ("theirs: " + theirs);
345             Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.elementBegin ());
346             System.out.println ("ours " + cursor + ": " + ours);
347         }
348         else
349         {
350             boolean skipped = false;
351             for (int i = mIndex; i < match; i++)
352             {
353                 ours = ((Node)mNodes.elementAt (i)).toHtml ();
354                 if (0 != ours.trim ().length ())
355                 {
356                     if (!skipped)
357                         System.out.println ("skipping:");
358                     System.out.println (ours);
359                     skipped = true;
360                 }
361             }
362             if (skipped)
363             {
364                 System.out.println ("to match:");
365                 node = (Node)mNodes.elementAt (match);
366                 Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.elementBegin ());
367                 System.out.println ("@" + cursor + ": " + node.toHtml ());
368             }
369 //            System.out.println (" match: " + theirs);
370             mIndex = match + 1;
371         }
372     }
373 
374     /**
375      * Callback for a non-composite tag.
376      * @param t The tag extracted from the page.
377      * @param a The attributes parsed out of the tag.
378      * @param pos The position in the page.
379      * <em>Note: This differs from the Lexer concept of position which is an
380      * absolute location in the HTML input stream. This position is the character
381      * position if the text from the page were displayed in a browser.</em>
382      */
383     public void handleSimpleTag (HTML.Tag   t, MutableAttributeSet   a, int pos)
384     {
385         String   theirs;
386         Node node;
387         int match;
388         String   ours;
389 
390         theirs = t.toString ();
391         match = -1;
392         for (int i = mIndex; i < Math.min (mIndex + 25, mNodes.size ()); i++)
393         {
394             node = (Node)mNodes.elementAt (i);
395             if (node instanceof Tag)
396             {
397                 ours = ((Attribute)(((Tag)node).getAttributesEx ().elementAt (0))).getName ();
398                 if (match (theirs, ours))
399                 {
400                     match = i;
401                     break;
402                 }
403                 if (match (theirs, ours))
404                 {
405                     match = i;
406                     break;
407                 }
408             }
409         }
410         if (-1 == match)
411         {
412             node = (Node)mNodes.elementAt (mIndex);
413             ours = node.getText ();
414             System.out.println ("theirs: " + theirs);
415             Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.elementBegin ());
416             System.out.println ("ours " + cursor + ": " + ours);
417         }
418         else
419         {
420             boolean skipped = false;
421             for (int i = mIndex; i < match; i++)
422             {
423                 ours = ((Node)mNodes.elementAt (i)).toHtml ();
424                 if (0 != ours.trim ().length ())
425                 {
426                     if (!skipped)
427                         System.out.println ("skipping:");
428                     System.out.println (ours);
429                     skipped = true;
430                 }
431             }
432             if (skipped)
433             {
434                 System.out.println ("to match:");
435                 node = (Node)mNodes.elementAt (match);
436                 Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.elementBegin ());
437                 System.out.println ("@" + cursor + ": " + node.toHtml ());
438             }
439 //            System.out.println (" match: " + theirs);
440             mIndex = match + 1;
441         }
442     }
443 
444 
445     /**
446      * Callback for an error condition.
447      * @param errorMsg The error condition as a text message.
448      * @param pos The position in the page.
449      * <em>Note: This differs from the Lexer concept of position which is an
450      * absolute location in the HTML input stream. This position is the character
451      * position if the text from the page were displayed in a browser.</em>
452      */
453     public void handleError (String   errorMsg, int pos)
454     {
455         System.out.println ("******* error @" + pos + " ******** " + errorMsg);
456     }
457 
458     /**
459      * Callback for flushing the state, just prior to shutting down the parser.
460      */
461     public void flush () throws BadLocationException  
462     {
463     }
464 
465     /**
466      * This is invoked after the stream has been parsed, but before
467      * <code>flush</code>. <code>eol</code> will be one of \n, \r
468      * or \r\n, which ever is encountered the most in parsing the
469      * stream.
470      *
471      * @since 1.3
472      */
473     public void handleEndOfLineString (String   eol)
474     {
475     }
476 
477 //    /**
478 //     * Get the document data from the URL.
479 //     * @param rd The reader to read bytes from.
480 //     * @return The parsed HTML document.
481 //     */
482 //    protected static Element[] getData (Reader rd) throws IOException
483 //    {
484 //        EditorKit kit;
485 //        Document doc;
486 //        Element[] ret;
487 //
488 //        ret = null;
489 //
490 //        // need this because HTMLEditorKit is not thread safe apparently
491 //        synchronized (Boolean.TRUE)
492 //        {
493 //            kit = new HTMLEditorKit ();
494 //            doc = kit.createDefaultDocument ();
495 //            // the Document class does not yet handle charset's properly
496 //            doc.putProperty ("IgnoreCharsetDirective", Boolean.TRUE);
497 //
498 //            try
499 //            {
500 //                // parse the HTML
501 //                kit.read (rd, doc, 0);
502 //            }
503 //            catch (BadLocationException ble)
504 //            {
505 //                throw new IOException ("parse error " + ble.getMessage ());
506 //            }
507 //
508 //            ret = doc.getRootElements ();
509 //        }
510 //
511 //        return (ret);
512 //    }
513 
514 //    public static void scanElements (Element element) throws BadLocationException
515 //    {
516 //        int start;
517 //        int end;
518 //        String string;
519 //        ElementIterator it;
520 //        Element child;
521 //
522 //        if (element.isLeaf ())
523 //        {
524 //            start = element.getStartOffset ();
525 //            end = element.getEndOffset ();
526 //            string = element.getDocument ().getText (start, end - start);
527 //            System.out.println (string);
528 //        }
529 //        else
530 //            // iterate through the elements of the element
531 //            for (int i = 0; i < element.getElementCount (); i++)
532 //            {
533 //                child = element.getElement (i);
534 //                scanElements (child);
535 //            }
536 //    }
537 
538     /**
539      * Subclass of HTMLEditorKit to expose getParser().
540      */
541     class MyKit extends HTMLEditorKit  
542     {
543         public MyKit ()
544         {
545         }
546 
547         public HTMLEditorKit.Parser   getParser ()
548         {
549             return (super.getParser ());
550         }
551     }
552 
553     /**
554      * Return a editor kit.
555      */
556     public MyKit getKit ()
557     {
558         return (new MyKit ());
559     }
560 
561     /**
562      * Manline for the test.
563      * @param args the command line arguments.
564      * If present the first array element is used as a URL to parse.
565      */
566     public static void main (String  [] args) throws ParserException, IOException  
567     {
568         String   link;
569         Lexer lexer;
570         Node node;
571         Vector   nodes;
572         KitTest test;
573         MyKit kit;
574         Parser parser;
575 
576 
577         if (0 == args.length)
578             link = "http://sourceforge.net/projects/htmlparser";
579         else
580             link = args[0];
581         // pass through it once to read the entire page
582         URL   url = new URL   (link);
583         lexer = new Lexer (url.openConnection ());
584         nodes = new Vector   ();
585         while (null != (node = lexer.nextNode ()))
586             nodes.addElement (node);
587 
588         // reset the reader
589         lexer.getPage ().getSource ().reset ();
590         test = new KitTest (nodes);
591         kit = test.getKit ();
592         parser = kit.getParser ();
593         parser.parse (lexer.getPage ().getSource (), test, true);
594     }
595 }
596 
597 /*
598  * Revision Control Modification History
599  *
600  * $Log: KitTest.java,v $
601  * Revision 1.8  2004/07/31 16:42:31  derrickoswald
602  * Remove unused variables and other fixes exposed by turning on compiler warnings.
603  *
604  * Revision 1.7  2004/05/24 16:18:31  derrickoswald
605  * Part three of a multiphase refactoring.
606  * The three node types are now fronted by interfaces (program to the interface paradigm)
607  * with concrete implementations in the new htmlparser.nodes package. Classes from the
608  * lexer.nodes package are moved to this package, and obvious references to the concrete
609  * classes that got broken by this have been changed to use the interfaces where possible.
610  *
611  * Revision 1.6  2004/01/14 02:53:47  derrickoswald
612  * *** empty log message ***
613  *
614  * Revision 1.5  2003/10/20 01:28:03  derrickoswald
615  * Removed lexer level AbstractNode.
616  * Removed data package from parser level tags.
617  * Separated tag creation from recursion in NodeFactory interface.
618  *
619  * Revision 1.4  2003/09/10 03:38:24  derrickoswald
620  * Add style checking target to ant build script:
621  *     ant checkstyle
622  * It uses a jar from http://checkstyle.sourceforge.net which is dropped in the lib directory.
623  * The rules are in the file htmlparser_checks.xml in the src directory.
624  *
625  * Added lexerapplications package with Tabby as the first app. It performs whitespace manipulation
626  * on source files to follow the style rules. This reduced the number of style violations to roughly 14,000.
627  *
628  * There are a few issues with the style checker that need to be resolved before it should be taken too seriously.
629  * For example:
630  * It thinks all method arguments should be final, even if they are modified by the code (which the compiler frowns on).
631  * It complains about long lines, even when there is no possibility of wrapping the line, i.e. a URL in a comment
632  * that's more than 80 characters long.
633  * It considers all naked integers as 'magic numbers', even when they are obvious, i.e. the 4 corners of a box.
634  * It complains about whitespace following braces, even in array initializers, i.e. X[][] = { {a, b} { } }
635  *
636  * But it points out some really interesting things, even if you don't agree with the style guidelines,
637  * so it's worth a look.
638  *
639  * Revision 1.3  2003/08/27 02:40:24  derrickoswald
640  * Testing cvs keyword substitution.
641  *
642  *
643  */
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags