KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > htmlparser > tests > lexerTests > KitTest


// HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
// Copyright (C) August 26, 2003 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/KitTest.java,v $
// $Author: derrickoswald $
// $Date: 2004/07/31 16:42:31 $
// $Revision: 1.8 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.tests.lexerTests;

import java.io.IOException;
import java.net.URL;
import java.util.Vector;
import javax.swing.text.BadLocationException;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.HTMLEditorKit.Parser;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;

import org.htmlparser.Attribute;
import org.htmlparser.Node;
import org.htmlparser.Tag;
import org.htmlparser.nodes.AbstractNode;
import org.htmlparser.lexer.Cursor;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.Translate;

47 /**
48  * Compare output from javax.swing.text.html.HTMLEditorKit with Lexer.
49  * This test provides a means of comparing the lexemes from
50  * javax.swing.text.html.HTMLEditorKit.Parser class with the lexemes
51  * produced by the org.htmlparser.lexer.Lexer class.
52  * <blockquote>
53  * The differences have eluded automation since the HTMLEditorKit parser
54  * adds spurious nodes where it thinks elements need closing or it gets
55  * confused. The intent is to eventually incorporate this into the
56  * 'fit test' and run it against lots of HTML pages, but so far you must
57  * analyse the differences by hand.
58  * </blockquote>
59  */

60 public class KitTest extends ParserCallback
61 {
62     Vector JavaDoc mNodes;
63     int mIndex;
64
65     /**
66      * Creates a new instance of KitTest
67      * @param nodes The list of lexemes from Lexer to compare with the kit lexemes.
68      */

69     public KitTest (Vector JavaDoc nodes)
70     {
71         mNodes = nodes;
72         mIndex = 0;
73     }
74
75     /**
76      * Remove whitespace from a string.
77      * @param s The string to crunch.
78      * @return The string with whitespace characters removed.
79      */

80     String JavaDoc snowhite (String JavaDoc s)
81     {
82         int length;
83         char ch;
84         StringBuffer JavaDoc ret;
85
86         length = s.length ();
87         ret = new StringBuffer JavaDoc (length);
88         for (int i = 0; i < length; i++)
89         {
90             ch = s.charAt (i);
91             if (!Character.isWhitespace (ch) && !(160 == ch))
92                 ret.append (ch);
93         }
94
95         return (ret.toString ());
96     }
97
98     /**
99      * Check if two strings match.
100      * @param s1 One string.
101      * @param s2 The other string.
102      * @return <code>true</code> if the strings are equivalent ignoring whitespace.
103      */

104     boolean match (String JavaDoc s1, String JavaDoc s2)
105     {
106         s1 = snowhite (Translate.decode (s1));
107         s2 = snowhite (Translate.decode (s2));
108         return (s1.equalsIgnoreCase (s2));
109     }
110
111     /**
112      * Callback for a text lexeme.
113      * @param data The text extracted from the page.
114      * @param pos The position in the page.
115      * <em>Note: This differs from the Lexer concept of position which is an
116      * absolute location in the HTML input stream. This position is the character
117      * position if the text from the page were displayed in a browser.</em>
118      */

119     public void handleText (char[] data, int pos)
120     {
121         StringBuffer JavaDoc sb;
122         String JavaDoc theirs;
123         Node node;
124         int match;
125         String JavaDoc ours;
126
127         sb = new StringBuffer JavaDoc (data.length);
128         for (int i = 0; i < data.length; i++)
129         {
130             if (160 == data[i])
131                 sb.append ("&nbsp;");
132             else
133                 sb.append (data[i]);
134         }
135         theirs = sb.toString ();
136         match = -1;
137         for (int i = mIndex; i < Math.min (mIndex + 25, mNodes.size ()); i++)
138         {
139             node = (Node)mNodes.elementAt (i);
140             ours = node.getText ();
141             if (match (theirs, ours))
142             {
143                 match = i;
144                 break;
145             }
146         }
147         if (-1 == match)
148         {
149             node = (Node)mNodes.elementAt (mIndex);
150             ours = node.getText ();
151             System.out.println ("theirs: " + theirs);
152             Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.elementBegin ());
153             System.out.println ("ours " + cursor + ": " + ours);
154         }
155         else
156         {
157             boolean skipped = false;
158             for (int i = mIndex; i < match; i++)
159             {
160                 ours = ((Node)mNodes.elementAt (i)).toHtml ();
161                 if (0 != ours.trim ().length ())
162                 {
163                     if (!skipped)
164                         System.out.println ("skipping:");
165                     System.out.println (ours);
166                     skipped = true;
167                 }
168             }
169             if (skipped)
170             {
171                 System.out.println ("to match:");
172                 node = (Node)mNodes.elementAt (match);
173                 Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.elementBegin ());
174                 System.out.println ("@" + cursor + ": " + node.toHtml ());
175             }
176 // System.out.println (" match: " + theirs);
177
mIndex = match + 1;
178         }
179     }
180
181     /**
182      * Callback for a remark lexeme.
183      * @param data The text extracted from the page.
184      * @param pos The position in the page.
185      * <em>Note: This differs from the Lexer concept of position which is an
186      * absolute location in the HTML input stream. This position is the character
187      * position if the text from the page were displayed in a browser.</em>
188      */

189     public void handleComment (char[] data, int pos)
190     {
191         StringBuffer JavaDoc sb;
192         String JavaDoc theirs;
193         Node node;
194         int match;
195         String JavaDoc ours;
196
197         sb = new StringBuffer JavaDoc (data.length);
198         sb.append (data);
199         theirs = sb.toString ();
200         match = -1;
201         for (int i = mIndex; i < Math.min (mIndex + 25, mNodes.size ()); i++)
202         {
203             node = (Node)mNodes.elementAt (i);
204             ours = node.getText ();
205             if (match (theirs, ours))
206             {
207                 match = i;
208                 break;
209             }
210         }
211         if (-1 == match)
212         {
213             node = (Node)mNodes.elementAt (mIndex);
214             ours = node.getText ();
215             System.out.println ("theirs: " + theirs);
216             Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.elementBegin ());
217             System.out.println ("ours " + cursor + ": " + ours);
218         }
219         else
220         {
221             boolean skipped = false;
222             for (int i = mIndex; i < match; i++)
223             {
224                 ours = ((Node)mNodes.elementAt (i)).toHtml ();
225                 if (0 != ours.trim ().length ())
226                 {
227                     if (!skipped)
228                         System.out.println ("skipping:");
229                     System.out.println (ours);
230                     skipped = true;
231                 }
232             }
233             if (skipped)
234             {
235                 System.out.println ("to match:");
236                 node = (Node)mNodes.elementAt (match);
237                 Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.elementBegin ());
238                 System.out.println ("@" + cursor + ": " + node.toHtml ());
239             }
240 // System.out.println (" match: " + theirs);
241
mIndex = match + 1;
242         }
243     }
244
245     /**
246      * Callback for a start tag lexeme.
247      * @param t The tag extracted from the page.
248      * @param a The attributes parsed out of the tag.
249      * @param pos The position in the page.
250      * <em>Note: This differs from the Lexer concept of position which is an
251      * absolute location in the HTML input stream. This position is the character
252      * position if the text from the page were displayed in a browser.</em>
253      */

254     public void handleStartTag (HTML.Tag JavaDoc t, MutableAttributeSet JavaDoc a, int pos)
255     {
256         String JavaDoc theirs;
257         Node node;
258         int match;
259         String JavaDoc ours;
260
261         theirs = t.toString ();
262         match = -1;
263         for (int i = mIndex; i < Math.min (mIndex + 25, mNodes.size ()); i++)
264         {
265             node = (Node)mNodes.elementAt (i);
266             if (node instanceof Tag)
267             {
268                 ours = ((Attribute)(((Tag)node).getAttributesEx ().elementAt (0))).getName ();
269                 if (match (theirs, ours))
270                 {
271                     match = i;
272                     break;
273                 }
274             }
275         }
276         if (-1 == match)
277         {
278             node = (Node)mNodes.elementAt (mIndex);
279             ours = node.getText ();
280             System.out.println ("theirs: " + theirs);
281             Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.elementBegin ());
282             System.out.println ("ours " + cursor + ": " + ours);
283         }
284         else
285         {
286             boolean skipped = false;
287             for (int i = mIndex; i < match; i++)
288             {
289                 ours = ((Node)mNodes.elementAt (i)).toHtml ();
290                 if (0 != ours.trim ().length ())
291                 {
292                     if (!skipped)
293                         System.out.println ("skipping:");
294                     System.out.println (ours);
295                     skipped = true;
296                 }
297             }
298             if (skipped)
299             {
300                 System.out.println ("to match:");
301                 node = (Node)mNodes.elementAt (match);
302                 Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.elementBegin ());
303                 System.out.println ("@" + cursor + ": " + node.toHtml ());
304             }
305 // System.out.println (" match: " + theirs);
306
mIndex = match + 1;
307         }
308     }
309
310     /**
311      * Callback for an end tag lexeme.
312      * @param t The tag extracted from the page.
313      * @param pos The position in the page.
314      * <em>Note: This differs from the Lexer concept of position which is an
315      * absolute location in the HTML input stream. This position is the character
316      * position if the text from the page were displayed in a browser.</em>
317      */

318     public void handleEndTag (HTML.Tag JavaDoc t, int pos)
319     {
320         String JavaDoc theirs;
321         Node node;
322         int match;
323         String JavaDoc ours;
324
325         theirs = t.toString ();
326         match = -1;
327         for (int i = mIndex; i < Math.min (mIndex + 25, mNodes.size ()); i++)
328         {
329             node = (Node)mNodes.elementAt (i);
330             if (node instanceof Tag)
331             {
332                 ours = ((Attribute)(((Tag)node).getAttributesEx ().elementAt (0))).getName ().substring (1);
333                 if (match (theirs, ours))
334                 {
335                     match = i;
336                     break;
337                 }
338             }
339         }
340         if (-1 == match)
341         {
342             node = (Node)mNodes.elementAt (mIndex);
343             ours = node.getText ();
344             System.out.println ("theirs: " + theirs);
345             Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.elementBegin ());
346             System.out.println ("ours " + cursor + ": " + ours);
347         }
348         else
349         {
350             boolean skipped = false;
351             for (int i = mIndex; i < match; i++)
352             {
353                 ours = ((Node)mNodes.elementAt (i)).toHtml ();
354                 if (0 != ours.trim ().length ())
355                 {
356                     if (!skipped)
357                         System.out.println ("skipping:");
358                     System.out.println (ours);
359                     skipped = true;
360                 }
361             }
362             if (skipped)
363             {
364                 System.out.println ("to match:");
365                 node = (Node)mNodes.elementAt (match);
366                 Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.elementBegin ());
367                 System.out.println ("@" + cursor + ": " + node.toHtml ());
368             }
369 // System.out.println (" match: " + theirs);
370
mIndex = match + 1;
371         }
372     }
373
374     /**
375      * Callback for a non-composite tag.
376      * @param t The tag extracted from the page.
377      * @param a The attributes parsed out of the tag.
378      * @param pos The position in the page.
379      * <em>Note: This differs from the Lexer concept of position which is an
380      * absolute location in the HTML input stream. This position is the character
381      * position if the text from the page were displayed in a browser.</em>
382      */

383     public void handleSimpleTag (HTML.Tag JavaDoc t, MutableAttributeSet JavaDoc a, int pos)
384     {
385         String JavaDoc theirs;
386         Node node;
387         int match;
388         String JavaDoc ours;
389
390         theirs = t.toString ();
391         match = -1;
392         for (int i = mIndex; i < Math.min (mIndex + 25, mNodes.size ()); i++)
393         {
394             node = (Node)mNodes.elementAt (i);
395             if (node instanceof Tag)
396             {
397                 ours = ((Attribute)(((Tag)node).getAttributesEx ().elementAt (0))).getName ();
398                 if (match (theirs, ours))
399                 {
400                     match = i;
401                     break;
402                 }
403                 if (match (theirs, ours))
404                 {
405                     match = i;
406                     break;
407                 }
408             }
409         }
410         if (-1 == match)
411         {
412             node = (Node)mNodes.elementAt (mIndex);
413             ours = node.getText ();
414             System.out.println ("theirs: " + theirs);
415             Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.elementBegin ());
416             System.out.println ("ours " + cursor + ": " + ours);
417         }
418         else
419         {
420             boolean skipped = false;
421             for (int i = mIndex; i < match; i++)
422             {
423                 ours = ((Node)mNodes.elementAt (i)).toHtml ();
424                 if (0 != ours.trim ().length ())
425                 {
426                     if (!skipped)
427                         System.out.println ("skipping:");
428                     System.out.println (ours);
429                     skipped = true;
430                 }
431             }
432             if (skipped)
433             {
434                 System.out.println ("to match:");
435                 node = (Node)mNodes.elementAt (match);
436                 Cursor cursor = new Cursor (((AbstractNode)node).getPage (), node.elementBegin ());
437                 System.out.println ("@" + cursor + ": " + node.toHtml ());
438             }
439 // System.out.println (" match: " + theirs);
440
mIndex = match + 1;
441         }
442     }
443
444
445     /**
446      * Callback for an error condition.
447      * @param errorMsg The error condition as a text message.
448      * @param pos The position in the page.
449      * <em>Note: This differs from the Lexer concept of position which is an
450      * absolute location in the HTML input stream. This position is the character
451      * position if the text from the page were displayed in a browser.</em>
452      */

453     public void handleError (String JavaDoc errorMsg, int pos)
454     {
455         System.out.println ("******* error @" + pos + " ******** " + errorMsg);
456     }
457
458     /**
459      * Callback for flushing the state, just prior to shutting down the parser.
460      */

461     public void flush () throws BadLocationException JavaDoc
462     {
463     }
464
465     /**
466      * This is invoked after the stream has been parsed, but before
467      * <code>flush</code>. <code>eol</code> will be one of \n, \r
468      * or \r\n, which ever is encountered the most in parsing the
469      * stream.
470      *
471      * @since 1.3
472      */

473     public void handleEndOfLineString (String JavaDoc eol)
474     {
475     }
476
477 // /**
478
// * Get the document data from the URL.
479
// * @param rd The reader to read bytes from.
480
// * @return The parsed HTML document.
481
// */
482
// protected static Element[] getData (Reader rd) throws IOException
483
// {
484
// EditorKit kit;
485
// Document doc;
486
// Element[] ret;
487
//
488
// ret = null;
489
//
490
// // need this because HTMLEditorKit is not thread safe apparently
491
// synchronized (Boolean.TRUE)
492
// {
493
// kit = new HTMLEditorKit ();
494
// doc = kit.createDefaultDocument ();
495
// // the Document class does not yet handle charset's properly
496
// doc.putProperty ("IgnoreCharsetDirective", Boolean.TRUE);
497
//
498
// try
499
// {
500
// // parse the HTML
501
// kit.read (rd, doc, 0);
502
// }
503
// catch (BadLocationException ble)
504
// {
505
// throw new IOException ("parse error " + ble.getMessage ());
506
// }
507
//
508
// ret = doc.getRootElements ();
509
// }
510
//
511
// return (ret);
512
// }
513

514 // public static void scanElements (Element element) throws BadLocationException
515
// {
516
// int start;
517
// int end;
518
// String string;
519
// ElementIterator it;
520
// Element child;
521
//
522
// if (element.isLeaf ())
523
// {
524
// start = element.getStartOffset ();
525
// end = element.getEndOffset ();
526
// string = element.getDocument ().getText (start, end - start);
527
// System.out.println (string);
528
// }
529
// else
530
// // iterate through the elements of the element
531
// for (int i = 0; i < element.getElementCount (); i++)
532
// {
533
// child = element.getElement (i);
534
// scanElements (child);
535
// }
536
// }
537

538     /**
539      * Subclass of HTMLEditorKit to expose getParser().
540      */

541     class MyKit extends HTMLEditorKit JavaDoc
542     {
543         public MyKit ()
544         {
545         }
546
547         public HTMLEditorKit.Parser JavaDoc getParser ()
548         {
549             return (super.getParser ());
550         }
551     }
552
553     /**
554      * Return a editor kit.
555      */

556     public MyKit getKit ()
557     {
558         return (new MyKit ());
559     }
560
561     /**
562      * Manline for the test.
563      * @param args the command line arguments.
564      * If present the first array element is used as a URL to parse.
565      */

566     public static void main (String JavaDoc[] args) throws ParserException, IOException JavaDoc
567     {
568         String JavaDoc link;
569         Lexer lexer;
570         Node node;
571         Vector JavaDoc nodes;
572         KitTest test;
573         MyKit kit;
574         Parser parser;
575
576
577         if (0 == args.length)
578             link = "http://sourceforge.net/projects/htmlparser";
579         else
580             link = args[0];
581         // pass through it once to read the entire page
582
URL JavaDoc url = new URL JavaDoc (link);
583         lexer = new Lexer (url.openConnection ());
584         nodes = new Vector JavaDoc ();
585         while (null != (node = lexer.nextNode ()))
586             nodes.addElement (node);
587
588         // reset the reader
589
lexer.getPage ().getSource ().reset ();
590         test = new KitTest (nodes);
591         kit = test.getKit ();
592         parser = kit.getParser ();
593         parser.parse (lexer.getPage ().getSource (), test, true);
594     }
595 }

/*
 * Revision Control Modification History
 *
 * $Log: KitTest.java,v $
 * Revision 1.8 2004/07/31 16:42:31 derrickoswald
 * Remove unused variables and other fixes exposed by turning on compiler warnings.
 *
 * Revision 1.7 2004/05/24 16:18:31 derrickoswald
 * Part three of a multiphase refactoring.
 * The three node types are now fronted by interfaces (program to the interface paradigm)
 * with concrete implementations in the new htmlparser.nodes package. Classes from the
 * lexer.nodes package are moved to this package, and obvious references to the concrete
 * classes that got broken by this have been changed to use the interfaces where possible.
 *
 * Revision 1.6 2004/01/14 02:53:47 derrickoswald
 * *** empty log message ***
 *
 * Revision 1.5 2003/10/20 01:28:03 derrickoswald
 * Removed lexer level AbstractNode.
 * Removed data package from parser level tags.
 * Separated tag creation from recursion in NodeFactory interface.
 *
 * Revision 1.4 2003/09/10 03:38:24 derrickoswald
 * Add style checking target to ant build script:
 * ant checkstyle
 * It uses a jar from http://checkstyle.sourceforge.net which is dropped in the lib directory.
 * The rules are in the file htmlparser_checks.xml in the src directory.
 *
 * Added lexerapplications package with Tabby as the first app. It performs whitespace manipulation
 * on source files to follow the style rules. This reduced the number of style violations to roughly 14,000.
 *
 * There are a few issues with the style checker that need to be resolved before it should be taken too seriously.
 * For example:
 * It thinks all method arguments should be final, even if they are modified by the code (which the compiler frowns on).
 * It complains about long lines, even when there is no possibility of wrapping the line, i.e. a URL in a comment
 * that's more than 80 characters long.
 * It considers all naked integers as 'magic numbers', even when they are obvious, i.e. the 4 corners of a box.
 * It complains about whitespace following braces, even in array initializers, i.e. X[][] = { {a, b} { } }
 *
 * But it points out some really interesting things, even if you don't agree with the style guidelines,
 * so it's worth a look.
 *
 * Revision 1.3 2003/08/27 02:40:24 derrickoswald
 * Testing cvs keyword substitution.
 *
 *
 */
Popular Tags