KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > de > susebox > jtopas > TestStandardTokenizer


/*
 * TestStandardTokenizer.java: JUnit test for the StandardTokenizer
 *
 * Copyright (C) 2002 Heiko Blau
 *
 * This file belongs to the JTopas test suite.
 * The JTopas test suite is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by the
 * Free Software Foundation; either version 2.1 of the License, or (at your option)
 * any later version.
 *
 * This software is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License along
 * with the JTopas test suite. If not, write to the
 *
 * Free Software Foundation, Inc.
 * 59 Temple Place, Suite 330,
 * Boston, MA 02111-1307
 * USA
 *
 * or check the Internet: http://www.fsf.org
 *
 * The JTopas test suite uses the test framework JUnit by Kent Beck and Erich Gamma.
 * You should have received a copy of their JUnit licence agreement along with
 * the JTopas test suite.
 *
 * We do NOT provide the JUnit archive junit.jar necessary to compile and run
 * our tests, since we assume that you either have it already or would like
 * to get the current release yourself.
 * Please visit either:
 * http://sourceforge.net/projects/junit
 * or
 * http://junit.org
 * to obtain JUnit.
 *
 * Contact:
 * email: heiko@susebox.de
 */

43
package de.susebox.jtopas;

//-----------------------------------------------------------------------------
// Imports
//
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.Properties;
import java.util.Vector;

import junit.framework.Assert;
import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;

import de.susebox.java.lang.ExtRuntimeException;

import de.susebox.TestUtilities;
65
//-----------------------------------------------------------------------------
// Class TestStandardTokenizer
//

70 /**<p>
71  * This test suite works with a test configuration file. This file contains some
72  * sets of properties, each set for one or more different test runs.
73  *</p><p>
74  * The properties are defined as class constants. In the configuration file, a
75  * property consists of the property name and a number identifying the property
76  * set.
77  *</p>
78  *
79  * @see Tokenizer
80  * @see AbstractTokenizer
81  * @see java.io.InputStreamReader
82  * @author Heiko Blau
83  */

84 public class TestStandardTokenizer extends TestCase {
85   
86   //---------------------------------------------------------------------------
87
// properties
88
//
89

90   /**
91    * The name of the test configuration file. This file will be read by
92    * {@link java.lang.Class#getResourceAsStream}.
93    */

94   public static final String JavaDoc CONFIG_FILE = "TestStandardTokenizer.conf";
95   
96   /**
97    * Property for the tests {@link #testLinkParsing} and {@link #testContentsParsing}
98    */

99   public static final String JavaDoc PROP_PATH = "Path";
100   
101   /**
102    * Property for the test {@link #testLineCounting}.
103    */

104   public static final String JavaDoc PROP_COUNTLINES_PATH = "CountLinesPath";
105   
106   
107   //---------------------------------------------------------------------------
108
// main method
109
//
110

111   /**
112    * call this method to invoke the tests
113    */

114   public static void main(String JavaDoc[] args) {
115     String JavaDoc[] tests = { TestStandardTokenizer.class.getName() };
116
117     TestUtilities.run(tests, args);
118   }
119   
120
121   //---------------------------------------------------------------------------
122
// suite method
123
//
124

125   /**
126    * Implementation of the JUnit method <code>suite</code>. For each set of test
127    * properties one or more tests are instantiated.
128    *
129    * @return a test suite
130    */

131   public static Test suite() {
132     TestSuite suite = new TestSuite(TestStandardTokenizer.class.getName());
133     Properties JavaDoc props = new Properties JavaDoc();
134     int count = 1;
135     String JavaDoc path;
136     URL JavaDoc url;
137     
138     try {
139       props.load(TestStandardTokenizer.class.getResourceAsStream(CONFIG_FILE));
140     } catch (Exception JavaDoc ex) {
141       throw new ExtRuntimeException(ex);
142     }
143     
144     while ((path = props.getProperty(PROP_PATH + count)) != null) {
145       if ((url = TestStandardTokenizer.class.getResource(path)) != null) {
146         path = url.getFile();
147       }
148       suite.addTest(new TestStandardTokenizer("testLinkParsing", path));
149       suite.addTest(new TestStandardTokenizer("testContentsParsing", path));
150       suite.addTest(new TestStandardTokenizer("testContentsFormatting", path));
151       count++;
152     }
153     count = 1;
154     while ((path = props.getProperty(PROP_COUNTLINES_PATH + count)) != null) {
155       if ((url = TestStandardTokenizer.class.getResource(path)) != null) {
156         path = url.getFile();
157       }
158       suite.addTest(new TestStandardTokenizer("testLineCounting", path));
159       count++;
160     }
161     return suite;
162   }
163   
164   
165   //---------------------------------------------------------------------------
166
// Constructor
167
//
168

169   /**
170    * Default constructor. Standard input {@link java.lang.System#in} is used
171    * to construct the input stream reader.
172    */

173   public TestStandardTokenizer(String JavaDoc test, String JavaDoc path) {
174     super(test);
175     _path = path;
176   }
177
178   
179   //---------------------------------------------------------------------------
180
// Fixture setup and release
181
//
182

183     /**
184      * Sets up the fixture, for example, open a network connection.
185      * This method is called before a test is executed.
186      */

187   protected void setUp() throws Exception JavaDoc {
188     InputStream JavaDoc stream = new FileInputStream JavaDoc(_path);
189     
190     _reader = new InputStreamReader JavaDoc(stream);
191     }
192
193   
194     /**
195      * Tears down the fixture, for example, close a network connection.
196      * This method is called after a test is executed.
197      */

198     protected void tearDown() throws Exception JavaDoc {
199     _reader.close();
200     }
201   
202   //---------------------------------------------------------------------------
203
// test cases
204
//
205

206   public void testLinkParsing() throws Throwable JavaDoc {
207     long start = System.currentTimeMillis();
208     TokenizerProperties props = new StandardTokenizerProperties();
209     Tokenizer tokenizer = new StandardTokenizer(props);
210     Vector JavaDoc links = new Vector JavaDoc();
211     Token token;
212
213     try {
214       props.setParseFlags(Flags.F_NO_CASE);
215       props.setSeparators("=");
216       props.addString("\"", "\"", "\\");
217       props.addBlockComment(">", "<"); // overread everything outside of tags
218
props.addBlockComment("SCRIPT", "/SCRIPT"); // overread script parts
219
props.addBlockComment("!--", "--"); // overread HTML comments without < and >
220
props.addKeyword("HREF");
221       tokenizer.setSource(new ReaderSource(_reader));
222
223       System.out.println("\nStart looking for links in \"" + _path + "\"");
224       while (tokenizer.hasMoreToken()) {
225         token = tokenizer.nextToken();
226         if (token.getType() == Token.KEYWORD) {
227           tokenizer.nextToken(); // should be the '=' character
228
System.out.println(" " + tokenizer.nextImage());
229           assertTrue(tokenizer.currentImage() != null);
230           assertTrue(tokenizer.currentToken().getType() == Token.STRING);
231         }
232       }
233     } finally {
234       // Cleanup
235
tokenizer.close();
236     }
237
238     long diff = System.currentTimeMillis() - start;
239     System.out.println("Finished after " + diff + " milliseconds");
240   }
241   
242   
243   /**
244    * Extracting the pure contents of a HTML stream.
245    */

246   public void testContentsParsing() throws Throwable JavaDoc {
247     long start = System.currentTimeMillis();
248     TokenizerProperties props = new StandardTokenizerProperties();
249     Tokenizer tokenizer = new StandardTokenizer(props);
250
251     try {
252       tokenizer.setSource(new ReaderSource(_reader));
253       System.out.println("\nStart extracting contents in \"" + _path + "\"");
254
255       props.setParseFlags(Flags.F_NO_CASE | Flags.F_TOKEN_POS_ONLY);
256       props.setWhitespaces(null);
257       props.setSeparators(null);
258       props.addBlockComment("<", ">"); // overread HTML tags
259
props.addBlockComment("<HEAD>", "</HEAD>"); // overread HTML header
260
props.addBlockComment("<!--", "-->"); // overread HTML comments
261

262       while (tokenizer.hasMoreToken()) {
263         tokenizer.nextToken();
264         if (tokenizer.currentToken().getType() != Token.EOF) {
265           System.out.println(tokenizer.currentImage());
266           assertTrue("Method currentImage() returned null.", tokenizer.currentImage() != null);
267         }
268         assertTrue("Found token type " + tokenizer.currentToken().getType()
269                + ", expected NORMAL (" + Token.NORMAL + ") or EOF (" + Token.EOF + ").",
270                tokenizer.currentToken().getType() == Token.NORMAL
271                || tokenizer.currentToken().getType() == Token.EOF);
272       }
273     } finally {
274       tokenizer.close();
275     }
276     
277     long diff = System.currentTimeMillis() - start;
278     System.out.println("Finished after " + diff + " milliseconds");
279   }
280   
281   
282   /**
283    * Testing the line and column counting correctness. This is done by using a
284    * specially formatted file. At a line x and a column y, the method expects
285    * the token "x/y", e.g. "0/0" at the very beginning of the file.
286    */

287   public void testLineCounting() throws Throwable JavaDoc {
288     long start = System.currentTimeMillis();
289     TokenizerProperties props = new StandardTokenizerProperties();
290     Tokenizer tokenizer = new StandardTokenizer(props);
291     Token token;
292     String JavaDoc image;
293     int delPos;
294     int line;
295     int col;
296
297     System.out.println("\nStart counting lines in \"" + _path + "\"");
298     
299     try {
300       tokenizer.setSource(new ReaderSource(_reader));
301       props.setParseFlags(Flags.F_TOKEN_POS_ONLY | Flags.F_COUNT_LINES);
302       props.setWhitespaces(TokenizerProperties.DEFAULT_WHITESPACES);
303       props.setSeparators(TokenizerProperties.DEFAULT_SEPARATORS);
304       props.addLineComment(TokenizerProperties.DEFAULT_LINE_COMMENT);
305       props.addBlockComment(TokenizerProperties.DEFAULT_BLOCK_COMMENT_START, TokenizerProperties.DEFAULT_BLOCK_COMMENT_END);
306
307       while (tokenizer.hasMoreToken()) {
308         token = tokenizer.nextToken();
309         switch (token.getType()) {
310         case Token.NORMAL:
311           image = tokenizer.currentImage();
312           line = Integer.parseInt(image);
313           assertTrue("Missing separator \"/\".", tokenizer.nextToken().getType() == Token.SEPARATOR && tokenizer.currentImage().equals("/"));
314           assertTrue("Missing column number", tokenizer.nextToken().getType() == Token.NORMAL);
315           image = tokenizer.currentImage();
316           col = Integer.parseInt(image);
317           assertTrue("Found line number " + token.getStartLine() + " does not match expected line number " + line,
318                     line == token.getStartLine());
319           assertTrue("Found column number " + token.getStartColumn() + " does not match expected column number " + col,
320                     col == token.getStartColumn());
321           assertTrue("Found token length " + tokenizer.currentToken().getLength() + " does not match expected length " + image.length(),
322                     image.length() == tokenizer.currentToken().getLength());
323           break;
324         }
325       }
326     } finally {
327       tokenizer.close();
328     }
329     
330     long diff = System.currentTimeMillis() - start;
331     System.out.println("Finished after " + diff + " milliseconds");
332   }
333   
334
335   /**
336    * Advanced contents extracting. Lines will be around 80 characters, a basic
337    * paragraph recognition takes place.
338    */

339   public void testContentsFormatting() throws Throwable JavaDoc{
340     long start = System.currentTimeMillis();
341     TokenizerProperties props = new StandardTokenizerProperties();
342     Tokenizer tokenizer = new StandardTokenizer(props);
343     Token token;
344     String JavaDoc image;
345     int len;
346     Object JavaDoc startPRE = new Object JavaDoc();
347     Object JavaDoc endPRE = new Object JavaDoc();
348     int inPRE = 0;
349     
350     // Counter for expected parts
351
int wsCount = 0;
352     int normalCount = 0;
353     int specCount = 0;
354     int commentCount = 0;
355
356     System.out.println("\nStart formatting contents in \"" + _path + "\"");
357
358     try {
359       tokenizer.setSource(new ReaderSource(_reader));
360       props.setParseFlags( Flags.F_NO_CASE
361                          | Flags.F_TOKEN_POS_ONLY
362                          | Flags.F_RETURN_WHITESPACES);
363       props.setSeparators(null);
364       props.addBlockComment("<", ">");
365       props.addBlockComment("<HEAD>", "</HEAD>");
366       props.addBlockComment("<!--", "-->");
367       props.addSpecialSequence("<b>", "");
368       props.addSpecialSequence("</b>", "");
369       props.addSpecialSequence("<i>", "");
370       props.addSpecialSequence("</i>", "");
371       props.addSpecialSequence("<code>", "");
372       props.addSpecialSequence("</code>", "");
373       props.addSpecialSequence("<pre>", startPRE);
374       props.addSpecialSequence("</pre>", endPRE);
375       props.addSpecialSequence("&auml;", "\u00E4", 0, Flags.F_NO_CASE);
376       props.addSpecialSequence("&ouml;", "\u00F6", 0, Flags.F_NO_CASE);
377       props.addSpecialSequence("&uuml;", "\u00FC", 0, Flags.F_NO_CASE);
378       props.addSpecialSequence("&szlig;", "\u00DF", 0, Flags.F_NO_CASE);
379       props.addSpecialSequence("&Auml;", "\u00C4", 0, Flags.F_NO_CASE);
380       props.addSpecialSequence("&Ouml;", "\u00D6", 0, Flags.F_NO_CASE);
381       props.addSpecialSequence("&Uuml;", "\u00DC", 0, Flags.F_NO_CASE);
382       props.addSpecialSequence("&nbsp;", " ", 0, Flags.F_NO_CASE);
383       props.addSpecialSequence("&gt;", ">", 0, Flags.F_NO_CASE);
384       props.addSpecialSequence("&lt;", "<", 0, Flags.F_NO_CASE);
385       props.addSpecialSequence("&copy;", "\u00A9");
386       props.addSpecialSequence("&euro;", "\u20AC");
387
388       len = 0;
389       while (tokenizer.hasMoreToken()) {
390         token = tokenizer.nextToken();
391         switch (token.getType()) {
392         case Token.NORMAL:
393           image = tokenizer.currentImage();
394           assertTrue("Found HTML tag in normal token: " + image, image.indexOf('<') < 0);
395           System.out.print(image);
396           if (inPRE <= 0) {
397             len += token.getLength();
398           }
399           normalCount++;
400           break;
401
402         case Token.SPECIAL_SEQUENCE:
403           image = tokenizer.currentImage();
404           assertTrue("Couldn't find special sequence in properties: " + image, props.specialSequenceExists(image));
405           if (token.getCompanion() == startPRE) {
406             System.out.println();
407             len = 0;
408             inPRE++;
409           } else if (token.getCompanion() == endPRE) {
410             System.out.println();
411             len = 0;
412             inPRE--;
413           } else {
414             System.out.print((String JavaDoc)token.getCompanion());
415           }
416           specCount++;
417           break;
418
419         case Token.BLOCK_COMMENT:
420           if (len > 0) {
421             System.out.println();
422             len = 0;
423           }
424           commentCount++;
425           break;
426
427         case Token.WHITESPACE:
428           if (inPRE > 0) {
429             System.out.print(tokenizer.currentImage());
430           } else if (len > 75) {
431             System.out.println();
432             len = 0;
433           } else if (len > 0) {
434             System.out.print(' ');
435             len++;
436           }
437           wsCount++;
438           break;
439         }
440       }
441
442       // Where should have been something of each categorie
443
assertTrue("Not one simple context part was found in file " + _path + ".", normalCount > 0);
444       assertTrue("No HTML tag found " + _path + ".", commentCount > 0);
445       assertTrue("No whitespaces found " + _path + ".", wsCount > 0);
446       
447     } finally {
448       // cleanup
449
tokenizer.close();
450     }
451
452     // Ready
453
long diff = System.currentTimeMillis() - start;
454     System.out.println("Finished after " + diff + " milliseconds");
455   }
456   
457   //---------------------------------------------------------------------------
458
// Members
459
//
460
protected InputStreamReader JavaDoc _reader = null;
461   protected String JavaDoc _path = null;
462 }
463
Popular Tags