TestTokenizerSource


1   /*
2    * TestTokenizerSource.java: JUnit test for a Tokenizer
3    *
4    * Copyright (C) 2004 Heiko Blau
5    *
6    * This file belongs to the JTopas test suite.
7    * The JTopas test suite is free software; you can redistribute it and/or modify it 
8    * under the terms of the GNU Lesser General Public License as published by the 
9    * Free Software Foundation; either version 2.1 of the License, or (at your option) 
10   * any later version.
11   *
12   * This software is distributed in the hope that it will be useful, but WITHOUT
13   * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
14   * FITNESS FOR A PARTICULAR PURPOSE. 
15   * See the GNU Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public License along 
18   * with the JTopas test suite. If not, write to the
19   *
20   *   Free Software Foundation, Inc.
21   *   59 Temple Place, Suite 330, 
22   *   Boston, MA 02111-1307 
23   *   USA
24   *
25   * or check the Internet: http://www.fsf.org
26   *
27   * The JTopas test suite uses the test framework JUnit by Kent Beck and Erich Gamma.
28   * You should have received a copy of their JUnit licence agreement along with 
29   * the JTopas test suite.
30   *
31   * We do NOT provide the JUnit archive junit.jar necessary to compile and run 
32   * our tests, since we assume that You either have it already or would like 
33   * to get the current release Yourself. 
34   * Please visit either:
35   *   http://sourceforge.net/projects/junit
36   * or
37   *   http://junit.org
38   * to obtain JUnit.
39   *
40   * Contact:
41   *   email: heiko@susebox.de 
42   */
43  
44  package de.susebox.jtopas;
45  
46  //-----------------------------------------------------------------------------
47  // Imports
48  //
49  import java.io.Reader  ;
50  import java.io.StringReader  ;
51  import java.io.File  ;
52  import java.io.PrintWriter  ;
53  import java.util.Iterator  ;
54  import java.util.List  ;
55  import java.util.LinkedList  ;
56  
57  import junit.framework.Test;
58  import junit.framework.TestCase;
59  import junit.framework.TestSuite;
60  import junit.framework.Assert;
61  
62  import de.susebox.TestUtilities;
63  
64  
65  //-----------------------------------------------------------------------------
66  // Class TestTokenizerSource
67  //
68  
69  /**<p>
70   * The class contains test cases for the {@link TokenizerSource} implementations
71   * {@link CharArraySource}, {@link ReaderSource} and {@link StringSource}, e.g. empty input, small and large read buffers etc.
72   *</p>
73   *
74   * @see     TokenizerSource
75   * @author  Heiko Blau
76   */
77  public class TestTokenizerSource extends TestCase {
78    
79    //---------------------------------------------------------------------------
80    // properties
81    //
82  
83    
84    //---------------------------------------------------------------------------
85    // main method
86    //
87    
88    /**
89     * call this method to invoke the tests
90     */
91    public static void main(String  [] args) {
92      String  []   tests = { TestTokenizerSource.class.getName() };
93  
94      TestUtilities.run(tests, args);
95    }
96    
97  
98    //---------------------------------------------------------------------------
99    // suite method
100   //
101   
102   /**
103    * Implementation of the JUnit method <code>suite</code>. For each set of test
104    * properties one or more tests are instantiated.
105    *
106    * @return a test suite
107    */
108   public static Test suite() {
109     TestSuite   suite = new TestSuite(TestTokenizerSource.class.getName());
110   
111     suite.addTest(new TestTokenizerSource("testEmptySource"));
112     suite.addTest(new TestTokenizerSource("testSmallBuffer"));
113     suite.addTest(new TestTokenizerSource("testLargeBuffer"));
114     suite.addTest(new TestTokenizerSource("testSpeed"));
115     suite.addTest(new TestTokenizerSource("testSimilarResults"));
116     suite.addTest(new TestTokenizerSource("testLargeSource"));
117     return suite;
118   }
119   
120   
121   //---------------------------------------------------------------------------
122   // Constructor
123   //
124   
/**
 * Creates a test instance for the test method with the given name. The name
 * is passed on to the JUnit base class.
 *
 * @param test  name of the test method to run (see {@link #suite})
 */
public TestTokenizerSource(String test) {
  super(test);
}
132 
133   
134   //---------------------------------------------------------------------------
135   // Fixture setup and release
136   //
137   
138   /**
139    * Sets up the fixture, for example, open a network connection.
140    * This method is called before a test is executed.
141    */
142   protected void setUp() throws Exception   {}
143 
144   
145   /**
146    * Tears down the fixture, for example, close a network connection.
147    * This method is called after a test is executed.
148    */
149   protected void tearDown() throws Exception   {}
150   
151   
152   //---------------------------------------------------------------------------
153   // test cases
154   //
155 
156   /**
157    * Test empty data sources
158    */
159   public void testEmptySource() throws Throwable   {
160     TokenizerSource[] source  = { null, null, null, null, null, null };
161     char[]            cbuf    = new char[8129];
162     int               count;
163     
164     source[0] = new CharArraySource(null);
165     source[1] = new ReaderSource((java.io.InputStream  )null);
166     source[2] = new StringSource(null);
167     source[3] = new CharArraySource(new char[0]);
168     source[4] = new ReaderSource(new StringReader  (""));
169     source[5] = new StringSource("");
170     for (int index = 0; index < source.length; ++index) {
171       count = source[index].read(cbuf, 0, cbuf.length);
172       assertTrue(source[index].getClass().getName() + ": expected -1, got " + count, count == -1);
173     }
174   }
175   
176   /**
177    * Test a buffer that is smaller than the available data
178    */
179   public void testSmallBuffer() throws Throwable   {
180     TokenizerSource[] source  = { null, null, null };
181     char[]            cbuf    = new char[1];
182     char[]            text    = new char[DATA.length()];
183     int               count;
184     
185     DATA.getChars(0, DATA.length(), text, 0);
186     source[0] = new CharArraySource(text);
187     source[1] = new ReaderSource(new StringReader  (DATA));
188     source[2] = new StringSource(DATA);
189     for (int index = 0; index < source.length; ++index) {
190       for (int readIndex = 0; readIndex < DATA.length(); ++readIndex) {
191         count = source[index].read(cbuf, 0, cbuf.length);
192         assertTrue(source[index].getClass().getName() + ": expected 1, got " + count, count == 1);
193       }
194       count = source[index].read(cbuf, 0, cbuf.length);
195       assertTrue(source[index].getClass().getName() + ": expected -1, got " + count, count == -1);
196     }
197   }
198   
199   /**
200    * Test a buffer that is larger than the available data
201    */
202   public void testLargeBuffer() throws Throwable   {
203     TokenizerSource[] source  = { null, null, null };
204     char[]            cbuf    = new char[8192];
205     char[]            text    = new char[DATA.length()];
206     int               count;
207     
208     DATA.getChars(0, DATA.length(), text, 0);
209     source[0] = new CharArraySource(text);
210     source[1] = new ReaderSource(new StringReader  (DATA));
211     source[2] = new StringSource(DATA);
212     for (int index = 0; index < source.length; ++index) {
213       count = source[index].read(cbuf, 0, cbuf.length);
214       assertTrue(source[index].getClass().getName() + ": expected " + DATA.length() + ", got " + count, count == DATA.length());
215       count = source[index].read(cbuf, 0, cbuf.length);
216       assertTrue(source[index].getClass().getName() + ": expected -1, got " + count, count == -1);
217     }
218   }
219   
220   /**
221    * Test speed
222    */
223   public void testSpeed() throws Throwable   {
224     // construct a really huge string
225     TokenizerSource source;
226     char[]          buffer;
227     String            text   = expandData(20000);
228     char[]          cbuf   = new char[text.length()];
229 
230     text.getChars(0, text.length(), cbuf, 0);
231 
232     for (int bufferSize = 8; bufferSize < 0x20000; bufferSize *= 2) {
233       System.out.println("Buffer size " + bufferSize + ":");
234       buffer = new char[bufferSize];
235       
236       // CharArraySource
237       readSource(new CharArraySource(cbuf), buffer);
238 
239       // ReaderSource
240       readSource(new ReaderSource(new StringReader  (text)), buffer);
241 
242       // StringSource
243       readSource(new StringSource(text), buffer);
244     }
245   }
246   
247   /**
248    * Test similar special sequences.
249    */
250   public void testSimilarResults() throws Throwable   {
251     // construct a really huge string
252     String   text = expandData(1000);
253 
254     // initialize the properties
255     TokenizerProperties props     = new StandardTokenizerProperties();
256     StandardTokenizer   tokenizer = new StandardTokenizer();
257     TokenizerSource     source;
258     long                startTime;
259 
260     props.addSpecialSequence(ORIG_SMILEY,       ORIG_SMILEY);
261     props.addSpecialSequence(FRIGHTENED_SMIKEY, FRIGHTENED_SMIKEY);
262     props.addSpecialSequence(WINKING_SMILEY,    WINKING_SMILEY);
263     props.addString("\"", "\"", "\\");
264     props.addString("'", "'", "\\");
265     
266     try {
267       tokenizer.setTokenizerProperties(props);
268   
269       // CharArraySource
270       char[] cbuf = new char[text.length()];
271 
272       text.getChars(0, text.length(), cbuf, 0);
273   
274       // tokenize several times to avoid JIT or hotspot optimization effects
275       int   loopCount   = 100;
276       int   loops       = 0;
277       long  timeTotal1  = 0;
278       long  timeTotal2  = 0;
279       long  timeTotal3  = 0;
280       
281       while (loops++ < loopCount) {
282         tokenizer.setSource(new CharArraySource(cbuf));
283 
284         startTime  = System.currentTimeMillis();
285         List   list1 = tokenize(tokenizer);
286         long time1 = System.currentTimeMillis() - startTime;
287         System.out.println("Loop #" + loops + ": CharArraySource needed " + time1  + "ms for " + list1.size() + " token.");
288         timeTotal1 += time1;
289 
290         // ReaderSource
291         tokenizer.setSource(new ReaderSource(new StringReader  (text)));
292 
293         startTime = System.currentTimeMillis();
294         List   list2 = tokenize(tokenizer);
295         long time2 = System.currentTimeMillis() - startTime;
296         System.out.println("Loop #" + loops + ": ReaderSource needed " + time2 + "ms for " + list2.size() + " token.");
297         timeTotal2 += time2;
298 
299         // StringSource
300         tokenizer.setSource(new StringSource(text));
301 
302         startTime = System.currentTimeMillis();
303         List   list3 = tokenize(tokenizer);
304         long time3 = System.currentTimeMillis() - startTime;
305         System.out.println("Loop #" + loops + ": StringSource needed " + time3 + "ms for " + list3.size() + " token.");
306         timeTotal3 += time3;
307 
308         System.out.println("CharArraySource has " + list1.size() + " token.");
309         System.out.println("ReaderSource has "    + list2.size() + " token.");
310         System.out.println("StringSource has "    + list3.size() + " token.");
311         
312         // any list shorter than the others?
313         assertTrue("CharArraySource token count differs from ReaderSource token count.", list1.size() == list2.size());
314         assertTrue("CharArraySource token count differs from StringSource token count.", list1.size() == list3.size());
315 
316         // check token list only once
317         if (loops == loopCount) {
318           System.out.println("CharArraySource total time: " + timeTotal1 + "ms.");
319           System.out.println("ReaderSource total time: "    + timeTotal2 + "ms.");
320           System.out.println("StringSource total time: "    + timeTotal3 + "ms.");
321 
322           Iterator   iter1 = list1.iterator();
323           Iterator   iter2 = list2.iterator();
324           Iterator   iter3 = list3.iterator();
325           int      index = 0;
326           while (iter1.hasNext()) {
327             // compare token
328             Token token1 = (Token)iter1.next();
329             Token token2 = (Token)iter2.next();
330             Token token3 = (Token)iter3.next();
331 
332             assertTrue("Token mismatch at position " + index + ": CharArraySource \"" + token1 + "\", ReaderSource \"" + token2 + "\"",
333                       token1.equals(token2));
334             assertTrue("Token mismatch at position " + index + ": CharArraySource \"" + token1 + "\", StringSource \"" + token3 + "\"",
335                       token1.equals(token3));
336             index++;
337           }
338         }
339       }
340     } finally {
341       tokenizer.close();
342     }
343   }
344   
345 
346   /**
347    * Test similar special sequences.
348    */
349   public void testLargeSource() throws Throwable   {
350     // construct a large data source
351     String          dataItem  = "/*\n"
352                             + "* This is a Java style data item.\n"
353                             + "* It is concatenated \"multible\" times to get a real\n"
354                             + "* big chunk of data.\n"
355                             + "* With such a lot of characters the speed of the tokenizers\n"
356                             + "* can be compared.\n"
357                             + "*/\n" 
358                             + "package org.muppets.gonzo;\n\n"
359                             + "/**\n"
360                             + "* This is a class comment :-)\n"
361                             + "*/\n"
362                             + "public class Gonzo extends Serializable {\n\n"
363                             + "  /** The standard constructor */\n"
364                             + "  public Gonzo() {\n"
365                             + "    // nothing todo here\n"
366                             + "  }\n\n"
367                             + "  /** a method */\n"
368                             + "  public String toString() {\n"
369                             + "    return \"This is Gonzo\";\n"
370                             + "  }\n\n"
371                             + "}\n\n\n";
372     int           tokenCountPerItem = 35;
373     int           tokenCount        = 0;
374     int           maxSize           = 0x80000;
375     StringBuffer    data              = new StringBuffer  (maxSize);
376     
377     while (data.length() < maxSize) {
378       data.append(dataItem);
379       tokenCount += tokenCountPerItem;
380     }
381     tokenCount++;   // EOF token
382     
383     // Set up the Properties
384     TokenizerProperties props = new StandardTokenizerProperties();
385 
386     props.setParseFlags(Flags.F_RETURN_BLOCK_COMMENTS + Flags.F_RETURN_LINE_COMMENTS + Flags.F_TOKEN_POS_ONLY);
387     props.addBlockComment("/*", "*/");
388     props.addBlockComment("/**", "*/");
389     props.addLineComment("//");
390     props.addString("\"", "\"", "\\");
391     props.addString("'", "'", "\\");
392     props.addKeyword("package");
393     props.addKeyword("public");
394     props.addKeyword("class");
395     props.addKeyword("extends");
396     props.addKeyword("return");
397     props.addKeyword("if");
398     props.addKeyword("then");
399     props.addKeyword("while");
400     props.addKeyword("for");
401     props.addKeyword("int");
402     props.addKeyword("char");
403     props.addSpecialSequence("(");
404     props.addSpecialSequence(")");
405     props.addSpecialSequence(";");
406     props.addSpecialSequence("==");
407     props.addSpecialSequence("!=");
408     props.addSpecialSequence("<=");
409     props.addSpecialSequence(">=");
410 
411     // create the tokenizers.
412     // NOTE: the sources have a special structure that is required for the
413     // analysis below
414     Tokenizer tokenizer  = new StandardTokenizer(props);
415     Object  []  sources    = new Object  [] { new StringSource(data.toString())
416                                         , new ReaderSource(new StringReader  (data.toString()))
417                                         , new StringSource(data.toString().substring(0, data.toString().length() / 2))
418                                         , new ReaderSource(new StringReader  (data.toString().substring(0, data.toString().length() / 2)))
419                                         , new StringSource(data.toString().substring(0, data.toString().length() / 5))
420                                         , new ReaderSource(new StringReader  (data.toString().substring(0, data.toString().length() / 5)))
421                                         , new StringSource(data.toString().substring(0, data.toString().length() / 20))
422                                         , new ReaderSource(new StringReader  (data.toString().substring(0, data.toString().length() / 20))) };
423     Object  []    tokenLists = new Object  [] { null
424                                         , null
425                                         , null
426                                         , null 
427                                         , null
428                                         , null 
429                                         , null
430                                         , null };
431 
432     try {
433       for (int index = 0; index < sources.length; ++index) {
434         long        start     = System.currentTimeMillis();
435       
436         System.out.println(sources[index].getClass().getName() + ": running ...");
437         tokenizer.setSource((TokenizerSource)sources[index]);
438         
439         tokenLists[index] = tokenize(tokenizer);
440 
441         System.out.println(sources[index].getClass().getName() + ": " + (System.currentTimeMillis() - start) + "ms.");
442       }
443     } finally {
444       tokenizer.close();
445     }
446     
447     // check the results
448     for (int index = 0; index < sources.length; ++index) {
449       List   tokenList  = (List  )tokenLists[index];
450       
451       System.out.println(sources[index].getClass().getName() + " has " + tokenList.size() + " token.");
452       
453       // only the first 2 data sources have the full token count
454       if (index < 2) {
455         assertTrue("Expected " + tokenCount + " token, got " + tokenList.size(), tokenCount == tokenList.size());
456       }
457       
458       // compare two lists with the same amount of data 
459       if (index % 2 == 1) {
460         List        tokenList0 = (List  )tokenLists[index - 1];
461         Iterator    iter0      = tokenList0.iterator();
462         Iterator    iter       = tokenList.iterator();
463         int       tokenIndex = 0;
464         
465         while (iter.hasNext()) {
466           Token token0 = (Token)iter0.next();
467           Token token  = (Token)iter.next();
468           
469           assertTrue("Token #" + tokenIndex + "differs:\n" + token0 + "\n" + token, token0.equals(token));
470           tokenIndex++;
471         }
472       }
473     }
474   }
475 
476   
477   //---------------------------------------------------------------------------
478   // helpers
479   //
480   
481   /**
482    * This method returns a {@link java.util.List} of Token.
483    */
484   private List   tokenize(Tokenizer tokenizer) throws Throwable   {
485     List            list    = new LinkedList  ();
486     //File          file    = File.createTempFile(tokenizer.getSource().getClass().getName(), null);
487     //PrintWriter   writer  = new PrintWriter(file.getAbsolutePath());
488 
489     try {
490       while (tokenizer.hasMoreToken()) {
491         Token   token = tokenizer.nextToken();
492 
493         // writer.println(token);
494         list.add(token);
495       }
496     } finally {
497       // writer.close();
498     }
499     return list;
500   }
501 
502   /**
503    * Expand some text
504    */
505   private String   expandData(int factor) {
506     StringBuffer    expandedData = new StringBuffer  (DATA.length() * factor);
507     
508     for (int ii = 0; ii < factor; ++ii) {
509       expandedData.append(DATA);
510     }
511     return expandedData.toString();
512   }
513 
514   /**
515    * Read the full source
516    */
517   private void readSource(TokenizerSource source, char[] buffer) throws Throwable   {
518     long  startTime = System.currentTimeMillis();
519     int   chars;
520     
521     while ((chars = source.read(buffer, 0, buffer.length)) > 0);
522     System.out.println(source.getClass().getName() + " needed " + (System.currentTimeMillis() - startTime) + "ms.");
523   }
524 
525 
526   //---------------------------------------------------------------------------
527   // members
528   //
529   
530   // various constants
531   private static final String   ORIG_SMILEY       = ":-)";
532   private static final String   FRIGHTENED_SMIKEY = "=8-[";
533   private static final String   WINKING_SMILEY    = ".-\\";
534 
535   // Text data for the tests
536   private static final String   DATA  =  
537       "this is a simple text with a lot of perfectly normal\n"
538     + "token. And a few separators (brackets are some, for instance)\n"
539     + "as well.     There could\talso be some\ttabs (\"\\t\")\n"
540     + "in between. And 'some strings' :-).\n"
541     + "And the smileys (;-), =8-[, .-\\ etc.) should be regarded as\n"
542     + "'special sequences'.\n\n";
543 }  
544 
545
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags