/*
 * TestTokenizerSource.java: JUnit test for a Tokenizer
 *
 * Copyright (C) 2004 Heiko Blau
 *
 * This file belongs to the JTopas test suite.
 * The JTopas test suite is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by the
 * Free Software Foundation; either version 2.1 of the License, or (at your option)
 * any later version.
 *
 * This software is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License along
 * with the JTopas test suite. If not, write to the
 *
 *   Free Software Foundation, Inc.
 *   59 Temple Place, Suite 330,
 *   Boston, MA 02111-1307
 *   USA
 *
 * or check the Internet: http://www.fsf.org
 *
 * The JTopas test suite uses the test framework JUnit by Kent Beck and Erich Gamma.
 * You should have received a copy of their JUnit licence agreement along with
 * the JTopas test suite.
 *
 * We do NOT provide the JUnit archive junit.jar necessary to compile and run
 * our tests, since we assume that you either have it already or would like
 * to get the current release yourself.
 * Please visit either:
 *   http://sourceforge.net/projects/junit
 * or
 *   http://junit.org
 * to obtain JUnit.
 *
 * Contact:
 *   email: heiko@susebox.de
 */

package de.susebox.jtopas;

//-----------------------------------------------------------------------------
// Imports
//
import java.io.Reader;
import java.io.StringReader;
import java.io.File;
import java.io.PrintWriter;
import java.util.Iterator;
import java.util.List;
import java.util.LinkedList;

import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;
import junit.framework.Assert;

import de.susebox.TestUtilities;


//-----------------------------------------------------------------------------
// Class TestTokenizerSource
//

/**<p>
 * The class contains a number of test cases that are supposed to be difficult
 * to handle for a {@link Tokenizer}, e.g. EOF conditions inside strings etc.
 *</p>
 *
 * @see TokenizerSource
 * @author Heiko Blau
 */

public class TestTokenizerSource extends TestCase {
  
  //---------------------------------------------------------------------------
  // properties
  //

  
  //---------------------------------------------------------------------------
  // main method
  //

  /**
   * Call this method to invoke the tests.
   */
  public static void main(String[] args) {
    String[] tests = { TestTokenizerSource.class.getName() };

    TestUtilities.run(tests, args);
  }
  

  //---------------------------------------------------------------------------
  // suite method
  //

  /**
   * Implementation of the JUnit method <code>suite</code>. For each set of test
   * properties one or more tests are instantiated.
   *
   * @return a test suite
   */
  public static Test suite() {
    TestSuite suite = new TestSuite(TestTokenizerSource.class.getName());
  
    suite.addTest(new TestTokenizerSource("testEmptySource"));
    suite.addTest(new TestTokenizerSource("testSmallBuffer"));
    suite.addTest(new TestTokenizerSource("testLargeBuffer"));
    suite.addTest(new TestTokenizerSource("testSpeed"));
    suite.addTest(new TestTokenizerSource("testSimilarResults"));
    suite.addTest(new TestTokenizerSource("testLargeSource"));
    return suite;
  }
  
  
  //---------------------------------------------------------------------------
  // Constructor
  //

  /**
   * Constructor taking the name of the test case (method) to execute, as
   * required by the JUnit framework.
   */
  public TestTokenizerSource(String test) {
    super(test);
  }
  
  
  //---------------------------------------------------------------------------
  // Fixture setup and release
  //

  /**
   * Sets up the fixture, for example, open a network connection.
   * This method is called before a test is executed.
   */
  protected void setUp() throws Exception {}

  
  /**
   * Tears down the fixture, for example, close a network connection.
   * This method is called after a test is executed.
   */
  protected void tearDown() throws Exception {}
  
  
  //---------------------------------------------------------------------------
  // test cases
  //

  /**
   * Test empty data sources
   */
  public void testEmptySource() throws Throwable {
    TokenizerSource[] source = { null, null, null, null, null, null };
    char[] cbuf = new char[8129];
    int count;
    
    source[0] = new CharArraySource(null);
    source[1] = new ReaderSource((java.io.InputStream)null);
    source[2] = new StringSource(null);
    source[3] = new CharArraySource(new char[0]);
    source[4] = new ReaderSource(new StringReader(""));
    source[5] = new StringSource("");
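    // Regardless of the implementation, an empty or null-backed source is
    // expected to behave like an exhausted java.io.Reader: the very first
    // read() call has to return -1.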
    for (int index = 0; index < source.length; ++index) {
      count = source[index].read(cbuf, 0, cbuf.length);
      assertTrue(source[index].getClass().getName() + ": expected -1, got " + count, count == -1);
    }
  }
  
  /**
   * Test a buffer that is smaller than the available data
   */
  public void testSmallBuffer() throws Throwable {
    TokenizerSource[] source = { null, null, null };
    char[] cbuf = new char[1];
    char[] text = new char[DATA.length()];
    int count;
    
    DATA.getChars(0, DATA.length(), text, 0);
    source[0] = new CharArraySource(text);
    source[1] = new ReaderSource(new StringReader(DATA));
    source[2] = new StringSource(DATA);
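    // With a one-character buffer every read() call has to deliver exactly one
    // character; after DATA.length() calls each source must report the end of
    // its data with -1.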
    for (int index = 0; index < source.length; ++index) {
      for (int readIndex = 0; readIndex < DATA.length(); ++readIndex) {
        count = source[index].read(cbuf, 0, cbuf.length);
        assertTrue(source[index].getClass().getName() + ": expected 1, got " + count, count == 1);
      }
      count = source[index].read(cbuf, 0, cbuf.length);
      assertTrue(source[index].getClass().getName() + ": expected -1, got " + count, count == -1);
    }
  }
  
  /**
   * Test a buffer that is larger than the available data
   */
  public void testLargeBuffer() throws Throwable {
    TokenizerSource[] source = { null, null, null };
    char[] cbuf = new char[8192];
    char[] text = new char[DATA.length()];
    int count;
    
    DATA.getChars(0, DATA.length(), text, 0);
    source[0] = new CharArraySource(text);
    source[1] = new ReaderSource(new StringReader(DATA));
    source[2] = new StringSource(DATA);
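    // The 8K buffer is larger than DATA, so a single read() call is expected to
    // deliver the complete text; the following call must signal -1.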
    for (int index = 0; index < source.length; ++index) {
      count = source[index].read(cbuf, 0, cbuf.length);
      assertTrue(source[index].getClass().getName() + ": expected " + DATA.length() + ", got " + count, count == DATA.length());
      count = source[index].read(cbuf, 0, cbuf.length);
      assertTrue(source[index].getClass().getName() + ": expected -1, got " + count, count == -1);
    }
  }
  
  /**
   * Test speed
   */
  public void testSpeed() throws Throwable {
    // construct a really huge string
    TokenizerSource source;
    char[] buffer;
    String text = expandData(20000);
    char[] cbuf = new char[text.length()];

    text.getChars(0, text.length(), cbuf, 0);
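    // Read the same large text through each TokenizerSource implementation with
    // buffer sizes doubling from 8 up to 64K characters and print the elapsed
    // time, giving a rough throughput comparison of the implementations.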
    for (int bufferSize = 8; bufferSize < 0x20000; bufferSize *= 2) {
      System.out.println("Buffer size " + bufferSize + ":");
      buffer = new char[bufferSize];
      
      // CharArraySource
      readSource(new CharArraySource(cbuf), buffer);

      // ReaderSource
      readSource(new ReaderSource(new StringReader(text)), buffer);

      // StringSource
      readSource(new StringSource(text), buffer);
    }
  }
  
  /**
   * Test that the different data sources produce the same results for a text
   * containing similar special sequences.
   */
  public void testSimilarResults() throws Throwable {
    // construct a really huge string
    String text = expandData(1000);

    // initialize the properties
    TokenizerProperties props = new StandardTokenizerProperties();
    StandardTokenizer tokenizer = new StandardTokenizer();
    TokenizerSource source;
    long startTime;

    props.addSpecialSequence(ORIG_SMILEY, ORIG_SMILEY);
    props.addSpecialSequence(FRIGHTENED_SMIKEY, FRIGHTENED_SMIKEY);
    props.addSpecialSequence(WINKING_SMILEY, WINKING_SMILEY);
    props.addString("\"", "\"", "\\");
    props.addString("'", "'", "\\");
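    // The smileys are registered as special sequences and double- as well as
    // single-quoted strings are defined with backslash as escape character, so
    // the resulting token stream mixes ordinary words, strings and sequences.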
    
    try {
      tokenizer.setTokenizerProperties(props);
  
      // CharArraySource
      char[] cbuf = new char[text.length()];
      text.getChars(0, text.length(), cbuf, 0);
  
      // tokenize several times to avoid JIT or hotspot optimization effects
      int loopCount = 100;
      int loops = 0;
      long timeTotal1 = 0;
      long timeTotal2 = 0;
      long timeTotal3 = 0;
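      // In every loop the same text is tokenized through a CharArraySource, a
      // ReaderSource and a StringSource; the per-loop times are accumulated and
      // the three token lists must always have the same size.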
      while (loops++ < loopCount) {
        tokenizer.setSource(new CharArraySource(cbuf));

        startTime = System.currentTimeMillis();
        List list1 = tokenize(tokenizer);
        long time1 = System.currentTimeMillis() - startTime;
        System.out.println("Loop #" + loops + ": CharArraySource needed " + time1 + "ms for " + list1.size() + " token.");
        timeTotal1 += time1;

        // ReaderSource
        tokenizer.setSource(new ReaderSource(new StringReader(text)));

        startTime = System.currentTimeMillis();
        List list2 = tokenize(tokenizer);
        long time2 = System.currentTimeMillis() - startTime;
        System.out.println("Loop #" + loops + ": ReaderSource needed " + time2 + "ms for " + list2.size() + " token.");
        timeTotal2 += time2;

        // StringSource
        tokenizer.setSource(new StringSource(text));

        startTime = System.currentTimeMillis();
        List list3 = tokenize(tokenizer);
        long time3 = System.currentTimeMillis() - startTime;
        System.out.println("Loop #" + loops + ": StringSource needed " + time3 + "ms for " + list3.size() + " token.");
        timeTotal3 += time3;

        System.out.println("CharArraySource has " + list1.size() + " token.");
        System.out.println("ReaderSource has " + list2.size() + " token.");
        System.out.println("StringSource has " + list3.size() + " token.");
        
        // any list shorter than the others?
        assertTrue("CharArraySource token count differs from ReaderSource token count.", list1.size() == list2.size());
        assertTrue("CharArraySource token count differs from StringSource token count.", list1.size() == list3.size());
        // check token list only once
        if (loops == loopCount) {
          System.out.println("CharArraySource total time: " + timeTotal1 + "ms.");
          System.out.println("ReaderSource total time: " + timeTotal2 + "ms.");
          System.out.println("StringSource total time: " + timeTotal3 + "ms.");

          Iterator iter1 = list1.iterator();
          Iterator iter2 = list2.iterator();
          Iterator iter3 = list3.iterator();
          int index = 0;
          while (iter1.hasNext()) {
            // compare token
            Token token1 = (Token)iter1.next();
            Token token2 = (Token)iter2.next();
            Token token3 = (Token)iter3.next();

            assertTrue("Token mismatch at position " + index + ": CharArraySource \"" + token1 + "\", ReaderSource \"" + token2 + "\"",
                      token1.equals(token2));
            assertTrue("Token mismatch at position " + index + ": CharArraySource \"" + token1 + "\", StringSource \"" + token3 + "\"",
                      token1.equals(token3));
            index++;
          }
        }
      }
    } finally {
      tokenizer.close();
    }
  }
  

  /**
   * Test the tokenizing of large data sources.
   */
  public void testLargeSource() throws Throwable {
    // construct a large data source
    String dataItem = "/*\n"
                    + "* This is a Java style data item.\n"
                    + "* It is concatenated \"multible\" times to get a real\n"
                    + "* big chunk of data.\n"
                    + "* With such a lot of characters the speed of the tokenizers\n"
                    + "* can be compared.\n"
                    + "*/\n"
                    + "package org.muppets.gonzo;\n\n"
                    + "/**\n"
                    + "* This is a class comment :-)\n"
                    + "*/\n"
                    + "public class Gonzo extends Serializable {\n\n"
                    + " /** The standard constructor */\n"
                    + " public Gonzo() {\n"
                    + " // nothing todo here\n"
                    + " }\n\n"
                    + " /** a method */\n"
                    + " public String toString() {\n"
                    + " return \"This is Gonzo\";\n"
                    + " }\n\n"
                    + "}\n\n\n";
    int tokenCountPerItem = 35;
    int tokenCount = 0;
    int maxSize = 0x80000;
    StringBuffer data = new StringBuffer(maxSize);
    
    while (data.length() < maxSize) {
      data.append(dataItem);
      tokenCount += tokenCountPerItem;
    }
    tokenCount++; // EOF token
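    // The property set below describes a small subset of Java syntax: block and
    // line comments (configured to be returned as token of their own), strings,
    // a handful of keywords and a few operator-like special sequences.
    // tokenCountPerItem is the number of token expected for one copy of
    // dataItem under these settings.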

    // Set up the Properties
    TokenizerProperties props = new StandardTokenizerProperties();

    props.setParseFlags(Flags.F_RETURN_BLOCK_COMMENTS + Flags.F_RETURN_LINE_COMMENTS + Flags.F_TOKEN_POS_ONLY);
    props.addBlockComment("/*", "*/");
    props.addBlockComment("/**", "*/");
    props.addLineComment("//");
    props.addString("\"", "\"", "\\");
    props.addString("'", "'", "\\");
    props.addKeyword("package");
    props.addKeyword("public");
    props.addKeyword("class");
    props.addKeyword("extends");
    props.addKeyword("return");
    props.addKeyword("if");
    props.addKeyword("then");
    props.addKeyword("while");
    props.addKeyword("for");
    props.addKeyword("int");
    props.addKeyword("char");
    props.addSpecialSequence("(");
    props.addSpecialSequence(")");
    props.addSpecialSequence(";");
    props.addSpecialSequence("==");
    props.addSpecialSequence("!=");
    props.addSpecialSequence("<=");
    props.addSpecialSequence(">=");

    // create the tokenizers.
    // NOTE: the sources have a special structure that is required for the
    // analysis below
    Tokenizer tokenizer = new StandardTokenizer(props);
    Object[] sources = new Object[] { new StringSource(data.toString())
                                    , new ReaderSource(new StringReader(data.toString()))
                                    , new StringSource(data.toString().substring(0, data.toString().length() / 2))
                                    , new ReaderSource(new StringReader(data.toString().substring(0, data.toString().length() / 2)))
                                    , new StringSource(data.toString().substring(0, data.toString().length() / 5))
                                    , new ReaderSource(new StringReader(data.toString().substring(0, data.toString().length() / 5)))
                                    , new StringSource(data.toString().substring(0, data.toString().length() / 20))
                                    , new ReaderSource(new StringReader(data.toString().substring(0, data.toString().length() / 20))) };
    Object[] tokenLists = new Object[] { null
                                       , null
                                       , null
                                       , null
                                       , null
                                       , null
                                       , null
                                       , null };
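    // The array pairs each data size (the full text, one half, one fifth and
    // one twentieth of it) as a StringSource and as a ReaderSource, so that
    // adjacent entries can be compared token by token further down.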

    try {
      for (int index = 0; index < sources.length; ++index) {
        long start = System.currentTimeMillis();
      
        System.out.println(sources[index].getClass().getName() + ": running ...");
        tokenizer.setSource((TokenizerSource)sources[index]);
        
        tokenLists[index] = tokenize(tokenizer);

        System.out.println(sources[index].getClass().getName() + ": " + (System.currentTimeMillis() - start) + "ms.");
      }
    } finally {
      tokenizer.close();
    }
    
    // check the results
    for (int index = 0; index < sources.length; ++index) {
      List tokenList = (List)tokenLists[index];
      
      System.out.println(sources[index].getClass().getName() + " has " + tokenList.size() + " token.");
      
      // only the first 2 data sources have the full token count
      if (index < 2) {
        assertTrue("Expected " + tokenCount + " token, got " + tokenList.size(), tokenCount == tokenList.size());
      }
      
      // compare two lists with the same amount of data
      if (index % 2 == 1) {
        List tokenList0 = (List)tokenLists[index - 1];
        Iterator iter0 = tokenList0.iterator();
        Iterator iter = tokenList.iterator();
        int tokenIndex = 0;
        
        while (iter.hasNext()) {
          Token token0 = (Token)iter0.next();
          Token token = (Token)iter.next();
          
          assertTrue("Token #" + tokenIndex + " differs:\n" + token0 + "\n" + token, token0.equals(token));
          tokenIndex++;
        }
      }
    }
  }

  
  //---------------------------------------------------------------------------
  // helpers
  //

  /**
   * This method returns a {@link java.util.List} of Token.
   */
  private List tokenize(Tokenizer tokenizer) throws Throwable {
    List list = new LinkedList();
    //File file = File.createTempFile(tokenizer.getSource().getClass().getName(), null);
    //PrintWriter writer = new PrintWriter(file.getAbsolutePath());

    try {
      while (tokenizer.hasMoreToken()) {
        Token token = tokenizer.nextToken();

        // writer.println(token);
        list.add(token);
      }
    } finally {
      // writer.close();
    }
    return list;
  }

  /**
   * Expand some text
   */
  private String expandData(int factor) {
    StringBuffer expandedData = new StringBuffer(DATA.length() * factor);
    
    for (int ii = 0; ii < factor; ++ii) {
      expandedData.append(DATA);
    }
    return expandedData.toString();
  }

  /**
   * Read the full source
   */
  private void readSource(TokenizerSource source, char[] buffer) throws Throwable {
    long startTime = System.currentTimeMillis();
    int chars;
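    // drain the source with the given buffer; the loop ends as soon as read()
    // no longer delivers characters (a return value of 0 or -1)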
    while ((chars = source.read(buffer, 0, buffer.length)) > 0);
    System.out.println(source.getClass().getName() + " needed " + (System.currentTimeMillis() - startTime) + "ms.");
  }


  //---------------------------------------------------------------------------
  // members
  //

  // various constants
  private static final String ORIG_SMILEY = ":-)";
  private static final String FRIGHTENED_SMIKEY = "=8-[";
  private static final String WINKING_SMILEY = ".-\\";

  // Text data for the tests
  private static final String DATA =
      "this is a simple text with a lot of perfectly normal\n"
    + "token. And a few separators (brackets are some, for instance)\n"
    + "as well. There could\talso be some\ttabs (\"\\t\")\n"
    + "in between. And 'some strings' :-).\n"
    + "And the smileys (;-), =8-[, .-\\ etc.) should be regarded as\n"
    + "'special sequences'.\n\n";
}