// NOTE(review): this copy of the file carried no license/header comment (it appears to
// have been lost during extraction) - restore it from the upstream distribution if needed.
package de.susebox.jtopas;

import java.io.Reader;
import java.io.StringReader;
import java.io.File;
import java.io.PrintWriter;
import java.util.Iterator;
import java.util.List;
import java.util.LinkedList;

import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;
import junit.framework.Assert;

import de.susebox.TestUtilities;


/**
 * JUnit 3 tests that exercise the {@link TokenizerSource} implementations
 * {@link CharArraySource}, {@link ReaderSource} and {@link StringSource}:
 * direct reads with differently sized buffers, and tokenizing the same text
 * through each source to check that the resulting token lists agree.
 */
public class TestTokenizerSource extends TestCase {

  //---------------------------------------------------------------------------
  // main method
  //

  /**
   * Runs this test class through {@link TestUtilities#run}.
   *
   * @param args command line arguments, passed on unchanged
   */
  public static void main(String[] args) {
    String[] tests = { TestTokenizerSource.class.getName() };

    TestUtilities.run(tests, args);
  }


  //---------------------------------------------------------------------------
  // suite method
  //

  /**
   * Builds the suite containing all test methods of this class.
   *
   * @return the test suite
   */
  public static Test suite() {
    TestSuite suite = new TestSuite(TestTokenizerSource.class.getName());

    suite.addTest(new TestTokenizerSource("testEmptySource"));
    suite.addTest(new TestTokenizerSource("testSmallBuffer"));
    suite.addTest(new TestTokenizerSource("testLargeBuffer"));
    suite.addTest(new TestTokenizerSource("testSpeed"));
    suite.addTest(new TestTokenizerSource("testSimilarResults"));
    suite.addTest(new TestTokenizerSource("testLargeSource"));
    return suite;
  }


  //---------------------------------------------------------------------------
  // constructor
  //

  /**
   * Creates a test case instance bound to one test method.
   *
   * @param test name of the test method to run
   */
  public TestTokenizerSource(String test) {
    super(test);
  }


  //---------------------------------------------------------------------------
  // fixture setup and release
  //

  /**
   * No fixture is needed; every test creates its own sources.
   */
  protected void setUp() throws Exception {}

  /**
   * Nothing to release; see {@link #setUp}.
   */
  protected void tearDown() throws Exception {}


  //---------------------------------------------------------------------------
  // test cases
  //

  /**
   * Asserts that the first read from a source built on <code>null</code> or
   * on empty data returns -1.
   */
  public void testEmptySource() throws Throwable {
    char[] cbuf = new char[8192];
    TokenizerSource[] source = {
      new CharArraySource(null),
      new ReaderSource((java.io.InputStream) null),
      new StringSource(null),
      new CharArraySource(new char[0]),
      new ReaderSource(new StringReader("")),
      new StringSource("")
    };

    for (int index = 0; index < source.length; ++index) {
      int count = source[index].read(cbuf, 0, cbuf.length);

      assertTrue(source[index].getClass().getName() + ": expected -1, got " + count, count == -1);
    }
  }

  /**
   * Reads {@link #DATA} through every source with a one-character buffer.
   * Each read must deliver exactly one character until the data is used up;
   * the following read must return -1.
   */
  public void testSmallBuffer() throws Throwable {
    char[] cbuf = new char[1];
    char[] text = new char[DATA.length()];

    DATA.getChars(0, DATA.length(), text, 0);

    TokenizerSource[] source = {
      new CharArraySource(text),
      new ReaderSource(new StringReader(DATA)),
      new StringSource(DATA)
    };

    for (int index = 0; index < source.length; ++index) {
      int count;

      for (int readIndex = 0; readIndex < DATA.length(); ++readIndex) {
        count = source[index].read(cbuf, 0, cbuf.length);
        assertTrue(source[index].getClass().getName() + ": expected 1, got " + count, count == 1);
      }
      count = source[index].read(cbuf, 0, cbuf.length);
      assertTrue(source[index].getClass().getName() + ": expected -1, got " + count, count == -1);
    }
  }

  /**
   * Reads {@link #DATA} through every source with a buffer larger than the
   * data. The first read must deliver the complete data, the second one -1.
   */
  public void testLargeBuffer() throws Throwable {
    char[] cbuf = new char[8192];
    char[] text = new char[DATA.length()];

    DATA.getChars(0, DATA.length(), text, 0);

    TokenizerSource[] source = {
      new CharArraySource(text),
      new ReaderSource(new StringReader(DATA)),
      new StringSource(DATA)
    };

    for (int index = 0; index < source.length; ++index) {
      int count = source[index].read(cbuf, 0, cbuf.length);

      assertTrue(source[index].getClass().getName() + ": expected " + DATA.length() + ", got " + count, count == DATA.length());
      count = source[index].read(cbuf, 0, cbuf.length);
      assertTrue(source[index].getClass().getName() + ": expected -1, got " + count, count == -1);
    }
  }

  /**
   * Reads a large text completely through every source type, once per buffer
   * size (8, 16, ... doubling while below 0x20000), and prints the time
   * needed. Nothing is asserted; the test only reports timings.
   */
  public void testSpeed() throws Throwable {
    String text = expandData(20000);
    char[] cbuf = new char[text.length()];

    text.getChars(0, text.length(), cbuf, 0);

    for (int bufferSize = 8; bufferSize < 0x20000; bufferSize *= 2) {
      System.out.println("Buffer size " + bufferSize + ":");
      char[] buffer = new char[bufferSize];

      readSource(new CharArraySource(cbuf), buffer);
      readSource(new ReaderSource(new StringReader(text)), buffer);
      readSource(new StringSource(text), buffer);
    }
  }

  /**
   * Tokenizes the same text repeatedly through a {@link CharArraySource}, a
   * {@link ReaderSource} and a {@link StringSource}. In every loop the token
   * counts must be equal; in the last loop the tokens are additionally
   * compared one by one and the accumulated timings are printed.
   */
  public void testSimilarResults() throws Throwable {
    String text = expandData(1000);
    TokenizerProperties props = new StandardTokenizerProperties();
    StandardTokenizer tokenizer = new StandardTokenizer();

    props.addSpecialSequence(ORIG_SMILEY, ORIG_SMILEY);
    props.addSpecialSequence(FRIGHTENED_SMILEY, FRIGHTENED_SMILEY);
    props.addSpecialSequence(WINKING_SMILEY, WINKING_SMILEY);
    props.addString("\"", "\"", "\\");
    props.addString("'", "'", "\\");

    try {
      tokenizer.setTokenizerProperties(props);

      char[] cbuf = new char[text.length()];

      text.getChars(0, text.length(), cbuf, 0);

      int loopCount = 100;
      int loops = 0;
      long timeTotal1 = 0;
      long timeTotal2 = 0;
      long timeTotal3 = 0;

      while (loops++ < loopCount) {
        long startTime;

        // CharArraySource
        tokenizer.setSource(new CharArraySource(cbuf));

        startTime = System.currentTimeMillis();
        List list1 = tokenize(tokenizer);
        long time1 = System.currentTimeMillis() - startTime;
        System.out.println("Loop #" + loops + ": CharArraySource needed " + time1 + "ms for " + list1.size() + " token.");
        timeTotal1 += time1;

        // ReaderSource
        tokenizer.setSource(new ReaderSource(new StringReader(text)));

        startTime = System.currentTimeMillis();
        List list2 = tokenize(tokenizer);
        long time2 = System.currentTimeMillis() - startTime;
        System.out.println("Loop #" + loops + ": ReaderSource needed " + time2 + "ms for " + list2.size() + " token.");
        timeTotal2 += time2;

        // StringSource
        tokenizer.setSource(new StringSource(text));

        startTime = System.currentTimeMillis();
        List list3 = tokenize(tokenizer);
        long time3 = System.currentTimeMillis() - startTime;
        System.out.println("Loop #" + loops + ": StringSource needed " + time3 + "ms for " + list3.size() + " token.");
        timeTotal3 += time3;

        System.out.println("CharArraySource has " + list1.size() + " token.");
        System.out.println("ReaderSource has " + list2.size() + " token.");
        System.out.println("StringSource has " + list3.size() + " token.");

        assertTrue("CharArraySource token count differs from ReaderSource token count.", list1.size() == list2.size());
        assertTrue("CharArraySource token count differs from StringSource token count.", list1.size() == list3.size());

        // The token-by-token comparison is done only once, in the last loop.
        if (loops == loopCount) {
          System.out.println("CharArraySource total time: " + timeTotal1 + "ms.");
          System.out.println("ReaderSource total time: " + timeTotal2 + "ms.");
          System.out.println("StringSource total time: " + timeTotal3 + "ms.");

          // All three lists have the same size (asserted above), so driving
          // the loop by the first iterator is safe for the other two.
          Iterator iter1 = list1.iterator();
          Iterator iter2 = list2.iterator();
          Iterator iter3 = list3.iterator();
          int index = 0;

          while (iter1.hasNext()) {
            Token token1 = (Token) iter1.next();
            Token token2 = (Token) iter2.next();
            Token token3 = (Token) iter3.next();

            assertTrue("Token mismatch at position " + index + ": CharArraySource \"" + token1 + "\", ReaderSource \"" + token2 + "\"",
                       token1.equals(token2));
            assertTrue("Token mismatch at position " + index + ": CharArraySource \"" + token1 + "\", StringSource \"" + token3 + "\"",
                       token1.equals(token3));
            index++;
          }
        }
      }
    } finally {
      tokenizer.close();
    }
  }

  /**
   * Tokenizes a large, Java-like text (and its first half, fifth and
   * twentieth) through a {@link StringSource} and a {@link ReaderSource}.
   * For the complete text the token count is checked against the expected
   * number; for every StringSource/ReaderSource pair the token lists must
   * have the same size and equal tokens.
   */
  public void testLargeSource() throws Throwable {
    String dataItem = "/*\n"
                    + "* This is a Java style data item.\n"
                    + "* It is concatenated \"multible\" times to get a real\n"
                    + "* big chunk of data.\n"
                    + "* With such a lot of characters the speed of the tokenizers\n"
                    + "* can be compared.\n"
                    + "*/\n"
                    + "package org.muppets.gonzo;\n\n"
                    + "/**\n"
                    + "* This is a class comment :-)\n"
                    + "*/\n"
                    + "public class Gonzo extends Serializable {\n\n"
                    + " /** The standard constructor */\n"
                    + " public Gonzo() {\n"
                    + " // nothing todo here\n"
                    + " }\n\n"
                    + " /** a method */\n"
                    + " public String toString() {\n"
                    + " return \"This is Gonzo\";\n"
                    + " }\n\n"
                    + "}\n\n\n";
    int tokenCountPerItem = 35;
    int tokenCount = 0;
    int maxSize = 0x80000;
    StringBuffer data = new StringBuffer(maxSize);

    while (data.length() < maxSize) {
      data.append(dataItem);
      tokenCount += tokenCountPerItem;
    }
    // One additional token on top of the per-item tokens
    // (presumably the end-of-file token - TODO confirm).
    tokenCount++;

    TokenizerProperties props = new StandardTokenizerProperties();

    props.setParseFlags(Flags.F_RETURN_BLOCK_COMMENTS + Flags.F_RETURN_LINE_COMMENTS + Flags.F_TOKEN_POS_ONLY);
    props.addBlockComment("/*", "*/");
    props.addBlockComment("/**", "*/");
    props.addLineComment("//");
    props.addString("\"", "\"", "\\");
    props.addString("'", "'", "\\");
    props.addKeyword("package");
    props.addKeyword("public");
    props.addKeyword("class");
    props.addKeyword("extends");
    props.addKeyword("return");
    props.addKeyword("if");
    props.addKeyword("then");
    props.addKeyword("while");
    props.addKeyword("for");
    props.addKeyword("int");
    props.addKeyword("char");
    props.addSpecialSequence("(");
    props.addSpecialSequence(")");
    props.addSpecialSequence(";");
    props.addSpecialSequence("==");
    props.addSpecialSequence("!=");
    props.addSpecialSequence("<=");
    props.addSpecialSequence(">=");

    Tokenizer tokenizer = new StandardTokenizer(props);

    // Materialize the large text once instead of calling data.toString()
    // for every source.
    String text = data.toString();
    String half = text.substring(0, text.length() / 2);
    String fifth = text.substring(0, text.length() / 5);
    String twentieth = text.substring(0, text.length() / 20);

    // Sources come in pairs: even index = StringSource, odd index = the
    // ReaderSource on the same text. The first pair holds the complete text.
    TokenizerSource[] sources = {
      new StringSource(text),
      new ReaderSource(new StringReader(text)),
      new StringSource(half),
      new ReaderSource(new StringReader(half)),
      new StringSource(fifth),
      new ReaderSource(new StringReader(fifth)),
      new StringSource(twentieth),
      new ReaderSource(new StringReader(twentieth))
    };
    List[] tokenLists = new List[sources.length];

    try {
      for (int index = 0; index < sources.length; ++index) {
        long start = System.currentTimeMillis();

        System.out.println(sources[index].getClass().getName() + ": running ...");
        tokenizer.setSource(sources[index]);

        tokenLists[index] = tokenize(tokenizer);

        System.out.println(sources[index].getClass().getName() + ": " + (System.currentTimeMillis() - start) + "ms.");
      }
    } finally {
      tokenizer.close();
    }

    for (int index = 0; index < sources.length; ++index) {
      List tokenList = tokenLists[index];

      System.out.println(sources[index].getClass().getName() + " has " + tokenList.size() + " token.");

      // Only the first pair tokenizes the complete text with a known count.
      if (index < 2) {
        assertTrue("Expected " + tokenCount + " token, got " + tokenList.size(), tokenCount == tokenList.size());
      }

      // Compare each ReaderSource result with its StringSource partner.
      if (index % 2 == 1) {
        List tokenList0 = tokenLists[index - 1];

        // Without this check a longer list would end in a
        // NoSuchElementException and a shorter one would pass unnoticed.
        assertEquals("Token count of " + sources[index - 1].getClass().getName()
                     + " differs from token count of " + sources[index].getClass().getName()
                     + " (source pair starting at index " + (index - 1) + ")",
                     tokenList0.size(), tokenList.size());

        Iterator iter0 = tokenList0.iterator();
        Iterator iter = tokenList.iterator();
        int tokenIndex = 0;

        while (iter.hasNext()) {
          Token token0 = (Token) iter0.next();
          Token token = (Token) iter.next();

          assertTrue("Token #" + tokenIndex + " differs:\n" + token0 + "\n" + token, token0.equals(token));
          tokenIndex++;
        }
      }
    }
  }


  //---------------------------------------------------------------------------
  // helpers
  //

  /**
   * Collects all tokens the given tokenizer delivers from its current source.
   *
   * @param tokenizer the tokenizer to drain; its source must already be set
   * @return the tokens in the order they were returned
   */
  private List tokenize(Tokenizer tokenizer) throws Throwable {
    List list = new LinkedList();

    while (tokenizer.hasMoreToken()) {
      list.add(tokenizer.nextToken());
    }
    return list;
  }

  /**
   * Concatenates {@link #DATA} the given number of times.
   *
   * @param factor how often {@link #DATA} is repeated
   * @return the expanded text
   */
  private String expandData(int factor) {
    StringBuffer expandedData = new StringBuffer(DATA.length() * factor);

    for (int ii = 0; ii < factor; ++ii) {
      expandedData.append(DATA);
    }
    return expandedData.toString();
  }

  /**
   * Reads the given source with the given buffer until a read delivers no
   * more characters, then prints the elapsed time.
   *
   * @param source the source to read
   * @param buffer the buffer handed to every read call
   */
  private void readSource(TokenizerSource source, char[] buffer) throws Throwable {
    long startTime = System.currentTimeMillis();

    while (source.read(buffer, 0, buffer.length) > 0) {
      // drain only - the characters read are not inspected
    }
    System.out.println(source.getClass().getName() + " needed " + (System.currentTimeMillis() - startTime) + "ms.");
  }


  //---------------------------------------------------------------------------
  // class constants
  //

  /** Special sequences registered in {@link #testSimilarResults}; each of them occurs in {@link #DATA}. */
  private static final String ORIG_SMILEY = ":-)";
  private static final String FRIGHTENED_SMILEY = "=8-[";
  private static final String WINKING_SMILEY = ".-\\";

  /** Base text used by the buffer tests and expanded for the larger tests. */
  private static final String DATA =
      "this is a simple text with a lot of perfectly normal\n"
    + "token. And a few separators (brackets are some, for instance)\n"
    + "as well. There could\talso be some\ttabs (\"\\t\")\n"
    + "in between. And 'some strings' :-).\n"
    + "And the smileys (;-), =8-[, .-\\ etc.) should be regarded as\n"
    + "'special sequences'.\n\n";
}