package org.openharmonise.rm.search;

import java.io.*;
import java.util.*;

import org.apache.lucene.analysis.*;

/**
 * Lucene {@link Analyzer} that chains a {@link LowerCaseTokenizer}, a
 * {@link StopFilter} configured with this class's stop word list, and a
 * {@link PorterStemFilter}.
 *
 * <p>The class also exposes the stop word list through static helpers so
 * that other code can test text against the same list the analyzer uses.</p>
 */
public class HarmoniseAnalyzer extends Analyzer {

    /** Stop word set handed to every {@link StopFilter} created by this analyzer. */
    private final Set stopWordsSet;

    /**
     * Creates an analyzer whose stop filter is built from the class's
     * built-in stop word list.
     */
    public HarmoniseAnalyzer() {
        stopWordsSet = StopFilter.makeStopSet(SMART_STOP_WORDS);
    }

    /**
     * Builds the token stream for a field.
     *
     * <p>The reader is wrapped, innermost first, in a
     * {@link LowerCaseTokenizer}, a {@link StopFilter} using this analyzer's
     * stop word set, and a {@link PorterStemFilter}. The same chain is used
     * for every field; <code>fieldName</code> is not consulted.</p>
     *
     * @param fieldName name of the field being analysed (unused)
     * @param reader source of the text to analyse
     * @return the filtered token stream
     */
    public TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream tokens = new LowerCaseTokenizer(reader);
        tokens = new StopFilter(tokens, stopWordsSet);
        return new PorterStemFilter(tokens);
    }

    /**
     * Tells whether any whitespace-delimited token of the given text is one
     * of the stop words.
     *
     * <p>Tokens are lower-cased before the lookup so that the answer agrees
     * with the analyzer chain, which lower-cases text before stop words are
     * filtered. Splitting is on whitespace only (the default
     * {@link StringTokenizer} delimiters), so a token with attached
     * punctuation such as <code>"the,"</code> is not reported.</p>
     *
     * @param sArg text to inspect; <code>null</code> is treated as containing
     *             no stop words
     * @return <code>true</code> if at least one token is a stop word,
     *         <code>false</code> otherwise
     */
    public static boolean containsStopWord(String sArg) {
        if (sArg == null) {
            return false;
        }

        StringTokenizer tokenizer = new StringTokenizer(sArg);

        while (tokenizer.hasMoreTokens()) {
            // Locale.ENGLISH keeps the case mapping independent of the
            // platform default locale (e.g. the Turkish dotless i).
            String sToken = tokenizer.nextToken().toLowerCase(Locale.ENGLISH);

            if (STOP_WORD_LOOKUP.contains(sToken)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Returns the stop words used by this analyzer.
     *
     * @return a fresh copy of the stop word list; modifying the returned
     *         array does not affect the analyzer
     */
    public static String[] getStopWords() {
        return (String[]) SMART_STOP_WORDS.clone();
    }

    /**
     * Built-in stop word list, all lower case.
     * NOTE(review): the name suggests it derives from the SMART retrieval
     * system's stop list - confirm the provenance before extending it.
     */
    private static final String SMART_STOP_WORDS[] = {
        "a", "able", "about", "above", "according", "accordingly", "across",
        "actually", "after", "afterwards", "again", "against", "all", "allow",
        "allows", "almost", "alone", "along", "already", "also", "although",
        "always", "am", "among", "amongst", "an", "and", "another", "any",
        "anybody", "anyhow", "anyone", "anything", "anyway", "anyways",
        "anywhere", "apart", "appear", "appreciate", "appropriate", "are",
        "around", "as", "aside", "ask", "asking", "associated", "at",
        "available", "away", "awfully",
        "b", "be", "became", "because", "become", "becomes", "becoming",
        "been", "before", "beforehand", "behind", "being", "believe", "below",
        "beside", "besides", "best", "better", "between", "beyond", "both",
        "brief", "but", "by",
        "c", "came", "can", "cannot", "cant", "cause", "causes", "certain",
        "certainly", "changes", "clearly", "co", "com", "come", "comes",
        "concerning", "consequently", "consider", "considering", "contain",
        "containing", "contains", "corresponding", "could", "course",
        "currently",
        "d", "definitely", "described", "despite", "did", "different", "do",
        "does", "doing", "done", "down", "downwards", "during",
        "e", "each", "edu", "eg", "eight", "either", "else", "elsewhere",
        "enough", "entirely", "especially", "et", "etc", "even", "ever",
        "every", "everybody", "everyone", "everything", "everywhere", "ex",
        "exactly", "example", "except",
        "f", "far", "few", "fifth", "first", "five", "followed", "following",
        "follows", "for", "former", "formerly", "forth", "four", "from",
        "further", "furthermore",
        "g", "get", "gets", "getting", "given", "gives", "go", "goes",
        "going", "gone", "got", "gotten", "greetings",
        "h", "had", "happens", "hardly", "has", "have", "having", "he",
        "hello", "help", "hence", "her", "here", "hereafter", "hereby",
        "herein", "hereupon", "hers", "herself", "hi", "him", "himself",
        "his", "hither", "hopefully", "how", "howbeit", "however",
        "i", "ie", "if", "ignored", "immediate", "in", "inasmuch", "inc",
        "indeed", "indicate", "indicated", "indicates", "inner", "insofar",
        "instead", "into", "inward", "is", "it", "its", "itself",
        "j", "just",
        "k", "keep", "keeps", "kept", "know", "knows", "known",
        "l", "last", "lately", "later", "latter", "latterly", "least", "less",
        "lest", "let", "like", "liked", "likely", "little", "look", "looking",
        "looks", "ltd",
        "m", "mainly", "many", "may", "maybe", "me", "mean", "meanwhile",
        "merely", "might", "more", "moreover", "most", "mostly", "much",
        "must", "my", "myself",
        "n", "name", "namely", "nd", "near", "nearly", "necessary", "need",
        "needs", "neither", "never", "nevertheless", "new", "next", "nine",
        "no", "nobody", "non", "none", "noone", "nor", "normally", "not",
        "nothing", "novel", "now", "nowhere",
        "o", "obviously", "of", "off", "often", "oh", "ok", "okay", "old",
        "on", "once", "one", "ones", "only", "onto", "or", "other", "others",
        "otherwise", "ought", "our", "ours", "ourselves", "out", "outside",
        "over", "overall", "own",
        "p", "particular", "particularly", "per", "perhaps", "placed",
        "please", "plus", "possible", "presumably", "probably", "provides",
        "q", "que", "quite", "qv",
        "r", "rather", "rd", "re", "really", "reasonably", "regarding",
        "regardless", "regards", "relatively", "respectively", "right",
        "s", "said", "same", "saw", "say", "saying", "says", "second",
        "secondly", "see", "seeing", "seem", "seemed", "seeming", "seems",
        "seen", "self", "selves", "sensible", "sent", "serious", "seriously",
        "seven", "several", "shall", "she", "should", "since", "six", "so",
        "some", "somebody", "somehow", "someone", "something", "sometime",
        "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified",
        "specify", "specifying", "still", "sub", "such", "sup", "sure",
        "t", "take", "taken", "tell", "tends", "th", "than", "thank",
        "thanks", "thanx", "that", "thats", "the", "their", "theirs", "them",
        "themselves", "then", "thence", "there", "thereafter", "thereby",
        "therefore", "therein", "theres", "thereupon", "these", "they",
        "think", "third", "this", "thorough", "thoroughly", "those", "though",
        "three", "through", "throughout", "thru", "thus", "to", "together",
        "too", "took", "toward", "towards", "tried", "tries", "truly", "try",
        "trying", "twice", "two",
        "u", "un", "under", "unfortunately", "unless", "unlikely", "until",
        "unto", "up", "upon", "us", "use", "used", "useful", "uses", "using",
        "usually", "uucp",
        "v", "value", "various", "very", "via", "viz", "vs",
        "w", "want", "wants", "was", "way", "we", "welcome", "well", "went",
        "were", "what", "whatever", "when", "whence", "whenever", "where",
        "whereafter", "whereas", "whereby", "wherein", "whereupon",
        "wherever", "whether", "which", "while", "whither", "who", "whoever",
        "whole", "whom", "whose", "why", "will", "willing", "wish", "with",
        "within", "without", "wonder", "would",
        "x",
        "y", "yes", "yet", "you", "your", "yours", "yourself", "yourselves",
        "z", "zero"
    };

    /**
     * Read-only hash set over {@link #SMART_STOP_WORDS}, built once so that
     * {@link #containsStopWord(String)} does a constant-time lookup per token.
     * Declared after the array on purpose: static initialisers run in
     * textual order, so the array must already be populated here.
     */
    private static final Set STOP_WORD_LOOKUP =
        Collections.unmodifiableSet(new HashSet(Arrays.asList(SMART_STOP_WORDS)));

}