package net.nutch.analysis;

import net.nutch.searcher.Query;
import net.nutch.searcher.QueryFilters;
import net.nutch.searcher.Query.Clause;

import org.apache.lucene.analysis.StopFilter;

import java.io.*;
import java.util.*;

/**
 * Recursive-descent parser that converts a user query string into a
 * {@link Query} made of required and prohibited phrases.
 *
 * <p>NOTE(review): the {@code jj_*} members and the labelled-loop layout look
 * like JavaCC output. If a grammar file exists, changes made here should be
 * mirrored there, otherwise they are lost on regeneration -- confirm.
 *
 * <p>An instance keeps mutable parsing state (current token, lookahead
 * positions, error bookkeeping) in fields with no synchronization, so one
 * instance must not be shared between threads while parsing.
 */
public class NutchAnalysis implements NutchAnalysisConstants {

  /** Words dropped from a query when they appear alone and unmarked. */
  private static final String [] STOP_WORDS = {
    "a", "and", "are", "as", "at", "be", "but", "by",
    "for", "if", "in", "into", "is", "it",
    "no", "not", "of", "on", "or", "s", "such",
    "t", "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with"
  };

  /** Lookup set built from {@link #STOP_WORDS}. */
  private static final Set STOP_SET = StopFilter.makeStopSet(STOP_WORDS);

  /**
   * The text being parsed. It is only assigned by {@link #parseQuery(String)};
   * it stays {@code null} when the parser is built directly through a public
   * constructor, in which case the raw-field branches of
   * {@link #phrase(String)} and {@link #compound(String)} would fail.
   */
  private String queryString;

  /**
   * Tells whether a word is in the stop-word set.
   *
   * @param word the word to look up; it is matched exactly as given
   * @return {@code true} if {@code word} is contained in the stop set
   */
  public static boolean isStopWord(String word) {
    return STOP_SET.contains(word);
  }

  /**
   * Parses a query string into a {@link Query}.
   *
   * @param queryString the text to parse; must not be {@code null}
   * @return the parsed query
   * @throws IOException if parsing fails. NOTE(review): {@link #parse()}
   *         declares {@code ParseException}, which is presumably an
   *         {@code IOException} subtype -- confirm.
   */
  public static Query parseQuery(String queryString) throws IOException {
    NutchAnalysis parser =
      new NutchAnalysis(new FastCharStream(new StringReader(queryString)));
    // Kept so raw fields can be cut verbatim out of the original text.
    parser.queryString = queryString;
    return parser.parse();
  }

  /**
   * Interactive test driver: prompts on standard output, reads one query per
   * line from standard input and prints the parsed form.
   *
   * <p>The loop ends when standard input is exhausted. Previously the
   * {@code null} returned by {@code readLine()} at end of stream was handed to
   * {@link #parseQuery(String)}, which ended the program with a
   * {@code NullPointerException}.
   *
   * @param args ignored
   * @throws Exception if reading or parsing fails
   */
  public static void main(String [] args) throws Exception {
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    while (true) {
      System.out.print("Query: ");
      String line = in.readLine();
      if (line == null) {
        // End of input: stop instead of parsing a null query.
        break;
      }
      System.out.println(parseQuery(line));
    }
  }

  /**
   * Returns the kind of the next unconsumed token, fetching it through
   * {@link #jj_ntk()} only when it is not already cached in the
   * {@code jj_ntk} field.
   */
  final private int jj_peek() {
    return (jj_ntk == -1) ? jj_ntk() : jj_ntk;
  }

  /**
   * Parses a whole query.
   *
   * <p>Each clause is an optional {@code PLUS}/{@code MINUS} marker, an
   * optional {@code WORD COLON} field prefix, and then either a quoted phrase
   * or a compound term. A clause is added as prohibited when it was marked
   * with {@code MINUS} and as required otherwise. A clause that is a single
   * stop word is dropped unless it was marked with {@code PLUS}/{@code MINUS}
   * or written as a quoted phrase.
   *
   * @return the query assembled from all clauses
   * @throws ParseException if the input does not match the grammar
   */
  final public Query parse() throws ParseException {
    Query query = new Query();
    ArrayList terms;
    Token fieldToken;
    String field;
    boolean stop;
    boolean prohibited;

    nonOpOrTerm();
    label_1:
    while (true) {
      switch (jj_peek()) {
      case WORD:
      case ACRONYM:
      case SIGRAM:
      case PLUS:
      case MINUS:
      case QUOTE:
        break;
      default:
        jj_la1[0] = jj_gen;
        break label_1;
      }

      // Defaults for every clause: droppable if a stop word, required,
      // and in the default field.
      stop = true;
      prohibited = false;
      field = Clause.DEFAULT_FIELD;

      // Optional +/- marker. Either one disables stop-word removal.
      switch (jj_peek()) {
      case PLUS:
      case MINUS:
        switch (jj_peek()) {
        case PLUS:
          jj_consume_token(PLUS);
          stop = false;
          break;
        case MINUS:
          jj_consume_token(MINUS);
          stop = false;
          prohibited = true;
          break;
        default:
          jj_la1[1] = jj_gen;
          jj_consume_token(-1);
          throw new ParseException();
        }
        break;
      default:
        jj_la1[2] = jj_gen;
      }

      // Optional "field:" prefix, accepted only when the lookahead sees a
      // phrase or compound following the colon.
      if (jj_2_1(Integer.MAX_VALUE)) {
        fieldToken = jj_consume_token(WORD);
        jj_consume_token(COLON);
        field = fieldToken.image;
      }

      switch (jj_peek()) {
      case QUOTE:
        terms = phrase(field);
        stop = false;            // quoted phrases are never dropped
        break;
      case WORD:
      case ACRONYM:
      case SIGRAM:
        terms = compound(field);
        break;
      default:
        jj_la1[3] = jj_gen;
        jj_consume_token(-1);
        throw new ParseException();
      }

      nonOpOrTerm();
      String [] array = (String [])terms.toArray(new String [terms.size()]);

      // Short-circuit order matters: array[0] is read only for size-1 lists.
      boolean droppedStopWord =
        stop && terms.size() == 1 && isStopWord(array[0]);
      if (!droppedStopWord) {
        if (prohibited) {
          query.addProhibitedPhrase(array, field);
        } else {
          query.addRequiredPhrase(array, field);
        }
      }
    }
    return query;
  }

  /**
   * Parses a quoted phrase: {@code QUOTE}, any number of non-term tokens,
   * then terms each followed by any number of non-term tokens, then
   * {@code QUOTE}.
   *
   * @param field the field the phrase belongs to
   * @return the terms of the phrase in order; for a field reported as raw by
   *         {@code QueryFilters.isRawField}, a single element holding the
   *         substring of the original query between the quotes
   * @throws ParseException if the input does not match the grammar
   */
  final public ArrayList phrase(String field) throws ParseException {
    int start;
    int end;
    ArrayList result = new ArrayList();
    String term;

    jj_consume_token(QUOTE);
    // 'token' is now the opening quote; its endColumn is used as the start
    // index into queryString (assumes columns line up with string indices
    // -- TODO confirm against Token/FastCharStream).
    start = token.endColumn;

    label_2:
    while (true) {
      switch (jj_peek()) {
      case 0:
      case PLUS:
      case MINUS:
      case COLON:
      case SLASH:
      case DOT:
      case ATSIGN:
      case APOSTROPHE:
      case WHITE:
        break;
      default:
        jj_la1[4] = jj_gen;
        break label_2;
      }
      nonTerm();
    }

    label_3:
    while (true) {
      switch (jj_peek()) {
      case WORD:
      case ACRONYM:
      case SIGRAM:
        break;
      default:
        jj_la1[5] = jj_gen;
        break label_3;
      }
      term = term();
      result.add(term);

      label_4:
      while (true) {
        switch (jj_peek()) {
        case 0:
        case PLUS:
        case MINUS:
        case COLON:
        case SLASH:
        case DOT:
        case ATSIGN:
        case APOSTROPHE:
        case WHITE:
          break;
        default:
          jj_la1[6] = jj_gen;
          break label_4;
        }
        nonTerm();
      }
    }

    // Taken before the closing quote is consumed, so the quote is excluded.
    end = token.endColumn;
    jj_consume_token(QUOTE);

    if (QueryFilters.isRawField(field)) {
      // Raw fields keep the original text instead of the tokenized terms.
      result.clear();
      result.add(queryString.substring(start, end));
    }
    return result;
  }

  /**
   * Parses a compound term: a term followed by any number of
   * "one or more infix tokens, then a term" groups.
   *
   * @param field the field the compound belongs to
   * @return the terms of the compound in order; for a field reported as raw
   *         by {@code QueryFilters.isRawField}, a single element holding the
   *         matching substring of the original query
   * @throws ParseException if the input does not match the grammar
   */
  final public ArrayList compound(String field) throws ParseException {
    int start;
    ArrayList result = new ArrayList();
    String term;

    // End of the token consumed just before this compound, used as the
    // start index into queryString.
    start = token.endColumn;
    term = term();
    result.add(term);

    label_5:
    while (true) {
      // Continue only if infix token(s) followed by a term lie ahead.
      if (!jj_2_2(Integer.MAX_VALUE)) {
        break label_5;
      }
      label_6:
      while (true) {
        infix();
        switch (jj_peek()) {
        case PLUS:
        case MINUS:
        case COLON:
        case SLASH:
        case DOT:
        case ATSIGN:
        case APOSTROPHE:
          break;
        default:
          jj_la1[7] = jj_gen;
          break label_6;
        }
      }
      term = term();
      result.add(term);
    }

    if (QueryFilters.isRawField(field)) {
      // Raw fields keep the original text instead of the tokenized terms.
      result.clear();
      result.add(queryString.substring(start, token.endColumn));
    }
    return result;
  }

  /**
   * Consumes one {@code WORD}, {@code ACRONYM} or {@code SIGRAM} token.
   *
   * @return the image (text) of the consumed token
   * @throws ParseException if the next token is none of those kinds
   */
  final public String term() throws ParseException {
    Token consumed;
    switch (jj_peek()) {
    case WORD:
      consumed = jj_consume_token(WORD);
      break;
    case ACRONYM:
      consumed = jj_consume_token(ACRONYM);
      break;
    case SIGRAM:
      consumed = jj_consume_token(SIGRAM);
      break;
    default:
      jj_la1[8] = jj_gen;
      jj_consume_token(-1);
      throw new ParseException();
    }
    return consumed.image;
  }

  /**
   * Consumes one token that is not a term: {@code WHITE}, any infix token,
   * or the token of kind 0.
   *
   * @throws ParseException if the next token is none of those
   */
  final public void nonTerm() throws ParseException {
    switch (jj_peek()) {
    case WHITE:
      jj_consume_token(WHITE);
      break;
    case PLUS:
    case MINUS:
    case COLON:
    case SLASH:
    case DOT:
    case ATSIGN:
    case APOSTROPHE:
      infix();
      break;
    case 0:
      jj_consume_token(0);
      break;
    default:
      jj_la1[9] = jj_gen;
      jj_consume_token(-1);
      throw new ParseException();
    }
  }

  /**
   * Skips tokens between clauses: whitespace, non-operator infix tokens, and
   * a {@code PLUS}/{@code MINUS} that is followed by a non-term token (so it
   * is not acting as a clause marker). Repeats while the two-token lookahead
   * {@code jj_2_3} succeeds.
   *
   * @throws ParseException if the input does not match the grammar
   */
  final public void nonOpOrTerm() throws ParseException {
    label_7:
    while (true) {
      if (!jj_2_3(2)) {
        break label_7;
      }
      switch (jj_peek()) {
      case WHITE:
        jj_consume_token(WHITE);
        break;
      case COLON:
      case SLASH:
      case DOT:
      case ATSIGN:
      case APOSTROPHE:
        nonOpInfix();
        break;
      case PLUS:
      case MINUS:
        switch (jj_peek()) {
        case PLUS:
          jj_consume_token(PLUS);
          break;
        case MINUS:
          jj_consume_token(MINUS);
          break;
        default:
          jj_la1[10] = jj_gen;
          jj_consume_token(-1);
          throw new ParseException();
        }
        nonTerm();
        break;
      default:
        jj_la1[11] = jj_gen;
        jj_consume_token(-1);
        throw new ParseException();
      }
    }
  }

  /**
   * Consumes one infix token: {@code PLUS}, {@code MINUS}, or any token
   * accepted by {@link #nonOpInfix()}.
   *
   * @throws ParseException if the next token is none of those
   */
  final public void infix() throws ParseException {
    switch (jj_peek()) {
    case PLUS:
      jj_consume_token(PLUS);
      break;
    case MINUS:
      jj_consume_token(MINUS);
      break;
    case COLON:
    case SLASH:
    case DOT:
    case ATSIGN:
    case APOSTROPHE:
      nonOpInfix();
      break;
    default:
      jj_la1[12] = jj_gen;
      jj_consume_token(-1);
      throw new ParseException();
    }
  }

  /**
   * Consumes one of {@code COLON}, {@code SLASH}, {@code DOT},
   * {@code ATSIGN} or {@code APOSTROPHE}.
   *
   * @throws ParseException if the next token is none of those
   */
  final public void nonOpInfix() throws ParseException {
    switch (jj_peek()) {
    case COLON:
      jj_consume_token(COLON);
      break;
    case SLASH:
      jj_consume_token(SLASH);
      break;
    case DOT:
      jj_consume_token(DOT);
      break;
    case ATSIGN:
      jj_consume_token(ATSIGN);
      break;
    case APOSTROPHE:
      jj_consume_token(APOSTROPHE);
      break;
    default:
      jj_la1[13] = jj_gen;
      jj_consume_token(-1);
      throw new ParseException();
    }
  }

  // ---------------------------------------------------------------------
  // Lookahead entry points. Each one starts scanning at the current token,
  // runs the matching jj_3_* routine (which returns true on MISMATCH), and
  // records the attempt via jj_save for later error reporting.
  // ---------------------------------------------------------------------

  /** Lookahead for "WORD COLON (phrase | compound)", used by {@link #parse()}. */
  final private boolean jj_2_1(int xla) {
    jj_la = xla; jj_lastpos = jj_scanpos = token;
    try { return !jj_3_1(); }
    catch(LookaheadSuccess ls) { return true; }
    finally { jj_save(0, xla); }
  }

  /** Lookahead for "infix+ term", used by {@link #compound(String)}. */
  final private boolean jj_2_2(int xla) {
    jj_la = xla; jj_lastpos = jj_scanpos = token;
    try { return !jj_3_2(); }
    catch(LookaheadSuccess ls) { return true; }
    finally { jj_save(1, xla); }
  }

  /** Lookahead for one skippable item, used by {@link #nonOpOrTerm()}. */
  final private boolean jj_2_3(int xla) {
    jj_la = xla; jj_lastpos = jj_scanpos = token;
    try { return !jj_3_3(); }
    catch(LookaheadSuccess ls) { return true; }
    finally { jj_save(2, xla); }
  }

  // ---------------------------------------------------------------------
  // Lookahead scanners. All return true when the scanned tokens do NOT
  // match. Token kinds are written as numeric literals, as in the original.
  // NOTE(review): from the jj_la1_0 masks these appear to be 1-3 = term
  // kinds, 7-8 = PLUS/MINUS, 10-14 = non-operator infix kinds, 15 = WHITE;
  // confirm against NutchAnalysisConstants before relying on it.
  // ---------------------------------------------------------------------

  final private boolean jj_3R_24() {
    return jj_3R_18();
  }

  /** Scans one token of kind 1, 2 or 3. */
  final private boolean jj_3R_11() {
    Token xsp;
    xsp = jj_scanpos;
    if (jj_scan_token(1)) {
      jj_scanpos = xsp;
      if (jj_scan_token(2)) {
        jj_scanpos = xsp;
        if (jj_scan_token(3)) return true;
      }
    }
    return false;
  }

  final private boolean jj_3R_20() {
    if (jj_3R_11()) return true;
    Token xsp;
    while (true) {
      xsp = jj_scanpos;
      if (jj_3R_24()) { jj_scanpos = xsp; break; }
    }
    return false;
  }

  final private boolean jj_3R_9() {
    return jj_3R_15();
  }

  final private boolean jj_3R_22() {
    return jj_3R_17();
  }

  final private boolean jj_3R_19() {
    return jj_3R_18();
  }

  final private boolean jj_3R_12() {
    return jj_3R_17();
  }

  final private boolean jj_3R_10() {
    return jj_3R_16();
  }

  final private boolean jj_3_2() {
    Token xsp;
    if (jj_3R_10()) return true;
    while (true) {
      xsp = jj_scanpos;
      if (jj_3R_10()) { jj_scanpos = xsp; break; }
    }
    if (jj_3R_11()) return true;
    return false;
  }

  /** Scans a complete quoted phrase. */
  final private boolean jj_3R_14() {
    if (jj_scan_token(QUOTE)) return true;
    Token xsp;
    while (true) {
      xsp = jj_scanpos;
      if (jj_3R_19()) { jj_scanpos = xsp; break; }
    }
    while (true) {
      xsp = jj_scanpos;
      if (jj_3R_20()) { jj_scanpos = xsp; break; }
    }
    if (jj_scan_token(QUOTE)) return true;
    return false;
  }

  /** Scans one token of kind 10, 11, 12, 13 or 14. */
  final private boolean jj_3R_17() {
    Token xsp;
    xsp = jj_scanpos;
    if (jj_scan_token(10)) {
      jj_scanpos = xsp;
      if (jj_scan_token(11)) {
        jj_scanpos = xsp;
        if (jj_scan_token(12)) {
          jj_scanpos = xsp;
          if (jj_scan_token(13)) {
            jj_scanpos = xsp;
            if (jj_scan_token(14)) return true;
          }
        }
      }
    }
    return false;
  }

  final private boolean jj_3R_25() {
    return jj_3R_16();
  }

  final private boolean jj_3R_8() {
    return jj_3R_14();
  }

  final private boolean jj_3R_21() {
    Token xsp;
    if (jj_3R_25()) return true;
    while (true) {
      xsp = jj_scanpos;
      if (jj_3R_25()) { jj_scanpos = xsp; break; }
    }
    if (jj_3R_11()) return true;
    return false;
  }

  /** Scans one token of kind 7 or 8, or one accepted by jj_3R_17. */
  final private boolean jj_3R_16() {
    Token xsp;
    xsp = jj_scanpos;
    if (jj_scan_token(7)) {
      jj_scanpos = xsp;
      if (jj_scan_token(8)) {
        jj_scanpos = xsp;
        if (jj_3R_22()) return true;
      }
    }
    return false;
  }

  final private boolean jj_3R_15() {
    if (jj_3R_11()) return true;
    Token xsp;
    while (true) {
      xsp = jj_scanpos;
      if (jj_3R_21()) { jj_scanpos = xsp; break; }
    }
    return false;
  }

  final private boolean jj_3R_23() {
    return jj_3R_16();
  }

  final private boolean jj_3_3() {
    Token xsp;
    xsp = jj_scanpos;
    if (jj_scan_token(15)) {
      jj_scanpos = xsp;
      if (jj_3R_12()) {
        jj_scanpos = xsp;
        if (jj_3R_13()) return true;
      }
    }
    return false;
  }

  final private boolean jj_3_1() {
    if (jj_scan_token(WORD)) return true;
    if (jj_scan_token(COLON)) return true;
    Token xsp;
    xsp = jj_scanpos;
    if (jj_3R_8()) {
      jj_scanpos = xsp;
      if (jj_3R_9()) return true;
    }
    return false;
  }

  final private boolean jj_3R_13() {
    Token xsp;
    xsp = jj_scanpos;
    if (jj_scan_token(7)) {
      jj_scanpos = xsp;
      if (jj_scan_token(8)) return true;
    }
    if (jj_3R_18()) return true;
    return false;
  }

  /** Scans one token of kind 15, one accepted by jj_3R_16, or kind 0. */
  final private boolean jj_3R_18() {
    Token xsp;
    xsp = jj_scanpos;
    if (jj_scan_token(15)) {
      jj_scanpos = xsp;
      if (jj_3R_23()) {
        jj_scanpos = xsp;
        if (jj_scan_token(0)) return true;
      }
    }
    return false;
  }

  // ---------------------------------------------------------------------
  // Parser state.
  // ---------------------------------------------------------------------

  /** Source of tokens. */
  public NutchAnalysisTokenManager token_source;
  /** {@code token} is the most recently consumed token; {@code jj_nt} the peeked one. */
  public Token token, jj_nt;
  /** Cached kind of the next token, or -1 when not yet peeked. */
  private int jj_ntk;
  /** Current and furthest positions of the lookahead scan. */
  private Token jj_scanpos, jj_lastpos;
  /** Remaining lookahead budget for the scan in progress. */
  private int jj_la;
  /** Read by {@link #getToken(int)} to choose its starting token. */
  public boolean lookingAhead = false;
  /** Not referenced anywhere in this file. */
  private boolean jj_semLA;
  /** Incremented each time a token is consumed. */
  private int jj_gen;
  /** Per choice point: the jj_gen value at which that choice last failed. */
  final private int[] jj_la1 = new int[14];
  /** Per choice point: bit mask of the token kinds expected there. */
  static private int[] jj_la1_0;
  static {
    jj_la1_0();
  }
  private static void jj_la1_0() {
    jj_la1_0 = new int[] {0x38e,0x180,0x180,0x20e,0xfd81,0xe,0xfd81,0x7d80,0xe,0xfd81,0x180,0xfd80,0x7d80,0x7c00,};
  }
  /** One chain of recorded lookahead attempts per jj_2_* entry point. */
  final private JJCalls[] jj_2_rtns = new JJCalls[3];
  /** True only while {@link #jj_rescan_token()} replays lookaheads. */
  private boolean jj_rescan = false;
  /** Counts consumed tokens between clean-ups of {@code jj_2_rtns}. */
  private int jj_gc = 0;

  /**
   * Puts the parser back into its initial state: fresh start token, no
   * cached lookahead, generation 0, no recorded failures or lookahead calls.
   * Shared by both constructors and both {@code ReInit} overloads.
   */
  private void jj_reset() {
    token = new Token();
    jj_ntk = -1;
    jj_gen = 0;
    Arrays.fill(jj_la1, -1);
    for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
  }

  /**
   * Creates a parser reading from a character stream.
   *
   * @param stream the characters to tokenize
   */
  public NutchAnalysis(CharStream stream) {
    token_source = new NutchAnalysisTokenManager(stream);
    jj_reset();
  }

  /**
   * Re-initializes this parser, and its existing token manager, on a new
   * character stream.
   *
   * @param stream the characters to tokenize
   */
  public void ReInit(CharStream stream) {
    token_source.ReInit(stream);
    jj_reset();
  }

  /**
   * Creates a parser reading from an existing token manager.
   *
   * @param tm the token manager to pull tokens from
   */
  public NutchAnalysis(NutchAnalysisTokenManager tm) {
    token_source = tm;
    jj_reset();
  }

  /**
   * Re-initializes this parser on a different token manager.
   *
   * @param tm the token manager to pull tokens from
   */
  public void ReInit(NutchAnalysisTokenManager tm) {
    token_source = tm;
    jj_reset();
  }

  /**
   * Advances to the next token and checks its kind.
   *
   * @param kind the expected token kind
   * @return the consumed token, which is also the new value of {@code token}
   * @throws ParseException if the next token has a different kind; in that
   *         case {@code token} is restored to its previous value
   */
  final private Token jj_consume_token(int kind) throws ParseException {
    Token oldToken;
    if ((oldToken = token).next != null) token = token.next;
    else token = token.next = token_source.getNextToken();
    jj_ntk = -1;
    if (token.kind == kind) {
      jj_gen++;
      if (++jj_gc > 100) {
        // Periodically release token references held by stale lookahead
        // records.
        jj_gc = 0;
        for (int i = 0; i < jj_2_rtns.length; i++) {
          JJCalls c = jj_2_rtns[i];
          while (c != null) {
            if (c.gen < jj_gen) c.first = null;
            c = c.next;
          }
        }
      }
      return token;
    }
    token = oldToken;
    jj_kind = kind;
    throw generateParseException();
  }

  /** Thrown to abort a lookahead scan once its budget is used up by a match. */
  static private final class LookaheadSuccess extends java.lang.Error { }
  /** Single shared instance, thrown by {@link #jj_scan_token(int)}. */
  final private LookaheadSuccess jj_ls = new LookaheadSuccess();

  /**
   * Advances the lookahead scan by one token and compares its kind.
   *
   * @param kind the expected token kind
   * @return {@code true} on MISMATCH, {@code false} on match
   */
  final private boolean jj_scan_token(int kind) {
    if (jj_scanpos == jj_lastpos) {
      jj_la--;
      if (jj_scanpos.next == null) {
        jj_lastpos = jj_scanpos = jj_scanpos.next = token_source.getNextToken();
      } else {
        jj_lastpos = jj_scanpos = jj_scanpos.next;
      }
    } else {
      jj_scanpos = jj_scanpos.next;
    }
    if (jj_rescan) {
      // Replaying for error reporting: record which kind was expected at
      // this distance from the current token.
      int i = 0; Token tok = token;
      while (tok != null && tok != jj_scanpos) { i++; tok = tok.next; }
      if (tok != null) jj_add_error_token(kind, i);
    }
    if (jj_scanpos.kind != kind) return true;
    if (jj_la == 0 && jj_scanpos == jj_lastpos) throw jj_ls;
    return false;
  }

  /**
   * Advances to the next token without checking its kind.
   *
   * @return the new current token
   */
  final public Token getNextToken() {
    if (token.next != null) token = token.next;
    else token = token.next = token_source.getNextToken();
    jj_ntk = -1;
    jj_gen++;
    return token;
  }

  /**
   * Returns the token {@code index} positions after the starting token,
   * fetching tokens from the token manager as needed but consuming none.
   * The starting token is {@code jj_scanpos} when {@code lookingAhead} is
   * set and {@code token} otherwise.
   *
   * @param index how many tokens to step forward; 0 returns the start token
   * @return the token at that position
   */
  final public Token getToken(int index) {
    Token t = lookingAhead ? jj_scanpos : token;
    for (int i = 0; i < index; i++) {
      if (t.next != null) t = t.next;
      else t = t.next = token_source.getNextToken();
    }
    return t;
  }

  /**
   * Peeks at the next token, fetching it if necessary, and caches its kind
   * in the {@code jj_ntk} field.
   *
   * @return the kind of the next token
   */
  final private int jj_ntk() {
    if ((jj_nt=token.next) == null)
      return (jj_ntk = (token.next=token_source.getNextToken()).kind);
    else
      return (jj_ntk = jj_nt.kind);
  }

  // ---------------------------------------------------------------------
  // Error-reporting bookkeeping.
  // ---------------------------------------------------------------------

  /** Expected-token sequences collected for the exception being built. */
  private java.util.Vector jj_expentries = new java.util.Vector ();
  private int[] jj_expentry;
  /** Kind passed to the failing jj_consume_token call, or -1. */
  private int jj_kind = -1;
  /** Kinds expected along the lookahead path being replayed. */
  private int[] jj_lasttokens = new int[100];
  private int jj_endpos;

  /**
   * Records that {@code kind} was expected {@code pos} tokens ahead during a
   * replayed lookahead. When the position does not extend the current path,
   * the path collected so far is stored in {@code jj_expentries} unless an
   * identical one is already there.
   */
  private void jj_add_error_token(int kind, int pos) {
    if (pos >= 100) return;
    if (pos == jj_endpos + 1) {
      jj_lasttokens[jj_endpos++] = kind;
    } else if (jj_endpos != 0) {
      jj_expentry = new int[jj_endpos];
      for (int i = 0; i < jj_endpos; i++) {
        jj_expentry[i] = jj_lasttokens[i];
      }
      boolean exists = false;
      for (java.util.Enumeration e = jj_expentries.elements(); e.hasMoreElements();) {
        int[] oldentry = (int[])(e.nextElement());
        if (oldentry.length == jj_expentry.length) {
          exists = true;
          for (int i = 0; i < jj_expentry.length; i++) {
            if (oldentry[i] != jj_expentry[i]) {
              exists = false;
              break;
            }
          }
          if (exists) break;
        }
      }
      if (!exists) jj_expentries.addElement(jj_expentry);
      if (pos != 0) jj_lasttokens[(jj_endpos = pos) - 1] = kind;
    }
  }

  /**
   * Builds a {@code ParseException} for the current position, listing the
   * token kinds that would have been accepted there.
   *
   * @return the exception; the caller is responsible for throwing it
   */
  public ParseException generateParseException() {
    jj_expentries.removeAllElements();
    boolean[] la1tokens = new boolean[20];   // all false initially
    if (jj_kind >= 0) {
      la1tokens[jj_kind] = true;
      jj_kind = -1;
    }
    // Add the kinds expected at every choice point that failed at the
    // current generation.
    for (int i = 0; i < 14; i++) {
      if (jj_la1[i] == jj_gen) {
        for (int j = 0; j < 32; j++) {
          if ((jj_la1_0[i] & (1<<j)) != 0) {
            la1tokens[j] = true;
          }
        }
      }
    }
    for (int i = 0; i < 20; i++) {
      if (la1tokens[i]) {
        jj_expentry = new int[1];
        jj_expentry[0] = i;
        jj_expentries.addElement(jj_expentry);
      }
    }
    jj_endpos = 0;
    jj_rescan_token();
    jj_add_error_token(0, 0);   // flushes the last collected path
    int[][] exptokseq = new int[jj_expentries.size()][];
    for (int i = 0; i < jj_expentries.size(); i++) {
      exptokseq[i] = (int[])jj_expentries.elementAt(i);
    }
    return new ParseException(token, exptokseq, tokenImage);
  }

  /** Does nothing; this parser has no tracing code. */
  final public void enable_tracing() {
  }

  /** Does nothing; this parser has no tracing code. */
  final public void disable_tracing() {
  }

  /**
   * Replays every recorded lookahead that is still pending (its generation
   * is beyond {@code jj_gen}) with {@code jj_rescan} set, so that
   * {@link #jj_scan_token(int)} reports the expected kinds.
   */
  final private void jj_rescan_token() {
    jj_rescan = true;
    for (int i = 0; i < 3; i++) {
      JJCalls p = jj_2_rtns[i];
      do {
        if (p.gen > jj_gen) {
          jj_la = p.arg; jj_lastpos = jj_scanpos = p.first;
          switch (i) {
            case 0: jj_3_1(); break;
            case 1: jj_3_2(); break;
            case 2: jj_3_3(); break;
          }
        }
        p = p.next;
      } while (p != null);
    }
    jj_rescan = false;
  }

  /**
   * Records a lookahead attempt for entry point {@code index} in the first
   * chain slot that is no longer pending, appending a new slot if all are.
   *
   * @param index which jj_2_* entry point made the attempt (0-2)
   * @param xla   the lookahead limit the attempt was started with
   */
  final private void jj_save(int index, int xla) {
    JJCalls p = jj_2_rtns[index];
    while (p.gen > jj_gen) {
      if (p.next == null) { p = p.next = new JJCalls(); break; }
      p = p.next;
    }
    p.gen = jj_gen + xla - jj_la; p.first = token; p.arg = xla;
  }

  /** One recorded lookahead attempt; instances form a singly linked chain. */
  static final class JJCalls {
    /** Generation up to which this record is considered pending. */
    int gen;
    /** Token at which the lookahead started. */
    Token first;
    /** Lookahead limit the attempt was started with. */
    int arg;
    /** Next record in the chain, or {@code null}. */
    JJCalls next;
  }

}