KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > analysis > NutchDocumentAnalyzer


/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import java.io.Reader;
import java.io.IOException;

13 /** The analyzer used for Nutch documents. Uses the JavaCC-defined lexical
14  * analyzer {@link NutchDocumentTokenizer}, with no stop list. This keeps it
15  * consistent with query parsing. */

16 public class NutchDocumentAnalyzer extends Analyzer {
17
18   /** Analyzer used to index textual content. */
19   private static class ContentAnalyzer extends Analyzer {
20     /** Constructs a {@link NutchDocumentTokenizer}. */
21     public TokenStream tokenStream(String JavaDoc field, Reader JavaDoc reader) {
22       return CommonGrams.getFilter(new NutchDocumentTokenizer(reader), field);
23     }
24   }
25
26   /** Analyzer used to index textual content. */
27   public static final Analyzer CONTENT_ANALYZER = new ContentAnalyzer();
28
29   // Anchor Analysis
30
// Like content analysis, but leave gap between anchors to inhibit
31
// cross-anchor phrase matching.
32

33   /** The number of unused term positions between anchors in the anchor
34    * field. */

35   public static final int INTER_ANCHOR_GAP = 4;
36
37   private static class AnchorFilter extends TokenFilter {
38     public AnchorFilter(TokenStream input) {
39       super(input);
40     }
41
42     private boolean first = true;
43     public final Token next() throws IOException JavaDoc {
44       Token result = input.next();
45       if (result == null)
46         return result;
47       if (first) {
48         result.setPositionIncrement(INTER_ANCHOR_GAP);
49         first = false;
50       }
51       return result;
52     }
53   }
54
55   private static class AnchorAnalyzer extends Analyzer {
56     public final TokenStream tokenStream(String JavaDoc fieldName, Reader JavaDoc reader) {
57       return new AnchorFilter(CONTENT_ANALYZER.tokenStream(fieldName, reader));
58     }
59   }
60
61   /** Analyzer used to analyze anchors. */
62   public static final Analyzer ANCHOR_ANALYZER = new AnchorAnalyzer();
63
64   /** Returns a new token stream for text from the named field. */
65   public TokenStream tokenStream(String JavaDoc fieldName, Reader JavaDoc reader) {
66     Analyzer analyzer;
67     if ("url".equals(fieldName) || ("anchor".equals(fieldName)))
68       analyzer = ANCHOR_ANALYZER;
69     else
70       analyzer = CONTENT_ANALYZER;
71
72     return analyzer.tokenStream(fieldName, reader);
73   }
74
75 }
76
Popular Tags