NutchDocumentAnalyzer


1   /* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
2   /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3   
4   package net.nutch.analysis;
5   
6   import org.apache.lucene.analysis.Analyzer;
7   import org.apache.lucene.analysis.TokenFilter;
8   import org.apache.lucene.analysis.TokenStream;
9   import org.apache.lucene.analysis.Token;
10  import java.io.Reader  ;
11  import java.io.IOException  ;
12  
13  /** The analyzer used for Nutch documents.  Uses the JavaCC-defined lexical
14   * analyzer {@link NutchDocumentTokenizer}, with no stop list.  This keeps it
15   * consistent with query parsing. */
16  public class NutchDocumentAnalyzer extends Analyzer {
17  
18    /** Analyzer used to index textual content. */
19    private static class ContentAnalyzer extends Analyzer {
20      /** Constructs a {@link NutchDocumentTokenizer}. */
21      public TokenStream tokenStream(String   field, Reader   reader) {
22        return CommonGrams.getFilter(new NutchDocumentTokenizer(reader), field);
23      }
24    }
25  
26    /** Analyzer used to index textual content. */
27    public static final Analyzer CONTENT_ANALYZER = new ContentAnalyzer();
28  
29    // Anchor Analysis
30    // Like content analysis, but leave gap between anchors to inhibit
31    // cross-anchor phrase matching.
32  
33    /** The number of unused term positions between anchors in the anchor
34     * field. */
35    public static final int INTER_ANCHOR_GAP = 4;
36  
37    private static class AnchorFilter extends TokenFilter {
38      public AnchorFilter(TokenStream input) {
39        super(input);
40      }
41  
42      private boolean first = true;
43      public final Token next() throws IOException   {
44        Token result = input.next();
45        if (result == null)
46          return result;
47        if (first) {
48          result.setPositionIncrement(INTER_ANCHOR_GAP);
49          first = false;
50        }
51        return result;
52      }
53    }
54  
55    private static class AnchorAnalyzer extends Analyzer {
56      public final TokenStream tokenStream(String   fieldName, Reader   reader) {
57        return new AnchorFilter(CONTENT_ANALYZER.tokenStream(fieldName, reader));
58      }
59    }
60  
61    /** Analyzer used to analyze anchors. */
62    public static final Analyzer ANCHOR_ANALYZER = new AnchorAnalyzer();
63  
64    /** Returns a new token stream for text from the named field. */
65    public TokenStream tokenStream(String   fieldName, Reader   reader) {
66      Analyzer analyzer;
67      if ("url".equals(fieldName) || ("anchor".equals(fieldName)))
68        analyzer = ANCHOR_ANALYZER;
69      else
70        analyzer = CONTENT_ANALYZER;
71  
72      return analyzer.tokenStream(fieldName, reader);
73    }
74  
75  }
76
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags