1 2 3 4 package net.nutch.analysis; 5 6 import org.apache.lucene.analysis.Analyzer; 7 import org.apache.lucene.analysis.TokenFilter; 8 import org.apache.lucene.analysis.TokenStream; 9 import org.apache.lucene.analysis.Token; 10 import java.io.Reader ; 11 import java.io.IOException ; 12 13 16 public class NutchDocumentAnalyzer extends Analyzer { 17 18 19 private static class ContentAnalyzer extends Analyzer { 20 21 public TokenStream tokenStream(String field, Reader reader) { 22 return CommonGrams.getFilter(new NutchDocumentTokenizer(reader), field); 23 } 24 } 25 26 27 public static final Analyzer CONTENT_ANALYZER = new ContentAnalyzer(); 28 29 33 35 public static final int INTER_ANCHOR_GAP = 4; 36 37 private static class AnchorFilter extends TokenFilter { 38 public AnchorFilter(TokenStream input) { 39 super(input); 40 } 41 42 private boolean first = true; 43 public final Token next() throws IOException { 44 Token result = input.next(); 45 if (result == null) 46 return result; 47 if (first) { 48 result.setPositionIncrement(INTER_ANCHOR_GAP); 49 first = false; 50 } 51 return result; 52 } 53 } 54 55 private static class AnchorAnalyzer extends Analyzer { 56 public final TokenStream tokenStream(String fieldName, Reader reader) { 57 return new AnchorFilter(CONTENT_ANALYZER.tokenStream(fieldName, reader)); 58 } 59 } 60 61 62 public static final Analyzer ANCHOR_ANALYZER = new AnchorAnalyzer(); 63 64 65 public TokenStream tokenStream(String fieldName, Reader reader) { 66 Analyzer analyzer; 67 if ("url".equals(fieldName) || ("anchor".equals(fieldName))) 68 analyzer = ANCHOR_ANALYZER; 69 else 70 analyzer = CONTENT_ANALYZER; 71 72 return analyzer.tokenStream(fieldName, reader); 73 } 74 75 } 76 | Popular Tags |