1 2 3 4 package net.nutch.searcher.basic; 5 6 import org.apache.lucene.search.BooleanQuery; 7 import org.apache.lucene.search.PhraseQuery; 8 import org.apache.lucene.search.TermQuery; 9 10 import net.nutch.analysis.NutchDocumentAnalyzer; 11 import net.nutch.analysis.CommonGrams; 12 13 import net.nutch.searcher.QueryFilter; 14 import net.nutch.searcher.Query; 15 import net.nutch.searcher.Query.*; 16 17 import java.io.IOException ; 18 import java.util.HashSet ; 19 20 22 public class BasicQueryFilter implements QueryFilter { 23 24 private static float URL_BOOST = 4.0f; 25 private static float ANCHOR_BOOST = 2.0f; 26 27 private static int SLOP = Integer.MAX_VALUE; 28 private static float PHRASE_BOOST = 1.0f; 29 30 private static final String [] FIELDS = {"url", "anchor", "content"}; 31 private static final float[] FIELD_BOOSTS = {URL_BOOST, ANCHOR_BOOST, 1.0f}; 32 33 35 public static void setUrlBoost(float boost) { URL_BOOST = boost; } 36 37 39 public static void setAnchorBoost(float boost) { ANCHOR_BOOST = boost; } 40 41 43 public static void setPhraseBoost(float boost) { PHRASE_BOOST = boost; } 44 45 47 public static void setSlop(int slop) { SLOP = slop; } 48 49 public BooleanQuery filter(Query input, BooleanQuery output) { 50 addTerms(input, output); 51 addSloppyPhrases(input, output); 52 return output; 53 } 54 55 private static void addTerms(Query input, BooleanQuery output) { 56 Clause[] clauses = input.getClauses(); 57 for (int i = 0; i < clauses.length; i++) { 58 Clause c = clauses[i]; 59 60 if (!c.getField().equals(Clause.DEFAULT_FIELD)) 61 continue; 63 BooleanQuery out = new BooleanQuery(); 64 for (int f = 0; f < FIELDS.length; f++) { 65 66 Clause o = c; 67 if (c.isPhrase()) { String [] opt = CommonGrams.optimizePhrase(c.getPhrase(), FIELDS[f]); 69 if (opt.length==1) { 70 o = new Clause(new Term(opt[0]), c.isRequired(), c.isProhibited()); 71 } else { 72 o = new Clause(new Phrase(opt), c.isRequired(), c.isProhibited()); 73 } 74 } 75 76 out.add(o.isPhrase() 77 ? exactPhrase(o.getPhrase(), FIELDS[f], FIELD_BOOSTS[f]) 78 : termQuery(FIELDS[f], o.getTerm(), FIELD_BOOSTS[f]), 79 false, false); 80 } 81 output.add(out, c.isRequired(), c.isProhibited()); 82 } 83 } 84 85 private static void addSloppyPhrases(Query input, BooleanQuery output) { 86 Clause[] clauses = input.getClauses(); 87 for (int f = 0; f < FIELDS.length; f++) { 88 89 PhraseQuery sloppyPhrase = new PhraseQuery(); 90 sloppyPhrase.setBoost(FIELD_BOOSTS[f] * PHRASE_BOOST); 91 sloppyPhrase.setSlop("anchor".equals(FIELDS[f]) 92 ? NutchDocumentAnalyzer.INTER_ANCHOR_GAP 93 : SLOP); 94 int sloppyTerms = 0; 95 96 for (int i = 0; i < clauses.length; i++) { 97 Clause c = clauses[i]; 98 99 if (!c.getField().equals(Clause.DEFAULT_FIELD)) 100 continue; 102 if (c.isPhrase()) continue; 104 105 if (c.isProhibited()) continue; 107 108 sloppyPhrase.add(luceneTerm(FIELDS[f], c.getTerm())); 109 sloppyTerms++; 110 } 111 112 if (sloppyTerms > 1) 113 output.add(sloppyPhrase, false, false); 114 } 115 } 116 117 118 private static org.apache.lucene.search.Query 119 termQuery(String field, Term term, float boost) { 120 TermQuery result = new TermQuery(luceneTerm(field, term)); 121 result.setBoost(boost); 122 return result; 123 } 124 125 126 private static org.apache.lucene.search.Query 127 exactPhrase(Phrase nutchPhrase, 128 String field, float boost) { 129 Term[] terms = nutchPhrase.getTerms(); 130 PhraseQuery exactPhrase = new PhraseQuery(); 131 for (int i = 0; i < terms.length; i++) { 132 exactPhrase.add(luceneTerm(field, terms[i])); 133 } 134 exactPhrase.setBoost(boost); 135 return exactPhrase; 136 } 137 138 139 private static org.apache.lucene.index.Term luceneTerm(String field, 140 Term term) { 141 return new org.apache.lucene.index.Term(field, term.toString()); 142 } 143 } 144 | Popular Tags |