KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > searcher > basic > BasicQueryFilter


1 /* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.searcher.basic;
5
6 import org.apache.lucene.search.BooleanQuery;
7 import org.apache.lucene.search.PhraseQuery;
8 import org.apache.lucene.search.TermQuery;
9
10 import net.nutch.analysis.NutchDocumentAnalyzer;
11 import net.nutch.analysis.CommonGrams;
12
13 import net.nutch.searcher.QueryFilter;
14 import net.nutch.searcher.Query;
15 import net.nutch.searcher.Query.*;
16
17 import java.io.IOException JavaDoc;
18 import java.util.HashSet JavaDoc;
19
20 /** The default query filter. Query terms in the default query field are
21  * expanded to search the url, anchor and content document fields.*/

22 public class BasicQueryFilter implements QueryFilter {
23
24   private static float URL_BOOST = 4.0f;
25   private static float ANCHOR_BOOST = 2.0f;
26
27   private static int SLOP = Integer.MAX_VALUE;
28   private static float PHRASE_BOOST = 1.0f;
29
30   private static final String JavaDoc[] FIELDS = {"url", "anchor", "content"};
31   private static final float[] FIELD_BOOSTS = {URL_BOOST, ANCHOR_BOOST, 1.0f};
32
33   /** Set the boost factor for url matches, relative to content and anchor
34    * matches */

35   public static void setUrlBoost(float boost) { URL_BOOST = boost; }
36
37   /** Set the boost factor for title/anchor matches, relative to url and
38    * content matches. */

39   public static void setAnchorBoost(float boost) { ANCHOR_BOOST = boost; }
40
41   /** Set the boost factor for sloppy phrase matches relative to unordered term
42    * matches. */

43   public static void setPhraseBoost(float boost) { PHRASE_BOOST = boost; }
44
45   /** Set the maximum number of terms permitted between matching terms in a
46    * sloppy phrase match. */

47   public static void setSlop(int slop) { SLOP = slop; }
48
49   public BooleanQuery filter(Query input, BooleanQuery output) {
50     addTerms(input, output);
51     addSloppyPhrases(input, output);
52     return output;
53   }
54
55   private static void addTerms(Query input, BooleanQuery output) {
56     Clause[] clauses = input.getClauses();
57     for (int i = 0; i < clauses.length; i++) {
58       Clause c = clauses[i];
59
60       if (!c.getField().equals(Clause.DEFAULT_FIELD))
61         continue; // skip non-default fields
62

63       BooleanQuery out = new BooleanQuery();
64       for (int f = 0; f < FIELDS.length; f++) {
65
66         Clause o = c;
67         if (c.isPhrase()) { // optimize phrase clauses
68
String JavaDoc[] opt = CommonGrams.optimizePhrase(c.getPhrase(), FIELDS[f]);
69           if (opt.length==1) {
70             o = new Clause(new Term(opt[0]), c.isRequired(), c.isProhibited());
71           } else {
72             o = new Clause(new Phrase(opt), c.isRequired(), c.isProhibited());
73           }
74         }
75
76         out.add(o.isPhrase()
77                 ? exactPhrase(o.getPhrase(), FIELDS[f], FIELD_BOOSTS[f])
78                 : termQuery(FIELDS[f], o.getTerm(), FIELD_BOOSTS[f]),
79                 false, false);
80       }
81       output.add(out, c.isRequired(), c.isProhibited());
82     }
83   }
84
85   private static void addSloppyPhrases(Query input, BooleanQuery output) {
86     Clause[] clauses = input.getClauses();
87     for (int f = 0; f < FIELDS.length; f++) {
88
89       PhraseQuery sloppyPhrase = new PhraseQuery();
90       sloppyPhrase.setBoost(FIELD_BOOSTS[f] * PHRASE_BOOST);
91       sloppyPhrase.setSlop("anchor".equals(FIELDS[f])
92                            ? NutchDocumentAnalyzer.INTER_ANCHOR_GAP
93                            : SLOP);
94       int sloppyTerms = 0;
95
96       for (int i = 0; i < clauses.length; i++) {
97         Clause c = clauses[i];
98         
99         if (!c.getField().equals(Clause.DEFAULT_FIELD))
100           continue; // skip non-default fields
101

102         if (c.isPhrase()) // skip exact phrases
103
continue;
104
105         if (c.isProhibited()) // skip prohibited terms
106
continue;
107         
108         sloppyPhrase.add(luceneTerm(FIELDS[f], c.getTerm()));
109         sloppyTerms++;
110       }
111
112       if (sloppyTerms > 1)
113         output.add(sloppyPhrase, false, false);
114     }
115   }
116
117
118   private static org.apache.lucene.search.Query
119         termQuery(String JavaDoc field, Term term, float boost) {
120     TermQuery result = new TermQuery(luceneTerm(field, term));
121     result.setBoost(boost);
122     return result;
123   }
124
125   /** Utility to construct a Lucene exact phrase query for a Nutch phrase. */
126   private static org.apache.lucene.search.Query
127        exactPhrase(Phrase nutchPhrase,
128                    String JavaDoc field, float boost) {
129     Term[] terms = nutchPhrase.getTerms();
130     PhraseQuery exactPhrase = new PhraseQuery();
131     for (int i = 0; i < terms.length; i++) {
132       exactPhrase.add(luceneTerm(field, terms[i]));
133     }
134     exactPhrase.setBoost(boost);
135     return exactPhrase;
136   }
137
138   /** Utility to construct a Lucene Term given a Nutch query term and field. */
139   private static org.apache.lucene.index.Term luceneTerm(String JavaDoc field,
140                                                          Term term) {
141     return new org.apache.lucene.index.Term(field, term.toString());
142   }
143 }
144
Popular Tags