KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > searcher > NutchBean


/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.searcher;
5
6 import java.io.*;
7 import java.net.*;
8 import java.util.*;
9 import java.util.logging.Logger JavaDoc;
10 import javax.servlet.ServletContext JavaDoc;
11
12 import net.nutch.fs.*;
13 import net.nutch.util.*;
14 import net.nutch.parse.*;
15 import net.nutch.indexer.*;
16
17 /**
18  * One stop shopping for search-related functionality.
19  * @version $Id: NutchBean.java,v 1.18 2004/12/05 09:43:48 mike_cafarella Exp $
20  */

21 public class NutchBean
22   implements Searcher, HitDetailer, HitSummarizer, HitContent {
23
24   public static final Logger JavaDoc LOG =
25     LogFormatter.getLogger("net.nutch.searcher.NutchBean");
26
27   static {
28     LogFormatter.setShowThreadIDs(true);
29   }
30
31   private String JavaDoc[] segmentNames;
32
33   private Searcher searcher;
34   private HitDetailer detailer;
35   private HitSummarizer summarizer;
36   private HitContent content;
37
38   private float RAW_HITS_FACTOR =
39     NutchConf.getFloat("searcher.hostgrouping.rawhits.factor", 2.0f);
40
41   /** BooleanQuery won't permit more than 32 required/prohibited clauses. We
42    * don't want to use too many of those. */

43   private static final int MAX_PROHIBITED_TERMS = 20;
44
45   /** Cache in servlet context. */
46   public static NutchBean get(ServletContext JavaDoc app) throws IOException {
47     NutchBean bean = (NutchBean)app.getAttribute("nutchBean");
48     if (bean == null) {
49       LOG.info("creating new bean");
50       bean = new NutchBean();
51       app.setAttribute("nutchBean", bean);
52     }
53     return bean;
54   }
55
56   /** Construct reading from connected directory. */
57   public NutchBean() throws IOException {
58     this(new File(NutchConf.get("searcher.dir", ".")));
59   }
60
61   /** Construct in a named directory. */
62   public NutchBean(File dir) throws IOException {
63     File servers = new File(dir, "search-servers.txt");
64     if (servers.exists()) {
65       LOG.info("searching servers in " + servers.getCanonicalPath());
66       init(new DistributedSearch.Client(servers));
67     } else {
68       init(new File(dir, "index"), new File(dir, "segments"));
69     }
70   }
71
72   private void init(File indexDir, File segmentsDir) throws IOException {
73     IndexSearcher indexSearcher;
74     if (indexDir.exists()) {
75       LOG.info("opening merged index in " + indexDir.getCanonicalPath());
76       indexSearcher = new IndexSearcher(indexDir.getCanonicalPath());
77     } else {
78       LOG.info("opening segment indexes in " + segmentsDir.getCanonicalPath());
79       
80       Vector vDirs=new Vector();
81       File [] directories = segmentsDir.listFiles();
82       for(int i = 0; i < segmentsDir.listFiles().length; i++) {
83         File indexdone = new File(directories[i], IndexSegment.DONE_NAME);
84         if(indexdone.exists() && indexdone.isFile()) {
85           vDirs.add(directories[i]);
86         }
87       }
88       
89       directories = new File[ vDirs.size() ];
90       for(int i = 0; vDirs.size()>0; i++) {
91         directories[i]=(File)vDirs.remove(0);
92       }
93       
94       indexSearcher = new IndexSearcher(directories);
95     }
96
97     FetchedSegments segments = new FetchedSegments(new LocalFileSystem(), segmentsDir.toString());
98     
99     this.segmentNames = segments.getSegmentNames();
100     
101     this.searcher = indexSearcher;
102     this.detailer = indexSearcher;
103     this.summarizer = segments;
104     this.content = segments;
105   }
106
107   private void init(DistributedSearch.Client client) throws IOException {
108     this.segmentNames = client.getSegmentNames();
109     this.searcher = client;
110     this.detailer = client;
111     this.summarizer = client;
112     this.content = client;
113   }
114
115
116   public String JavaDoc[] getSegmentNames() {
117     return segmentNames;
118   }
119
120   public Hits search(Query query, int numHits) throws IOException {
121     return searcher.search(query, numHits);
122   }
123   
124   private class SiteHits extends ArrayList {
125     private boolean maxSizeExceeded;
126   }
127
128   /**
129    * Search for pages matching a query, eliminating excessive hits from sites.
130    * Hits for a site in excess of <code>maxHitsPerSite</code> are removed from
131    * the results. The remaining hits for such sites have {@link
132    * Hit#moreFromSiteExcluded()} set.
133    * <p>
134    * If maxHitsPerSite is zero then all hits are returned.
135    *
136    * @param query query
137    * @param numHits number of requested hits
138    * @param maxHitsPerSite the maximum hits returned per site, or zero
139    * @return Hits the matching hits
140    * @throws IOException
141    */

142   public Hits search(Query query, int numHits, int maxHitsPerSite)
143        throws IOException {
144     if (maxHitsPerSite <= 0) // disable site checking
145
return searcher.search(query, numHits);
146
147     int numHitsRaw = (int)(numHits * RAW_HITS_FACTOR);
148     LOG.info("searching for "+numHitsRaw+" raw hits");
149     Hits hits = searcher.search(query, numHitsRaw);
150     long total = hits.getTotal();
151     Map siteToHits = new HashMap();
152     List resultList = new ArrayList();
153     Set seen = new HashSet();
154     List excludedSites = new ArrayList();
155     boolean totalIsExact = true;
156     for (int rawHitNum = 0; rawHitNum < hits.getTotal(); rawHitNum++) {
157       // get the next raw hit
158
if (rawHitNum >= hits.getLength()) {
159         // optimize query by prohibiting more matches on some excluded sites
160
Query optQuery = (Query)query.clone();
161         for (int i = 0; i < excludedSites.size(); i++) {
162           if (i == MAX_PROHIBITED_TERMS)
163             break;
164           optQuery.addProhibitedTerm(((String JavaDoc)excludedSites.get(i)), "site");
165         }
166         numHitsRaw = (int)(numHitsRaw * RAW_HITS_FACTOR);
167         LOG.info("re-searching for "+numHitsRaw+" raw hits, query: "+optQuery);
168         hits = searcher.search(optQuery, numHitsRaw);
169         LOG.info("found "+hits.getTotal()+" raw hits");
170         rawHitNum = 0;
171         continue;
172       }
173
174       Hit hit = hits.getHit(rawHitNum);
175       if (seen.contains(hit))
176         continue;
177       seen.add(hit);
178       
179       // get site hits for its site
180
String JavaDoc site = hit.getSite();
181       SiteHits siteHits = (SiteHits)siteToHits.get(site);
182       if (siteHits == null)
183         siteToHits.put(site, siteHits = new SiteHits());
184
185       // does this hit exceed maxHitsPerSite?
186
if (siteHits.size() == maxHitsPerSite) { // yes -- ignore the hit
187
if (!siteHits.maxSizeExceeded) {
188
189           // mark prior hits with moreFromSiteExcluded
190
for (int i = 0; i < siteHits.size(); i++) {
191             ((Hit)siteHits.get(i)).setMoreFromSiteExcluded(true);
192           }
193           siteHits.maxSizeExceeded = true;
194
195           excludedSites.add(site); // exclude site
196
}
197         totalIsExact = false;
198       } else { // no -- collect the hit
199
resultList.add(hit);
200         siteHits.add(hit);
201
202         // are we done?
203
// we need to find one more than asked for, so that we can tell if
204
// there are more hits to be shown
205
if (resultList.size() > numHits)
206           break;
207       }
208     }
209
210     Hits results =
211       new Hits(total,
212                (Hit[])resultList.toArray(new Hit[resultList.size()]));
213     results.setTotalIsExact(totalIsExact);
214     return results;
215   }
216     
217
218   public String JavaDoc getExplanation(Query query, Hit hit) throws IOException {
219     return searcher.getExplanation(query, hit);
220   }
221
222   public HitDetails getDetails(Hit hit) throws IOException {
223     return detailer.getDetails(hit);
224   }
225
226   public HitDetails[] getDetails(Hit[] hits) throws IOException {
227     return detailer.getDetails(hits);
228   }
229
230   public String JavaDoc getSummary(HitDetails hit, Query query) throws IOException {
231     return summarizer.getSummary(hit, query);
232   }
233
234   public String JavaDoc[] getSummary(HitDetails[] hits, Query query)
235     throws IOException {
236     return summarizer.getSummary(hits, query);
237   }
238
239   public byte[] getContent(HitDetails hit) throws IOException {
240     return content.getContent(hit);
241   }
242
243   public ParseData getParseData(HitDetails hit) throws IOException {
244     return content.getParseData(hit);
245   }
246
247   public ParseText getParseText(HitDetails hit) throws IOException {
248     return content.getParseText(hit);
249   }
250
251   public String JavaDoc[] getAnchors(HitDetails hit) throws IOException {
252     return content.getAnchors(hit);
253   }
254
255   public long getFetchDate(HitDetails hit) throws IOException {
256     return content.getFetchDate(hit);
257   }
258
259   /** For debugging. */
260   public static void main(String JavaDoc[] args) throws Exception JavaDoc {
261     String JavaDoc usage = "NutchBean query";
262
263     if (args.length == 0) {
264       System.err.println(usage);
265       System.exit(-1);
266     }
267
268     NutchBean bean = new NutchBean();
269     Query query = Query.parse(args[0]);
270
271     Hits hits = bean.search(query, 10);
272     System.out.println("Total hits: " + hits.getTotal());
273     int length = (int)Math.min(hits.getTotal(), 10);
274     Hit[] show = hits.getHits(0, length);
275     HitDetails[] details = bean.getDetails(show);
276     String JavaDoc[] summaries = bean.getSummary(details, query);
277
278     for (int i = 0; i < hits.getLength(); i++) {
279       System.out.println(" "+i+" "+ details[i]);// + "\n" + summaries[i]);
280
}
281   }
282
283
284
285 }
286
Popular Tags