NutchBean


1   /* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
2   /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3   
4   package net.nutch.searcher;
5   
6   import java.io.*;
7   import java.net.*;
8   import java.util.*;
9   import java.util.logging.Logger  ;
10  import javax.servlet.ServletContext  ;
11  
12  import net.nutch.fs.*;
13  import net.nutch.util.*;
14  import net.nutch.parse.*;
15  import net.nutch.indexer.*;
16  
17  /** 
18   * One stop shopping for search-related functionality.
19   * @version $Id: NutchBean.java,v 1.18 2004/12/05 09:43:48 mike_cafarella Exp $
20   */   
21  public class NutchBean
22    implements Searcher, HitDetailer, HitSummarizer, HitContent {
23  
24    public static final Logger   LOG =
25      LogFormatter.getLogger("net.nutch.searcher.NutchBean");
26  
27    static {
28      LogFormatter.setShowThreadIDs(true);
29    }
30  
31    private String  [] segmentNames;
32  
33    private Searcher searcher;
34    private HitDetailer detailer;
35    private HitSummarizer summarizer;
36    private HitContent content;
37  
38    private float RAW_HITS_FACTOR =
39      NutchConf.getFloat("searcher.hostgrouping.rawhits.factor", 2.0f);
40  
41    /** BooleanQuery won't permit more than 32 required/prohibited clauses.  We
42     * don't want to use too many of those. */ 
43    private static final int MAX_PROHIBITED_TERMS = 20;
44  
45    /** Cache in servlet context. */
46    public static NutchBean get(ServletContext   app) throws IOException {
47      NutchBean bean = (NutchBean)app.getAttribute("nutchBean");
48      if (bean == null) {
49        LOG.info("creating new bean");
50        bean = new NutchBean();
51        app.setAttribute("nutchBean", bean);
52      }
53      return bean;
54    }
55  
56    /** Construct reading from connected directory. */
57    public NutchBean() throws IOException {
58      this(new File(NutchConf.get("searcher.dir", ".")));
59    }
60  
61    /** Construct in a named directory. */
62    public NutchBean(File dir) throws IOException {
63      File servers = new File(dir, "search-servers.txt");
64      if (servers.exists()) {
65        LOG.info("searching servers in " + servers.getCanonicalPath());
66        init(new DistributedSearch.Client(servers));
67      } else {
68        init(new File(dir, "index"), new File(dir, "segments"));
69      }
70    }
71  
72    private void init(File indexDir, File segmentsDir) throws IOException {
73      IndexSearcher indexSearcher;
74      if (indexDir.exists()) {
75        LOG.info("opening merged index in " + indexDir.getCanonicalPath());
76        indexSearcher = new IndexSearcher(indexDir.getCanonicalPath());
77      } else {
78        LOG.info("opening segment indexes in " + segmentsDir.getCanonicalPath());
79        
80        Vector vDirs=new Vector();
81        File [] directories = segmentsDir.listFiles();
82        for(int i = 0; i < segmentsDir.listFiles().length; i++) {
83          File indexdone = new File(directories[i], IndexSegment.DONE_NAME);
84          if(indexdone.exists() && indexdone.isFile()) {
85            vDirs.add(directories[i]);
86          }
87        }
88        
89        directories = new File[ vDirs.size() ];
90        for(int i = 0; vDirs.size()>0; i++) {
91          directories[i]=(File)vDirs.remove(0);
92        }
93        
94        indexSearcher = new IndexSearcher(directories);
95      }
96  
97      FetchedSegments segments = new FetchedSegments(new LocalFileSystem(), segmentsDir.toString());
98      
99      this.segmentNames = segments.getSegmentNames();
100     
101     this.searcher = indexSearcher;
102     this.detailer = indexSearcher;
103     this.summarizer = segments;
104     this.content = segments;
105   }
106 
107   private void init(DistributedSearch.Client client) throws IOException {
108     this.segmentNames = client.getSegmentNames();
109     this.searcher = client;
110     this.detailer = client;
111     this.summarizer = client;
112     this.content = client;
113   }
114 
115 
116   public String  [] getSegmentNames() {
117     return segmentNames;
118   }
119 
120   public Hits search(Query query, int numHits) throws IOException {
121     return searcher.search(query, numHits);
122   }
123   
124   private class SiteHits extends ArrayList {
125     private boolean maxSizeExceeded;
126   }
127 
128   /**
129    * Search for pages matching a query, eliminating excessive hits from sites.
130    * Hits for a site in excess of <code>maxHitsPerSite</code> are removed from
131    * the results.  The remaining hits for such sites have {@link
132    * Hit#moreFromSiteExcluded()} set.
133    * <p>
134    * If maxHitsPerSite is zero then all hits are returned.
135    * 
136    * @param query query
137    * @param numHits number of requested hits
138    * @param maxHitsPerSite the maximum hits returned per site, or zero
139    * @return Hits the matching hits
140    * @throws IOException
141    */
142   public Hits search(Query query, int numHits, int maxHitsPerSite)
143        throws IOException {
144     if (maxHitsPerSite <= 0)                      // disable site checking
145       return searcher.search(query, numHits);
146 
147     int numHitsRaw = (int)(numHits * RAW_HITS_FACTOR);
148     LOG.info("searching for "+numHitsRaw+" raw hits");
149     Hits hits = searcher.search(query, numHitsRaw);
150     long total = hits.getTotal();
151     Map siteToHits = new HashMap();
152     List resultList = new ArrayList();
153     Set seen = new HashSet();
154     List excludedSites = new ArrayList();
155     boolean totalIsExact = true;
156     for (int rawHitNum = 0; rawHitNum < hits.getTotal(); rawHitNum++) {
157       // get the next raw hit
158       if (rawHitNum >= hits.getLength()) {
159         // optimize query by prohibiting more matches on some excluded sites
160         Query optQuery = (Query)query.clone();
161         for (int i = 0; i < excludedSites.size(); i++) {
162           if (i == MAX_PROHIBITED_TERMS)
163             break;
164           optQuery.addProhibitedTerm(((String  )excludedSites.get(i)), "site");
165         }
166         numHitsRaw = (int)(numHitsRaw * RAW_HITS_FACTOR);
167         LOG.info("re-searching for "+numHitsRaw+" raw hits, query: "+optQuery);
168         hits = searcher.search(optQuery, numHitsRaw);
169         LOG.info("found "+hits.getTotal()+" raw hits");
170         rawHitNum = 0;
171         continue;
172       }
173 
174       Hit hit = hits.getHit(rawHitNum);
175       if (seen.contains(hit))
176         continue;
177       seen.add(hit);
178       
179       // get site hits for its site
180       String   site = hit.getSite();
181       SiteHits siteHits = (SiteHits)siteToHits.get(site);
182       if (siteHits == null)
183         siteToHits.put(site, siteHits = new SiteHits());
184 
185       // does this hit exceed maxHitsPerSite?
186       if (siteHits.size() == maxHitsPerSite) {    // yes -- ignore the hit
187         if (!siteHits.maxSizeExceeded) {
188 
189           // mark prior hits with moreFromSiteExcluded
190           for (int i = 0; i < siteHits.size(); i++) {
191             ((Hit)siteHits.get(i)).setMoreFromSiteExcluded(true);
192           }
193           siteHits.maxSizeExceeded = true;
194 
195           excludedSites.add(site);                // exclude site
196         }
197         totalIsExact = false;
198       } else {                                    // no -- collect the hit
199         resultList.add(hit);
200         siteHits.add(hit);
201 
202         // are we done?
203         // we need to find one more than asked for, so that we can tell if
204         // there are more hits to be shown
205         if (resultList.size() > numHits)
206           break;
207       }
208     }
209 
210     Hits results =
211       new Hits(total,
212                (Hit[])resultList.toArray(new Hit[resultList.size()]));
213     results.setTotalIsExact(totalIsExact);
214     return results;
215   }
216     
217 
218   public String   getExplanation(Query query, Hit hit) throws IOException {
219     return searcher.getExplanation(query, hit);
220   }
221 
222   public HitDetails getDetails(Hit hit) throws IOException {
223     return detailer.getDetails(hit);
224   }
225 
226   public HitDetails[] getDetails(Hit[] hits) throws IOException {
227     return detailer.getDetails(hits);
228   }
229 
230   public String   getSummary(HitDetails hit, Query query) throws IOException {
231     return summarizer.getSummary(hit, query);
232   }
233 
234   public String  [] getSummary(HitDetails[] hits, Query query)
235     throws IOException {
236     return summarizer.getSummary(hits, query);
237   }
238 
239   public byte[] getContent(HitDetails hit) throws IOException {
240     return content.getContent(hit);
241   }
242 
243   public ParseData getParseData(HitDetails hit) throws IOException {
244     return content.getParseData(hit);
245   }
246 
247   public ParseText getParseText(HitDetails hit) throws IOException {
248     return content.getParseText(hit);
249   }
250 
251   public String  [] getAnchors(HitDetails hit) throws IOException {
252     return content.getAnchors(hit);
253   }
254 
255   public long getFetchDate(HitDetails hit) throws IOException {
256     return content.getFetchDate(hit);
257   }
258 
259   /** For debugging. */
260   public static void main(String  [] args) throws Exception   {
261     String   usage = "NutchBean query";
262 
263     if (args.length == 0) {
264       System.err.println(usage);
265       System.exit(-1);
266     }
267 
268     NutchBean bean = new NutchBean();
269     Query query = Query.parse(args[0]);
270 
271     Hits hits = bean.search(query, 10);
272     System.out.println("Total hits: " + hits.getTotal());
273     int length = (int)Math.min(hits.getTotal(), 10);
274     Hit[] show = hits.getHits(0, length);
275     HitDetails[] details = bean.getDetails(show);
276     String  [] summaries = bean.getSummary(details, query);
277 
278     for (int i = 0; i < hits.getLength(); i++) {
279       System.out.println(" "+i+" "+ details[i]);// + "\n" + summaries[i]);
280     }
281   }
282 
283 
284 
285 }
286
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags