1 2 3 4 package net.nutch.searcher; 5 6 import java.io.*; 7 import java.net.*; 8 import java.util.*; 9 import java.util.logging.Logger ; 10 import javax.servlet.ServletContext ; 11 12 import net.nutch.fs.*; 13 import net.nutch.util.*; 14 import net.nutch.parse.*; 15 import net.nutch.indexer.*; 16 17 21 public class NutchBean 22 implements Searcher, HitDetailer, HitSummarizer, HitContent { 23 24 public static final Logger LOG = 25 LogFormatter.getLogger("net.nutch.searcher.NutchBean"); 26 27 static { 28 LogFormatter.setShowThreadIDs(true); 29 } 30 31 private String [] segmentNames; 32 33 private Searcher searcher; 34 private HitDetailer detailer; 35 private HitSummarizer summarizer; 36 private HitContent content; 37 38 private float RAW_HITS_FACTOR = 39 NutchConf.getFloat("searcher.hostgrouping.rawhits.factor", 2.0f); 40 41 43 private static final int MAX_PROHIBITED_TERMS = 20; 44 45 46 public static NutchBean get(ServletContext app) throws IOException { 47 NutchBean bean = (NutchBean)app.getAttribute("nutchBean"); 48 if (bean == null) { 49 LOG.info("creating new bean"); 50 bean = new NutchBean(); 51 app.setAttribute("nutchBean", bean); 52 } 53 return bean; 54 } 55 56 57 public NutchBean() throws IOException { 58 this(new File(NutchConf.get("searcher.dir", "."))); 59 } 60 61 62 public NutchBean(File dir) throws IOException { 63 File servers = new File(dir, "search-servers.txt"); 64 if (servers.exists()) { 65 LOG.info("searching servers in " + servers.getCanonicalPath()); 66 init(new DistributedSearch.Client(servers)); 67 } else { 68 init(new File(dir, "index"), new File(dir, "segments")); 69 } 70 } 71 72 private void init(File indexDir, File segmentsDir) throws IOException { 73 IndexSearcher indexSearcher; 74 if (indexDir.exists()) { 75 LOG.info("opening merged index in " + indexDir.getCanonicalPath()); 76 indexSearcher = new IndexSearcher(indexDir.getCanonicalPath()); 77 } else { 78 LOG.info("opening segment indexes in " + segmentsDir.getCanonicalPath()); 79 80 Vector vDirs=new Vector(); 81 File [] directories = segmentsDir.listFiles(); 82 for(int i = 0; i < segmentsDir.listFiles().length; i++) { 83 File indexdone = new File(directories[i], IndexSegment.DONE_NAME); 84 if(indexdone.exists() && indexdone.isFile()) { 85 vDirs.add(directories[i]); 86 } 87 } 88 89 directories = new File[ vDirs.size() ]; 90 for(int i = 0; vDirs.size()>0; i++) { 91 directories[i]=(File)vDirs.remove(0); 92 } 93 94 indexSearcher = new IndexSearcher(directories); 95 } 96 97 FetchedSegments segments = new FetchedSegments(new LocalFileSystem(), segmentsDir.toString()); 98 99 this.segmentNames = segments.getSegmentNames(); 100 101 this.searcher = indexSearcher; 102 this.detailer = indexSearcher; 103 this.summarizer = segments; 104 this.content = segments; 105 } 106 107 private void init(DistributedSearch.Client client) throws IOException { 108 this.segmentNames = client.getSegmentNames(); 109 this.searcher = client; 110 this.detailer = client; 111 this.summarizer = client; 112 this.content = client; 113 } 114 115 116 public String [] getSegmentNames() { 117 return segmentNames; 118 } 119 120 public Hits search(Query query, int numHits) throws IOException { 121 return searcher.search(query, numHits); 122 } 123 124 private class SiteHits extends ArrayList { 125 private boolean maxSizeExceeded; 126 } 127 128 142 public Hits search(Query query, int numHits, int maxHitsPerSite) 143 throws IOException { 144 if (maxHitsPerSite <= 0) return searcher.search(query, numHits); 146 147 int numHitsRaw = (int)(numHits * RAW_HITS_FACTOR); 148 LOG.info("searching for "+numHitsRaw+" raw hits"); 149 Hits hits = searcher.search(query, numHitsRaw); 150 long total = hits.getTotal(); 151 Map siteToHits = new HashMap(); 152 List resultList = new ArrayList(); 153 Set seen = new HashSet(); 154 List excludedSites = new ArrayList(); 155 boolean totalIsExact = true; 156 for (int rawHitNum = 0; rawHitNum < hits.getTotal(); rawHitNum++) { 157 if (rawHitNum >= hits.getLength()) { 159 Query optQuery = (Query)query.clone(); 161 for (int i = 0; i < excludedSites.size(); i++) { 162 if (i == MAX_PROHIBITED_TERMS) 163 break; 164 optQuery.addProhibitedTerm(((String )excludedSites.get(i)), "site"); 165 } 166 numHitsRaw = (int)(numHitsRaw * RAW_HITS_FACTOR); 167 LOG.info("re-searching for "+numHitsRaw+" raw hits, query: "+optQuery); 168 hits = searcher.search(optQuery, numHitsRaw); 169 LOG.info("found "+hits.getTotal()+" raw hits"); 170 rawHitNum = 0; 171 continue; 172 } 173 174 Hit hit = hits.getHit(rawHitNum); 175 if (seen.contains(hit)) 176 continue; 177 seen.add(hit); 178 179 String site = hit.getSite(); 181 SiteHits siteHits = (SiteHits)siteToHits.get(site); 182 if (siteHits == null) 183 siteToHits.put(site, siteHits = new SiteHits()); 184 185 if (siteHits.size() == maxHitsPerSite) { if (!siteHits.maxSizeExceeded) { 188 189 for (int i = 0; i < siteHits.size(); i++) { 191 ((Hit)siteHits.get(i)).setMoreFromSiteExcluded(true); 192 } 193 siteHits.maxSizeExceeded = true; 194 195 excludedSites.add(site); } 197 totalIsExact = false; 198 } else { resultList.add(hit); 200 siteHits.add(hit); 201 202 if (resultList.size() > numHits) 206 break; 207 } 208 } 209 210 Hits results = 211 new Hits(total, 212 (Hit[])resultList.toArray(new Hit[resultList.size()])); 213 results.setTotalIsExact(totalIsExact); 214 return results; 215 } 216 217 218 public String getExplanation(Query query, Hit hit) throws IOException { 219 return searcher.getExplanation(query, hit); 220 } 221 222 public HitDetails getDetails(Hit hit) throws IOException { 223 return detailer.getDetails(hit); 224 } 225 226 public HitDetails[] getDetails(Hit[] hits) throws IOException { 227 return detailer.getDetails(hits); 228 } 229 230 public String getSummary(HitDetails hit, Query query) throws IOException { 231 return summarizer.getSummary(hit, query); 232 } 233 234 public String [] getSummary(HitDetails[] hits, Query query) 235 throws IOException { 236 return summarizer.getSummary(hits, query); 237 } 238 239 public byte[] getContent(HitDetails hit) throws IOException { 240 return content.getContent(hit); 241 } 242 243 public ParseData getParseData(HitDetails hit) throws IOException { 244 return content.getParseData(hit); 245 } 246 247 public ParseText getParseText(HitDetails hit) throws IOException { 248 return content.getParseText(hit); 249 } 250 251 public String [] getAnchors(HitDetails hit) throws IOException { 252 return content.getAnchors(hit); 253 } 254 255 public long getFetchDate(HitDetails hit) throws IOException { 256 return content.getFetchDate(hit); 257 } 258 259 260 public static void main(String [] args) throws Exception { 261 String usage = "NutchBean query"; 262 263 if (args.length == 0) { 264 System.err.println(usage); 265 System.exit(-1); 266 } 267 268 NutchBean bean = new NutchBean(); 269 Query query = Query.parse(args[0]); 270 271 Hits hits = bean.search(query, 10); 272 System.out.println("Total hits: " + hits.getTotal()); 273 int length = (int)Math.min(hits.getTotal(), 10); 274 Hit[] show = hits.getHits(0, length); 275 HitDetails[] details = bean.getDetails(show); 276 String [] summaries = bean.getSummary(details, query); 277 278 for (int i = 0; i < hits.getLength(); i++) { 279 System.out.println(" "+i+" "+ details[i]); } 281 } 282 283 284 285 } 286 | Popular Tags |