package com.openedit.search;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.searcher.Hit;
import org.apache.nutch.searcher.HitDetails;
import org.apache.nutch.searcher.Hits;
import org.apache.nutch.searcher.NutchBean;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.Summary;
import org.apache.nutch.util.NutchConfiguration;

import com.openedit.OpenEditRuntimeException;
import com.openedit.WebPageRequest;
import com.openedit.modules.BaseModule;
import com.openedit.util.Exec;
import com.openedit.util.FileUtils;

/**
 * Module that builds a Nutch index for the site and runs queries against it.
 *
 * <p>{@link #crawlWebSite(WebPageRequest)} launches the Nutch crawler in a
 * separate {@code java} process, writing into a temporary directory, and then
 * copies the result over the directory returned by {@link #getIndexDir()}.
 * {@link #search(WebPageRequest)} reads the {@code query}, {@code start} and
 * {@code hitsPerPage} request parameters and publishes the matching rows as
 * page values.</p>
 */
public class SearchModule extends BaseModule
{
	private static final Log log = LogFactory.getLog(SearchModule.class);

	/** Page size used when the request does not supply a usable {@code hitsPerPage}. */
	private static final int DEFAULT_HITS_PER_PAGE = 10;

	/**
	 * Entries placed on the crawler's classpath, each resolved against the
	 * {@code WEB-INF} directory under the module root.
	 */
	private static final String[] CRAWL_CLASSPATH_ENTRIES =
	{
		"classes",
		"/lib/nutch-0.8.1.jar",
		"/lib/jakarta-oro-2.0.7.jar",
		"/lib/lucene-core-1.9.1.jar",
		"/lib/lucene-misc-1.9.1.jar",
		"/lib/hadoop-0.4.0-patched.jar",
		"/lib/log4j-1.2.13.jar",
		"/lib/commons-logging-1.0.4.jar",
		"/lib/commons-cli-2.0-SNAPSHOT.jar"
	};

	protected NutchBean fieldNutchBean;
	protected Configuration fieldConfiguration;

	/**
	 * True while a crawl is running. Written inside the synchronized
	 * {@link #crawlWebSite(WebPageRequest)} but read by the unsynchronized
	 * {@link #search(WebPageRequest)}, hence volatile.
	 */
	protected volatile boolean fieldCrawling;

	/**
	 * Runs the Nutch crawler over the URL lists in {@code search/urls} and, if
	 * the crawler process succeeds, replaces the contents of the index
	 * directory with the freshly built index.
	 *
	 * @param inReq the current request (not read by this method)
	 * @return {@code true} if the crawler process reported success and the
	 *         index was replaced, {@code false} if the crawl failed
	 * @throws Exception if preparing, running or installing the crawl fails
	 */
	public synchronized boolean crawlWebSite(WebPageRequest inReq) throws Exception
	{
		fieldCrawling = true;
		File tmp = null;
		try
		{
			// Drop the cached searcher; the index it reads is about to be replaced.
			fieldNutchBean = null;

			// A CVS metadata folder inside the urls directory is removed before
			// the directory is handed to the crawler.
			File cvs = new File(getRoot(), "/search/urls/CVS");
			new FileUtils().deleteAll(cvs);

			// createTempFile is only used to obtain a unique path; the file itself
			// is removed so the path can be passed to the crawler as its -dir.
			tmp = File.createTempFile("nutch", "index");
			tmp.delete();

			List command = new ArrayList();
			command.add("java");
			command.add("-cp");
			command.add(buildCrawlClasspath());
			command.add("org.apache.nutch.crawl.Crawl");
			command.add(new File(getRoot(), "/search/urls/").getAbsolutePath());
			command.add("-dir");
			command.add(tmp.getAbsolutePath());

			Exec exec = new Exec();
			exec.setTrackOutput(true);
			if (exec.runExec(command))
			{
				log.info("Completed ok");
				FileUtils util = new FileUtils();
				File indexDir = new File(getIndexDir());
				util.deleteAll(indexDir);
				util.copyFiles(tmp, indexDir);
				// Make sure the next search opens the new index rather than a
				// searcher that may have been created while the crawl was running.
				fieldNutchBean = null;
				return true;
			}
			log.error("Crawl failed");
			return false;
		}
		finally
		{
			fieldCrawling = false;
			deleteQuietly(tmp);
		}
	}

	/**
	 * Returns the Nutch configuration, creating it on first use with
	 * {@code searcher.dir} pointing at {@link #getIndexDir()}.
	 *
	 * @return the lazily created configuration
	 */
	public Configuration getConfiguration()
	{
		if (fieldConfiguration == null)
		{
			fieldConfiguration = NutchConfiguration.create();
			fieldConfiguration.set("searcher.dir", getIndexDir());
		}
		return fieldConfiguration;
	}

	/**
	 * Returns the cached searcher, creating it from {@link #getConfiguration()}
	 * on first use.
	 *
	 * @return the lazily created searcher
	 * @throws OpenEditRuntimeException wrapping any {@link IOException} raised
	 *         while constructing the searcher
	 */
	protected NutchBean getNutchBean()
	{
		if (fieldNutchBean == null)
		{
			try
			{
				fieldNutchBean = new NutchBean(getConfiguration());
			}
			catch (IOException ex)
			{
				throw new OpenEditRuntimeException(ex);
			}
		}
		return fieldNutchBean;
	}

	/**
	 * Runs the query given by the {@code query} request parameter and stores
	 * the requested page of results as page values: {@code hits},
	 * {@code query}, {@code totalHits}, {@code firstHitIndex} and
	 * {@code lastHitIndex}.
	 *
	 * <p>{@code start} (default 0) and {@code hitsPerPage} (default 10) select
	 * the page. Missing, empty or unparseable values fall back to the default,
	 * a negative {@code start} is treated as 0 and a non-positive
	 * {@code hitsPerPage} as the default. A {@code start} past the available
	 * hits yields an empty page. While a crawl is in progress the method
	 * returns without setting any page values.</p>
	 *
	 * @param inReq the request supplying the parameters and receiving the results
	 * @throws Exception if parsing or executing the query fails
	 */
	public void search(WebPageRequest inReq) throws Exception
	{
		if (fieldCrawling)
		{
			return;
		}

		String queryString = inReq.getRequestParameter("query");
		if (queryString == null)
		{
			queryString = "";
		}

		int start = parseIntParameter(inReq.getRequestParameter("start"), 0);
		if (start < 0)
		{
			start = 0;
		}

		int hitsPerPage = parseIntParameter(inReq.getRequestParameter("hitsPerPage"), DEFAULT_HITS_PER_PAGE);
		if (hitsPerPage < 1)
		{
			hitsPerPage = DEFAULT_HITS_PER_PAGE;
		}

		Query query = Query.parse(queryString, getConfiguration());
		log.info("query: " + queryString);

		NutchBean bean = getNutchBean();
		Hits hits = bean.search(query, start + hitsPerPage);
		int end = (int) Math.min(hits.getLength(), start + hitsPerPage);
		// When the requested start lies beyond the last available hit, clamp it
		// so the slice length is zero instead of negative.
		int first = Math.min(start, end);
		Hit[] show = hits.getHits(first, end - first);

		log.info("total hits: " + hits.getTotal());
		inReq.putPageValue("hits", makeHitRows(show, query));
		inReq.putPageValue("query", queryString);
		inReq.putPageValue("totalHits", new Long(hits.getTotal()));
		inReq.putPageValue("firstHitIndex", new Integer(first));
		inReq.putPageValue("lastHitIndex", new Integer(end));
	}

	/**
	 * Builds one {@link HitRow} per hit, pairing each hit with its details and
	 * its summary text for the given query.
	 *
	 * @param inShow the hits to display
	 * @param inQuery the query the summaries are generated for
	 * @return a {@code List} of {@link HitRow}, in the same order as {@code inShow}
	 * @throws Exception if fetching details or summaries fails
	 */
	private Object makeHitRows(Hit[] inShow, Query inQuery) throws Exception
	{
		HitDetails[] details = getNutchBean().getDetails(inShow);
		Summary[] summaries = getNutchBean().getSummary(details, inQuery);
		List hits = new ArrayList(inShow.length);
		for (int i = 0; i < inShow.length; i++)
		{
			HitRow row = new HitRow();
			row.setHit(inShow[i]);
			row.setDetail(details[i]);
			row.setSummary(summaries[i].toString());
			hits.add(row);
		}
		return hits;
	}

	/**
	 * Returns the absolute path of the {@code search/index} directory under
	 * the module root.
	 *
	 * @return the index directory path
	 */
	protected String getIndexDir()
	{
		return new File(getRoot(), "search/index").getAbsolutePath();
	}

	/**
	 * Joins the absolute paths of {@link #CRAWL_CLASSPATH_ENTRIES}, resolved
	 * against {@code WEB-INF}, with the platform path separator.
	 *
	 * @return the classpath string passed to the crawler process
	 */
	private String buildCrawlClasspath()
	{
		String home = new File(getRoot(), "WEB-INF/").getAbsolutePath();
		StringBuffer cp = new StringBuffer();
		for (int i = 0; i < CRAWL_CLASSPATH_ENTRIES.length; i++)
		{
			if (i > 0)
			{
				cp.append(File.pathSeparatorChar);
			}
			cp.append(new File(home, CRAWL_CLASSPATH_ENTRIES[i]).getAbsolutePath());
		}
		return cp.toString();
	}

	/**
	 * Removes the temporary crawl output, logging instead of throwing so that
	 * a cleanup problem never hides the outcome of the crawl itself.
	 *
	 * @param inDir the directory to remove; may be {@code null}
	 */
	private void deleteQuietly(File inDir)
	{
		if (inDir == null)
		{
			return;
		}
		try
		{
			new FileUtils().deleteAll(inDir);
		}
		catch (Exception ex)
		{
			log.warn("Could not remove temporary crawl directory " + inDir.getAbsolutePath(), ex);
		}
	}

	/**
	 * Parses a request parameter as an int.
	 *
	 * @param inValue the raw parameter value; may be {@code null}
	 * @param inDefault the value returned when {@code inValue} is {@code null},
	 *        empty or not a valid integer
	 * @return the parsed value, or {@code inDefault}
	 */
	private static int parseIntParameter(String inValue, int inDefault)
	{
		if (inValue == null || inValue.trim().length() == 0)
		{
			return inDefault;
		}
		try
		{
			return Integer.parseInt(inValue.trim());
		}
		catch (NumberFormatException ex)
		{
			log.warn("Ignoring non-numeric request parameter value: " + inValue);
			return inDefault;
		}
	}
}