KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > indexer > IndexSegment


1 /* Copyright (c) 2003-2004 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.indexer;
5
6 import net.nutch.pagedb.*;
7 import net.nutch.linkdb.*;
8 import net.nutch.fetcher.*;
9 import net.nutch.parse.*;
10 import net.nutch.analysis.NutchDocumentAnalyzer;
11 import net.nutch.db.*;
12 import net.nutch.io.*;
13 import net.nutch.fs.*;
14 import net.nutch.segment.SegmentReader;
15 import net.nutch.util.*;
16
17 import org.apache.lucene.index.IndexReader;
18 import org.apache.lucene.index.IndexWriter;
19 import org.apache.lucene.document.Document;
20 import org.apache.lucene.document.Field;
21
22 import java.util.logging.*;
23 import java.util.*;
24 import java.io.*;
25
26 /** Creates an index for the output corresponding to a single fetcher run. */
27 public class IndexSegment {
28   public static final String JavaDoc DONE_NAME = "index.done";
29   public static final Logger LOG =
30     LogFormatter.getLogger("net.nutch.index.IndexSegment");
31   
32   public static int LOG_STEP = 20000;
33
34   private boolean boostByLinkCount =
35     NutchConf.getBoolean("indexer.boost.by.link.count", false);
36
37   private float scorePower = NutchConf.getFloat("indexer.score.power", 0.5f);
38   private int maxFieldLength = NutchConf.getInt("indexer.max.tokens", 10000);
39   private NutchFileSystem nfs;
40   private long maxDocs = Long.MAX_VALUE;
41   private File srcDir;
42   private File localWorkingDir;
43
44   /**
45    * Index a segment in the given NFS.
46    */

47   public IndexSegment(NutchFileSystem nfs, long maxDocs, File srcDir, File localWorkingDir) {
48       this.nfs = nfs;
49       this.maxDocs = maxDocs;
50       this.srcDir = srcDir;
51       this.localWorkingDir = localWorkingDir;
52   }
53
54   /** Determines the power of link analyis scores. Each pages's boost is
55    * set to <i>score<sup>scorePower</sup></i> where <i>score</i> is its link
56    * analysis score and <i>scorePower</i> is the value passed to this method.
57    */

58   public void setScorePower(float power) { scorePower = power; }
59
60   public void indexPages() throws Exception JavaDoc {
61       //
62
// First, see if it's ever been indexed before
63
//
64
File doneFile = new File(srcDir, DONE_NAME);
65       if (nfs.exists(doneFile)) {
66           throw new IOException("already indexed: " + doneFile + " exists");
67       }
68
69       //
70
// OK, fine. Build the writer to the local file, set params
71
//
72
File outputIndex = new File(srcDir, "index");
73       File tmpOutputIndex = new File(localWorkingDir, "index");
74
75       File localOutput = nfs.startLocalOutput(outputIndex, tmpOutputIndex);
76
77       IndexWriter writer
78           = new IndexWriter(localOutput,
79                             new NutchDocumentAnalyzer(), true);
80       writer.mergeFactor = 50;
81       writer.minMergeDocs = 50;
82       writer.maxFieldLength = maxFieldLength;
83       //writer.infoStream = LogFormatter.getLogStream(LOG, Level.INFO);
84
writer.setUseCompoundFile(false);
85       writer.setSimilarity(new NutchSimilarity());
86
87       SegmentReader sr = null;
88
89       long start = System.currentTimeMillis();
90       long delta = start;
91       long curTime, total = 0;
92       long count = 0;
93       try {
94           LOG.info("* Opening segment " + srcDir.getName());
95           sr = new SegmentReader(nfs, srcDir, false, true, true, true);
96
97           total = sr.size;
98           
99           String JavaDoc segmentName = srcDir.getCanonicalFile().getName();
100           FetcherOutput fetcherOutput = new FetcherOutput();
101           ParseText parseText = new ParseText();
102           ParseData parseData = new ParseData();
103           LOG.info("* Indexing segment " + srcDir.getName());
104
105           //
106
// Iterate through all docs in the input
107
//
108
maxDocs = Math.min(sr.size, maxDocs);
109           for (count = 0; count < maxDocs; count++) {
110               if (!sr.get(count, fetcherOutput, null, parseText, parseData)) continue;
111
112               // only index the page if it was fetched correctly
113
if (fetcherOutput.getStatus() != FetcherOutput.SUCCESS) {
114                   continue;
115               }
116
117               // reconstruct parse
118
Parse parse = new ParseImpl(parseText.getText(), parseData);
119
120               // build initial document w/ core fields
121
Document doc = makeDocument(segmentName, count,
122                                           fetcherOutput, parse);
123
124               // run filters to add more fields to the document
125
doc = IndexingFilters.filter(doc, parse, fetcherOutput);
126     
127               // add the document to the index
128
writer.addDocument(doc);
129               if (count > 0 && count % LOG_STEP == 0) {
130                 curTime = System.currentTimeMillis();
131                 LOG.info(" Processed " + count + " records (" +
132                         ((float)LOG_STEP * 1000.0f / (float)(curTime - delta)) +
133                         " rec/s)");
134                 delta = curTime;
135               }
136           }
137       } catch (EOFException e) {
138           LOG.warning("Unexpected EOF in: " + srcDir +
139                       " at entry #" + count + ". Ignoring.");
140       } finally {
141         sr.close();
142       }
143       LOG.info("* Optimizing index...");
144       writer.optimize();
145       writer.close();
146
147       //
148
// Put the local file in its place via NFS
149
//
150
//nfs.completeLocalOutput(new File(outputDir, "index"), new File(srcDir, "index"));
151
LOG.info("* Moving index to NFS if needed...");
152       nfs.completeLocalOutput(outputIndex, tmpOutputIndex);
153
154       //
155
// Emit "done" file
156
//
157
OutputStream out = nfs.create(doneFile);
158       out.close();
159       delta = System.currentTimeMillis() - start;
160       float eps = (float) count / (float) (delta / 1000);
161       LOG.info("DONE indexing segment " + srcDir.getName() + ": total " + total +
162               " records in " + ((float) delta / 1000f) + " s (" + eps + " rec/s).");
163   }
164
165   /**
166    * Add core fields, required by other core components & features (i.e.,
167    * merge, dedup, explain).
168    */

169   private Document makeDocument(String JavaDoc segmentName, long docNo,
170                                 FetcherOutput fo, Parse parse) {
171
172     Document doc = new Document();
173
174     // add docno & segment, used to map from merged index back to segment files
175
doc.add(Field.UnIndexed("docNo", Long.toString(docNo, 16)));
176     doc.add(Field.UnIndexed("segment", segmentName));
177
178     // add digest, used by dedup
179
doc.add(Field.UnIndexed("digest", fo.getMD5Hash().toString()));
180
181     // compute boost
182
// 1. Start with page's score from DB -- 1.0 if no link analysis.
183
float boost = fo.getFetchListEntry().getPage().getScore();
184     // 2. Apply scorePower to this.
185
boost = (float)Math.pow(boost, scorePower);
186     // 3. Optionally boost by log of incoming anchor count.
187
if (boostByLinkCount)
188       boost *= (float)Math.log(Math.E + fo.getAnchors().length);
189     // 4. Apply boost to all indexed fields.
190
doc.setBoost(boost);
191
192     // store boost for use by explain and dedup
193
doc.add(Field.UnIndexed("boost", Float.toString(boost)));
194
195     return doc;
196   }
197
198
199   /**
200    * Create an index for the input files in the named directory.
201    */

202   public static void main(String JavaDoc[] args) throws Exception JavaDoc {
203       String JavaDoc usage = "IndexSegment (-local | -ndfs <namenode:port>) <segment_directory> [-dir <workingdir>]";
204       if (args.length == 0) {
205           System.err.println("Usage: " + usage);
206           return;
207       }
208
209       NutchFileSystem nfs = NutchFileSystem.parseArgs(args, 0);
210       try {
211           int maxDocs = Integer.MAX_VALUE;
212           File srcDir = null;
213           File workingDir = new File(new File("").getCanonicalPath());
214           for (int i = 0; i < args.length; i++) {
215               if (args[i] != null) {
216                   if (args[i].equals("-max")) { // parse -max option
217
i++;
218                       maxDocs = Integer.parseInt(args[i]);
219                   } else if (args[i].equals("-dir")) {
220                       i++;
221                       workingDir = new File(new File(args[i]).getCanonicalPath());
222                   } else {
223                       srcDir = new File(args[i]);
224                   }
225               }
226           }
227
228           workingDir = new File(workingDir, "indexsegment-workingdir");
229           if (workingDir.exists()) {
230               FileUtil.fullyDelete(workingDir);
231           }
232           IndexSegment indexer = new IndexSegment(nfs, maxDocs, srcDir, workingDir);
233           LOG.info("indexing segment: " + srcDir);
234           indexer.indexPages();
235           LOG.info("done indexing");
236           FileUtil.fullyDelete(workingDir);
237       } finally {
238           nfs.close();
239       }
240   }
241 }
242
Popular Tags