package org.apache.lucene.index;

import java.io.IOException;
import java.io.PrintStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.Hashtable;
import java.util.Enumeration;
import java.util.Arrays;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.search.Similarity;

/**
 * Writes a single {@link Document} into a fresh single-document segment:
 * field infos (.fnm), stored fields, the inverted postings (.frq/.prx plus
 * term dictionary), optional term vectors, and per-field norms (.fN).
 *
 * <p>Not thread-safe: each instance buffers one document at a time in
 * {@link #postingTable} and the per-field arrays below.
 */
final class DocumentWriter {
  private Analyzer analyzer;
  private Directory directory;
  private Similarity similarity;
  private FieldInfos fieldInfos;
  private int maxFieldLength;
  private int termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
  private PrintStream infoStream;

  /**
   * Creates a writer with explicit settings.
   *
   * @param directory      target directory for the new segment's files
   * @param analyzer       analyzer used to tokenize indexed fields
   * @param similarity     supplies {@code lengthNorm} for norm encoding
   * @param maxFieldLength tokens per field beyond this count are discarded
   */
  DocumentWriter(Directory directory, Analyzer analyzer,
                 Similarity similarity, int maxFieldLength) {
    this.directory = directory;
    this.analyzer = analyzer;
    this.similarity = similarity;
    this.maxFieldLength = maxFieldLength;
  }

  /** Creates a writer that inherits its settings from an {@link IndexWriter}. */
  DocumentWriter(Directory directory, Analyzer analyzer, IndexWriter writer) {
    this.directory = directory;
    this.analyzer = analyzer;
    this.similarity = writer.getSimilarity();
    this.maxFieldLength = writer.getMaxFieldLength();
    this.termIndexInterval = writer.getTermIndexInterval();
  }

  /**
   * Adds a single document as the entire contents of segment {@code segment}.
   *
   * @throws IOException if any of the segment files cannot be written
   */
  final void addDocument(String segment, Document doc) throws IOException {
    // Write field names.
    fieldInfos = new FieldInfos();
    fieldInfos.add(doc);
    fieldInfos.write(directory, segment + ".fnm");

    // Write field values.
    FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);
    try {
      fieldsWriter.addDocument(doc);
    } finally {
      fieldsWriter.close();
    }

    // Invert the document: reset the per-field accumulators first.
    postingTable.clear();
    fieldLengths = new int[fieldInfos.size()];
    fieldPositions = new int[fieldInfos.size()];
    fieldOffsets = new int[fieldInfos.size()];
    fieldBoosts = new float[fieldInfos.size()];
    Arrays.fill(fieldBoosts, doc.getBoost());

    invertDocument(doc);

    // Sort postingTable into an array ordered by term.
    Posting[] postings = sortPostingTable();

    // Write postings (and term vectors, where requested).
    writePostings(postings, segment);

    // Write norms of indexed fields.
    writeNorms(segment);
  }

  // Keys are Terms, values are Postings; buffers one document's inversion.
  private final Hashtable postingTable = new Hashtable();
  private int[] fieldLengths;    // token count per field number
  private int[] fieldPositions;  // next token position per field number
  private int[] fieldOffsets;    // next character offset per field number
  private float[] fieldBoosts;   // accumulated boost per field number

  /**
   * Tokenizes each indexed field of {@code doc} and records every term
   * occurrence into {@link #postingTable} via {@link #addPosition}.
   * Un-tokenized fields contribute their whole string value as one term.
   */
  private final void invertDocument(Document doc) throws IOException {
    Enumeration fields = doc.fields();
    while (fields.hasMoreElements()) {
      Field field = (Field) fields.nextElement();
      String fieldName = field.name();
      int fieldNumber = fieldInfos.fieldNumber(fieldName);

      int length = fieldLengths[fieldNumber];      // length of field so far
      int position = fieldPositions[fieldNumber];  // position in field
      if (length > 0)
        position += analyzer.getPositionIncrementGap(fieldName);
      int offset = fieldOffsets[fieldNumber];      // character offset in field

      if (field.isIndexed()) {
        if (!field.isTokenized()) {
          // Un-tokenized field: the entire value is a single term.
          String stringValue = field.stringValue();
          if (field.isStoreOffsetWithTermVector())
            addPosition(fieldName, stringValue, position++,
                new TermVectorOffsetInfo(offset, offset + stringValue.length()));
          else
            addPosition(fieldName, stringValue, position++, null);
          offset += stringValue.length();
          length++;
        } else {
          // Find or make a Reader for the field value.
          Reader reader;
          if (field.readerValue() != null)
            reader = field.readerValue();
          else if (field.stringValue() != null)
            reader = new StringReader(field.stringValue());
          else
            throw new IllegalArgumentException
              ("field must have either String or Reader value");

          // Tokenize the field and add each term to the posting table.
          TokenStream stream = analyzer.tokenStream(fieldName, reader);
          try {
            Token lastToken = null;
            for (Token t = stream.next(); t != null; t = stream.next()) {
              position += (t.getPositionIncrement() - 1);

              if (field.isStoreOffsetWithTermVector())
                addPosition(fieldName, t.termText(), position++,
                    new TermVectorOffsetInfo(offset + t.startOffset(),
                                             offset + t.endOffset()));
              else
                addPosition(fieldName, t.termText(), position++, null);

              lastToken = t;
              if (++length > maxFieldLength) {
                if (infoStream != null)
                  infoStream.println("maxFieldLength " + maxFieldLength
                      + " reached, ignoring following tokens");
                break;
              }
            }

            // +1 keeps a gap between values of a multi-valued field.
            if (lastToken != null)
              offset += lastToken.endOffset() + 1;
          } finally {
            stream.close();
          }
        }

        fieldLengths[fieldNumber] = length;
        fieldPositions[fieldNumber] = position;
        fieldBoosts[fieldNumber] *= field.getBoost();
        fieldOffsets[fieldNumber] = offset;
      }
    }
  }

  // Reused lookup key so each addPosition call avoids allocating a Term.
  private final Term termBuffer = new Term("", "");

  /**
   * Records one occurrence of {@code text} in {@code field} at
   * {@code position}, growing the posting's positions/offsets arrays
   * (doubling) as needed. {@code offset} may be null when offsets are not
   * stored with the term vector.
   */
  private final void addPosition(String field, String text, int position,
                                 TermVectorOffsetInfo offset) {
    termBuffer.set(field, text);
    Posting ti = (Posting) postingTable.get(termBuffer);
    if (ti != null) {  // word seen before
      int freq = ti.freq;
      if (ti.positions.length == freq) {  // positions array is full: double it
        int[] newPositions = new int[freq * 2];
        System.arraycopy(ti.positions, 0, newPositions, 0, freq);
        ti.positions = newPositions;
      }
      ti.positions[freq] = position;

      if (offset != null) {
        if (ti.offsets.length == freq) {  // offsets array is full: double it
          TermVectorOffsetInfo[] newOffsets = new TermVectorOffsetInfo[freq * 2];
          System.arraycopy(ti.offsets, 0, newOffsets, 0, freq);
          ti.offsets = newOffsets;
        }
        ti.offsets[freq] = offset;
      }
      ti.freq = freq + 1;  // update frequency
    } else {  // word not seen before
      Term term = new Term(field, text, false);
      postingTable.put(term, new Posting(term, position, offset));
    }
  }

  /** Copies {@link #postingTable} into an array sorted by term. */
  private final Posting[] sortPostingTable() {
    Posting[] array = new Posting[postingTable.size()];
    Enumeration postings = postingTable.elements();
    for (int i = 0; postings.hasMoreElements(); i++)
      array[i] = (Posting) postings.nextElement();

    quickSort(array, 0, array.length - 1);

    return array;
  }

  /** In-place quicksort of {@code postings[lo..hi]} by term, median-of-three pivot. */
  private static final void quickSort(Posting[] postings, int lo, int hi) {
    if (lo >= hi)
      return;

    int mid = (lo + hi) >>> 1;  // overflow-safe midpoint

    // Order lo, mid, hi so the median ends up at mid.
    if (postings[lo].term.compareTo(postings[mid].term) > 0) {
      Posting tmp = postings[lo];
      postings[lo] = postings[mid];
      postings[mid] = tmp;
    }

    if (postings[mid].term.compareTo(postings[hi].term) > 0) {
      Posting tmp = postings[mid];
      postings[mid] = postings[hi];
      postings[hi] = tmp;

      if (postings[lo].term.compareTo(postings[mid].term) > 0) {
        Posting tmp2 = postings[lo];
        postings[lo] = postings[mid];
        postings[mid] = tmp2;
      }
    }

    int left = lo + 1;
    int right = hi - 1;

    if (left >= right)
      return;  // fewer than three interior elements: already ordered above

    Term partition = postings[mid].term;

    for (; ;) {
      while (postings[right].term.compareTo(partition) > 0)
        --right;

      while (left < right && postings[left].term.compareTo(partition) <= 0)
        ++left;

      if (left < right) {
        Posting tmp = postings[left];
        postings[left] = postings[right];
        postings[right] = tmp;
        --right;
      } else {
        break;
      }
    }

    quickSort(postings, lo, left);
    quickSort(postings, left + 1, hi);
  }

  /**
   * Writes the sorted postings to the segment's .frq/.prx files and term
   * dictionary, emitting term vectors for fields that requested them.
   * All outputs are closed in the finally block; the first close failure
   * is rethrown after every stream has been given a chance to close.
   */
  private final void writePostings(Posting[] postings, String segment)
      throws IOException {
    IndexOutput freq = null, prox = null;
    TermInfosWriter tis = null;
    TermVectorsWriter termVectorWriter = null;
    try {
      // Open files for postings.
      freq = directory.createOutput(segment + ".frq");
      prox = directory.createOutput(segment + ".prx");
      tis = new TermInfosWriter(directory, segment, fieldInfos,
                                termIndexInterval);
      TermInfo ti = new TermInfo();
      String currentField = null;

      for (int i = 0; i < postings.length; i++) {
        Posting posting = postings[i];

        // Add an entry to the dictionary with pointers to prox and freq files.
        ti.set(1, freq.getFilePointer(), prox.getFilePointer(), -1);
        tis.add(posting.term, ti);

        // Add an entry to the freq file: freq == 1 is encoded as a single
        // VInt 1; otherwise a 0 marker followed by the frequency.
        int postingFreq = posting.freq;
        if (postingFreq == 1)
          freq.writeVInt(1);
        else {
          freq.writeVInt(0);
          freq.writeVInt(postingFreq);
        }

        // Positions are delta-encoded against the previous position.
        int lastPosition = 0;
        int[] positions = posting.positions;
        for (int j = 0; j < postingFreq; j++) {
          int position = positions[j];
          prox.writeVInt(position - lastPosition);
          lastPosition = position;
        }

        // Check to see if we switched to a new field.
        String termField = posting.term.field();
        // Compare by value; the original relied on field-name interning
        // and used ==, which breaks for equal but non-identical strings.
        if (!termField.equals(currentField)) {
          currentField = termField;
          FieldInfo fi = fieldInfos.fieldInfo(currentField);
          if (fi.storeTermVector) {
            if (termVectorWriter == null) {
              termVectorWriter =
                new TermVectorsWriter(directory, segment, fieldInfos);
              termVectorWriter.openDocument();
            }
            termVectorWriter.openField(currentField);
          } else if (termVectorWriter != null) {
            termVectorWriter.closeField();
          }
        }
        if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
          termVectorWriter.addTerm(posting.term.text(), postingFreq,
                                   posting.positions, posting.offsets);
        }
      }
      if (termVectorWriter != null)
        termVectorWriter.closeDocument();
    } finally {
      // Make an effort to close all streams we can, but remember and
      // rethrow the first exception encountered in this process.
      IOException keep = null;
      if (freq != null) try { freq.close(); } catch (IOException e) { if (keep == null) keep = e; }
      if (prox != null) try { prox.close(); } catch (IOException e) { if (keep == null) keep = e; }
      if (tis != null) try { tis.close(); } catch (IOException e) { if (keep == null) keep = e; }
      if (termVectorWriter != null) try { termVectorWriter.close(); } catch (IOException e) { if (keep == null) keep = e; }
      // Rethrow as-is: fillInStackTrace() here would destroy the stack
      // trace of the original failure site.
      if (keep != null) throw keep;
    }
  }

  /**
   * Writes one encoded norm byte per indexed, non-omitting field to the
   * segment's .fN files, combining accumulated boost with the length norm.
   */
  private final void writeNorms(String segment) throws IOException {
    for (int n = 0; n < fieldInfos.size(); n++) {
      FieldInfo fi = fieldInfos.fieldInfo(n);
      if (fi.isIndexed && !fi.omitNorms) {
        float norm = fieldBoosts[n] * similarity.lengthNorm(fi.name, fieldLengths[n]);
        IndexOutput norms = directory.createOutput(segment + ".f" + n);
        try {
          norms.writeByte(Similarity.encodeNorm(norm));
        } finally {
          norms.close();
        }
      }
    }
  }

  /** If non-null, information about maxFieldLength truncation is printed here. */
  void setInfoStream(PrintStream infoStream) {
    this.infoStream = infoStream;
  }
}

/** Info about a Term in a doc: its frequency, positions, and optional offsets. */
final class Posting {
  Term term;                      // the Term
  int freq;                       // its frequency in doc
  int[] positions;                // positions it occurs at
  TermVectorOffsetInfo[] offsets; // null unless offsets are stored with the term vector

  Posting(Term t, int position, TermVectorOffsetInfo offset) {
    term = t;
    freq = 1;
    positions = new int[1];
    positions[0] = position;
    if (offset != null) {
      offsets = new TermVectorOffsetInfo[1];
      offsets[0] = offset;
    } else
      offsets = null;
  }
}