KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > db > WebDBWriter


1 /* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.db;
5
6 import java.io.*;
7 import java.util.*;
8 import java.util.logging.*;
9
10 import net.nutch.io.*;
11 import net.nutch.fs.*;
12 import net.nutch.util.*;
13 import net.nutch.pagedb.*;
14 import net.nutch.linkdb.*;
15
16 /***************************************************
17  * This is a wrapper class that allows us to reorder
18  * write operations to the linkdb and pagedb. It is
19  * useful only for objects like UpdateDatabaseTool,
20  * which just does writes.
21  *
22  * The WebDBWriter is a traditional single-pass database writer.
23  * It does not cache any instructions to disk (but it does
24  * in memory, with possible resorting). It certainly does
25  * nothing in a distributed fashion.
26  *
27  * There are other implementors of IWebDBWriter that do
28  * all that fancy stuff.
29  *
30  * @author Mike Cafarella
31  *************************************************/

32 public class WebDBWriter implements IWebDBWriter {
33     static final Logger LOG = LogFormatter.getLogger("net.nutch.db.WebDBWriter");
34     static final byte CUR_VERSION = 0;
35
36     // db opcodes
37
static final byte ADD_PAGE = 0;
38     static final byte ADD_PAGE_WITH_SCORE = 1;
39     static final byte ADD_PAGE_IFN_PRESENT = 2;
40     static final byte DEL_PAGE = 3;
41     static final int ADD_LINK = 0;
42     static final int DEL_LINK = 1;
43     static final int DEL_SINGLE_LINK = 2;
44
45     // filenames
46
static final String JavaDoc PAGES_BY_URL = "pagesByURL";
47     static final String JavaDoc PAGES_BY_MD5 = "pagesByMD5";
48     static final String JavaDoc LINKS_BY_URL = "linksByURL";
49     static final String JavaDoc LINKS_BY_MD5 = "linksByMD5";
50     static final String JavaDoc STATS_FILE = "stats";
51
52     // Result codes for page-url comparisons
53
static final int NO_OUTLINKS = 0;
54     static final int HAS_OUTLINKS = 1;
55     static final int LINK_INVALID = 2;
56
57     /********************************************
58      * PageInstruction holds an operation over a Page.
59      *********************************************/

60     public static class PageInstruction implements WritableComparable {
61         byte opcode;
62         boolean hasLink;
63         Page page;
64         Link link;
65
66         /**
67          */

68         public PageInstruction() {}
69
70         /**
71          */

72         public PageInstruction(Page page, int opcode) {
73             set(page, opcode);
74         }
75
76         /**
77          */

78         public PageInstruction(Page page, Link link, int opcode) {
79             set(page, link, opcode);
80         }
81
82         /**
83          * Init from another PageInstruction object.
84          */

85         public void set(PageInstruction that) {
86             this.opcode = that.opcode;
87
88             if (this.page == null) {
89                 this.page = new Page();
90             }
91             this.page.set(that.page);
92
93             if (this.link == null) {
94                 this.link = new Link();
95             }
96             this.hasLink = that.hasLink;
97             if (this.hasLink) {
98                 this.link.set(that.link);
99             }
100         }
101
102         /**
103          * Init PageInstruction with no Link
104          */

105         public void set(Page page, int opcode) {
106             this.opcode = (byte) opcode;
107             this.page = page;
108             this.hasLink = false;
109             this.link = null;
110         }
111
112         /**
113          * Init PageInstruction with a Link
114          */

115         public void set(Page page, Link link, int opcode) {
116             this.opcode = (byte) opcode;
117             this.page = page;
118             this.hasLink = true;
119             this.link = link;
120         }
121
122         //
123
// WritableComparable
124
//
125
public int compareTo(Object JavaDoc o) {
126             int pageResult = this.page.compareTo(((PageInstruction) o).page);
127             if (pageResult != 0) {
128                 return pageResult;
129             } else {
130                 return this.opcode - (((PageInstruction) o).opcode);
131             }
132         }
133         public void write(DataOutput out) throws IOException {
134             out.writeByte(opcode);
135             page.write(out);
136             out.writeByte(hasLink ? 1 : 0);
137             if (hasLink) {
138                 link.write(out);
139             }
140         }
141         public void readFields(DataInput in) throws IOException {
142             opcode = in.readByte();
143             if (page == null) {
144                 page = new Page();
145             }
146             page.readFields(in);
147             
148             if (link == null) {
149                 link = new Link();
150             }
151             hasLink = (1 == in.readByte());
152             if (hasLink) {
153                 link.readFields(in);
154             }
155         }
156         public Page getPage() {
157             return page;
158         }
159         public Link getLink() {
160             if (hasLink) {
161                 return link;
162             } else {
163                 return null;
164             }
165         }
166         public int getInstruction() {
167             return opcode;
168         }
169
170         /**
171          * Sorts the instruction first by Page, then by opcode.
172          */

173         public static class PageComparator extends WritableComparator {
174             private static final Page.Comparator PAGE_COMPARATOR =
175             new Page.Comparator();
176
177             public PageComparator() { super(PageInstruction.class); }
178
179             /** Optimized comparator. */
180             public int compare(byte[] b1, int s1, int l1,
181                                byte[] b2, int s2, int l2) {
182                 int opcode1 = b1[s1];
183                 int opcode2 = b2[s2];
184                 int c = PAGE_COMPARATOR.compare(b1, s1+1, l1-1, b2, s2+1, l2-1);
185                 if (c != 0)
186                     return c;
187                 return opcode1 - opcode2;
188             }
189         }
190  
191         /*****************************************************
192          * Sorts the instruction first by url, then by opcode.
193          *****************************************************/

194         public static class UrlComparator extends WritableComparator {
195             private static final Page.UrlComparator PAGE_COMPARATOR =
196             new Page.UrlComparator();
197
198             public UrlComparator() { super(PageInstruction.class); }
199
200             /**
201              * We need to sort by ordered URLs. First, we sort by
202              * URL, then by opcode.
203              */

204             public int compare(WritableComparable a, WritableComparable b) {
205                 PageInstruction instructionA = (PageInstruction)a;
206                 PageInstruction instructionB = (PageInstruction)b;
207                 Page pageA = instructionA.getPage();
208                 Page pageB = instructionB.getPage();
209
210                 int result = pageA.getURL().compareTo(pageB.getURL());
211                 if (result != 0) {
212                     return result;
213                 } else {
214                     return instructionA.opcode - instructionB.opcode;
215                 }
216             }
217
218             /**
219              * Optimized comparator.
220              */

221             public int compare(byte[] b1, int s1, int l1,
222                                byte[] b2, int s2, int l2) {
223                 int opcode1 = b1[s1];
224                 int opcode2 = b2[s2];
225                 int c = PAGE_COMPARATOR.compare(b1, s1+1, l1-1, b2, s2+1, l2-1);
226                 if (c != 0)
227                     return c;
228                 return opcode1 - opcode2;
229             }
230         }
231     }
232
233     /********************************************************
234      * PageInstructionWriter very efficiently writes a
235      * PageInstruction to a SequenceFile.Writer. Much better
236      * than calling "writer.append(new PageInstruction())"
237      ********************************************************/

238     public static class PageInstructionWriter {
239         PageInstruction pi = new PageInstruction();
240
241         /**
242          */

243         public PageInstructionWriter() {
244         }
245
246         /**
247          * Append the PageInstruction info to the indicated SequenceFile,
248          * and keep the PI for later reuse.
249          */

250         public synchronized void appendInstructionInfo(SequenceFile.Writer writer, Page page, int opcode, Writable val) throws IOException {
251             pi.set(page, opcode);
252             writer.append(pi, val);
253         }
254
255         /**
256          * Append the PageInstruction info to the indicated SequenceFile,
257          * and keep the PI for later reuse.
258          */

259         public synchronized void appendInstructionInfo(SequenceFile.Writer writer, Page page, Link link, int opcode, Writable val) throws IOException {
260             pi.set(page, link, opcode);
261             writer.append(pi, val);
262         }
263     }
264
265     /*************************************************************
266      * Reduce multiple instructions for a given url to the single effective
267      * instruction. ADD is prioritized highest, then ADD_IFN_PRESENT, and then
268      * DEL. Not coincidentally, this is opposite the order they're sorted in.
269      **************************************************************/

270     private static class DeduplicatingPageSequenceReader {
271         SequenceFile.Reader edits;
272         PageInstruction current = new PageInstruction();
273         UTF8 currentUrl = new UTF8();
274         boolean haveCurrent;
275
276         /**
277          */

278         public DeduplicatingPageSequenceReader(SequenceFile.Reader edits) throws IOException {
279             this.edits = edits;
280             this.haveCurrent = edits.next(current, NullWritable.get());
281         }
282
283         /**
284          */

285         public boolean next(PageInstruction result) throws IOException {
286             if (!haveCurrent) {
287                 return false;
288             }
289         
290             currentUrl.set(current.getPage().getURL());
291             result.set(current); // take the first instruction
292

293             do {
294                 // skip the rest
295
} while ((haveCurrent = edits.next(current, NullWritable.get())) &&
296                      currentUrl.compareTo(current.getPage().getURL()) == 0);
297             return true;
298         }
299     }
300
301
302     /*************************************************
303      * Holds an instruction over a Link.
304      *************************************************/

305     public static class LinkInstruction implements WritableComparable {
306         Link link;
307         int instruction;
308
309         /**
310          */

311         public LinkInstruction() {
312         }
313
314         /**
315          */

316         public LinkInstruction(Link link, int instruction) {
317             set(link, instruction);
318         }
319
320         /**
321          * Re-init from another LinkInstruction's info.
322          */

323         public void set(LinkInstruction that) {
324             this.instruction = that.instruction;
325           
326             if (this.link == null)
327                 this.link = new Link();
328
329             this.link.set(that.link);
330         }
331
332         /**
333          * Re-init with a Link and an instruction
334          */

335         public void set(Link link, int instruction) {
336             this.link = link;
337             this.instruction = instruction;
338         }
339
340         //
341
// WritableComparable
342
//
343
public int compareTo(Object JavaDoc o) {
344             return this.link.compareTo(((LinkInstruction) o).link);
345         }
346         public void write(DataOutput out) throws IOException {
347             out.writeByte(instruction);
348             link.write(out);
349         }
350         public void readFields(DataInput in) throws IOException {
351             this.instruction = in.readByte();
352             if (link == null)
353                 link = new Link();
354             link.readFields(in);
355         }
356         public Link getLink() {
357             return link;
358         }
359         public int getInstruction() {
360             return instruction;
361         }
362
363         /*******************************************************
364          * Sorts the instruction first by Md5, then by opcode.
365          *******************************************************/

366         public static class MD5Comparator extends WritableComparator {
367             private static final Link.MD5Comparator MD5_COMPARATOR =
368             new Link.MD5Comparator();
369
370             public MD5Comparator() { super(LinkInstruction.class); }
371
372             public int compare(WritableComparable a, WritableComparable b) {
373                 LinkInstruction instructionA = (LinkInstruction)a;
374                 LinkInstruction instructionB = (LinkInstruction)b;
375                 return instructionA.link.md5Compare(instructionB.link);
376             }
377
378             /** Optimized comparator. */
379             public int compare(byte[] b1, int s1, int l1,
380                                byte[] b2, int s2, int l2) {
381                 return MD5_COMPARATOR.compare(b1, s1+1, l1-1, b2, s2+1, l2-1);
382             }
383         }
384  
385         /*********************************************************
386          * Sorts the instruction first by url, then by opcode.
387          *********************************************************/

388         public static class UrlComparator extends WritableComparator {
389             private static final Link.UrlComparator URL_COMPARATOR =
390             new Link.UrlComparator();
391
392             public UrlComparator() { super(LinkInstruction.class); }
393
394             public int compare(WritableComparable a, WritableComparable b) {
395                 LinkInstruction instructionA = (LinkInstruction)a;
396                 LinkInstruction instructionB = (LinkInstruction)b;
397                 return instructionA.link.urlCompare(instructionB.link);
398
399             }
400
401             /**
402              * Optimized comparator.
403              */

404             public int compare(byte[] b1, int s1, int l1,
405                                byte[] b2, int s2, int l2) {
406                 return URL_COMPARATOR.compare(b1, s1+1, l1-1, b2, s2+1, l2-1);
407             }
408         }
409     }
410
411     /*******************************************************
412      * LinkInstructionWriter very efficiently writes a
413      * LinkInstruction to a SequenceFile.Writer. Much better
414      * than calling "writer.append(new LinkInstruction())"
415      ********************************************************/

416     public static class LinkInstructionWriter {
417         LinkInstruction li = new LinkInstruction();
418
419         /**
420          */

421         public LinkInstructionWriter() {
422         }
423
424         /**
425          * Append the LinkInstruction info to the indicated SequenceFile
426          * and keep the LI for later reuse.
427          */

428         public synchronized void appendInstructionInfo(SequenceFile.Writer writer, Link link, int opcode, Writable val) throws IOException {
429             li.set(link, opcode);
430             writer.append(li, val);
431         }
432     }
433
434     /********************************************************
435      * This class deduplicates link operations. We want to
436      * sort by MD5, then by URL. But all operations
437      * should be unique.
438      *********************************************************/

439     class DeduplicatingLinkSequenceReader {
440         Link currentKey = new Link();
441         LinkInstruction current = new LinkInstruction();
442         SequenceFile.Reader edits;
443         boolean haveCurrent;
444
445         /**
446          */

447         public DeduplicatingLinkSequenceReader(SequenceFile.Reader edits) throws IOException {
448             this.edits = edits;
449             this.haveCurrent = edits.next(current, NullWritable.get());
450         }
451
452
453         /**
454          * The incoming stream of edits is sorted first by MD5, then by URL.
455          * MD5-only values always come before MD5+URL.
456          */

457         public boolean next(LinkInstruction key) throws IOException {
458             if (! haveCurrent) {
459                 return false;
460             }
461
462             currentKey.set(current.getLink());
463             
464             do {
465                 key.set(current);
466             } while ((haveCurrent = edits.next(current, NullWritable.get())) &&
467                      currentKey.compareTo(current.getLink()) == 0);
468             return true;
469         }
470     }
471
472
473     /**************************************************
474      * The CloseProcessor class is used when we close down
475      * the webdb. We give it the path, members, and class values
476      * needed to apply changes to any of our 4 data tables.
477      *
478      * This is an abstract class. Each subclass must define
479      * the exact merge procedure. However, file-handling
480      * and edit-processing is standardized as much as possible.
481      *
482      **************************************************/

483     private abstract class CloseProcessor {
484         String JavaDoc basename;
485         MapFile.Reader oldDb;
486         SequenceFile.Writer editWriter;
487         SequenceFile.Sorter sorter;
488         WritableComparator comparator;
489         Class JavaDoc keyClass, valueClass;
490         long itemsWritten = 0;
491
492         /**
493          * Store away these members for later use.
494          */

495         CloseProcessor(String JavaDoc basename, MapFile.Reader oldDb, SequenceFile.Writer editWriter, SequenceFile.Sorter sorter, WritableComparator comparator, Class JavaDoc keyClass, Class JavaDoc valueClass) {
496             this.basename = basename;
497             this.oldDb = oldDb;
498             this.editWriter = editWriter;
499             this.sorter = sorter;
500             this.comparator = comparator;
501             this.keyClass = keyClass;
502             this.valueClass = valueClass;
503         }
504
505         /**
506          * Perform the shutdown sequence for this Processor.
507          * There is a lot of file-moving and edit-sorting that
508          * is common across all the 4 tables.
509          *
510          * Returns how many items were written out by this close().
511          */

512         long closeDown(File workingDir, File outputDir, long numEdits) throws IOException {
513             File editsFile = new File(workingDir, basename + ".out");
514             File newDbFile = new File(outputDir, basename);
515             File sortedEditsFile = new File(editsFile.getPath() + ".sorted");
516             editWriter.close();
517
518             // If there are edits, then process them.
519
if (numEdits != 0) {
520                 // Sort the edits
521
long startSort = System.currentTimeMillis();
522                 sorter.sort(editsFile.getPath(), sortedEditsFile.getPath());
523                 // sorter.close();
524
long endSort = System.currentTimeMillis();
525                 LOG.info("Processing " + basename + ": Sorted " + numEdits + " instructions in " + ((endSort - startSort) / 1000.0) + " seconds.");
526                 LOG.info("Processing " + basename + ": Sorted " + (numEdits / ((endSort - startSort) / 1000.0)) + " instructions/second");
527             
528                 // Rename appropriately
529
fs.delete(editsFile);
530                 fs.rename(sortedEditsFile, editsFile);
531
532                 // Read the sorted edits
533
SequenceFile.Reader sortedEdits = new SequenceFile.Reader(fs, editsFile.getPath());
534
535                 // Create a brand-new output db for the integrated data
536
MapFile.Writer newDb = (comparator == null) ? new MapFile.Writer(fs, newDbFile.getPath(), keyClass, valueClass) : new MapFile.Writer(fs, newDbFile.getPath(), comparator, valueClass);
537
538                 // Iterate through the edits, and merge changes with existing
539
// db into the brand-new file
540
oldDb.reset();
541             
542                 // Merge the edits. We did it!
543
long startMerge = System.currentTimeMillis();
544                 mergeEdits(oldDb, sortedEdits, newDb);
545                 long endMerge = System.currentTimeMillis();
546                 LOG.info("Processing " + basename + ": Merged to new DB containing " + itemsWritten + " records in " + ((endMerge - startMerge) / 1000.0) + " seconds");
547                 LOG.info("Processing " + basename + ": Merged " + (itemsWritten / ((endMerge - startMerge) / 1000.0)) + " records/second");
548
549                 // Close down readers, writers
550
sortedEdits.close();
551                 newDb.close();
552             } else {
553                 // Otherwise, simply copy the file into place,
554
// without all the processing overhead.
555
long startCopy = System.currentTimeMillis();
556                 File curFile = new File(dbFile, basename);
557                 FileUtil.recursiveCopy(fs, curFile, newDbFile);
558                 long endCopy = System.currentTimeMillis();
559
560                 LOG.info("Processing " + basename + ": Copied file (" + newDbFile.length()+ " bytes) in " + ((endCopy - startCopy) / 1000.0) + " secs.");
561             }
562
563             // Delete the now-consumed edits file to save space
564
fs.delete(editsFile);
565
566             return itemsWritten;
567         }
568
569         /**
570          * The loop that actually applies the changes and writes to
571          * a new db. This is different for every subclass!
572          */

573         abstract void mergeEdits(MapFile.Reader db, SequenceFile.Reader edits, MapFile.Writer newDb) throws IOException;
574     }
575
576     /***
577      * The PagesByURLProcessor is used during close() time for
578      * the pagesByURL table. We instantiate one of these, and it
579      * takes care of the entire shutdown process.
580      */

581     private class PagesByURLProcessor extends CloseProcessor {
582         SequenceFile.Writer futureEdits;
583
584         /**
585          * We store "futureEdits" so we can write edits for the
586          * next table-db step
587          */

588         PagesByURLProcessor(MapFile.Reader db, SequenceFile.Writer editWriter, SequenceFile.Writer futureEdits) {
589             super(PAGES_BY_URL, db, editWriter, new SequenceFile.Sorter(fs, new PageInstruction.UrlComparator(), NullWritable.class), new UTF8.Comparator(), null, Page.class);
590             this.futureEdits = futureEdits;
591         }
592
593         /**
594          * Merge the existing db with the edit-stream into a brand-new file.
595          */

596         void mergeEdits(MapFile.Reader db, SequenceFile.Reader sortedEdits, MapFile.Writer newDb) throws IOException {
597             // Create the keys and vals we'll be using
598
DeduplicatingPageSequenceReader edits = new DeduplicatingPageSequenceReader(sortedEdits);
599             WritableComparable readerKey = new UTF8();
600             Page readerVal = new Page();
601             PageInstruction editItem = new PageInstruction();
602             int futureOrdering = 0;
603
604             // Read the first items from both streams
605
boolean hasEntries = db.next(readerKey, readerVal);
606             boolean hasEdits = edits.next(editItem);
607
608             // As long as we have both edits and entries, we need to
609
// interleave them....
610
while (hasEntries && hasEdits) {
611                 int comparison = readerKey.compareTo(editItem.getPage().getURL());
612                 int curInstruction = editItem.getInstruction();
613
614                 // Perform operations
615
if ((curInstruction == ADD_PAGE) ||
616                     (curInstruction == ADD_PAGE_WITH_SCORE) ||
617                     (curInstruction == ADD_PAGE_IFN_PRESENT)) {
618
619                     if (comparison < 0) {
620                         // Write readerKey, just passing it along.
621
// Don't process the edit yet.
622
newDb.append(readerKey, readerVal);
623                         itemsWritten++;
624                         hasEntries = db.next(readerKey, readerVal);
625                     } else if (comparison == 0) {
626                         // The keys are equal. If the instruction
627
// is ADD_PAGE, we write the edit's key and
628
// replace the old one.
629
//
630
// Otherwise, if it's ADD_IFN_PRESENT,
631
// keep the reader's item intact.
632
//
633
if ((curInstruction == ADD_PAGE) ||
634                             (curInstruction == ADD_PAGE_WITH_SCORE)) {
635                             // An ADD_PAGE with an identical pair
636
// of pages replaces the existing one.
637
// We may need to note the fact for
638
// Garbage Collection.
639
//
640
// This happens in three stages.
641
// 1. We write necessary items to the future
642
// edits-list.
643
//
644
pagesByMD5Edits++;
645
646                             // If this is a replacing add, we don't want
647
// to disturb the score from the old Page! This,
648
// way, we can run some link analysis scoring
649
// while the new Pages are being fetched and
650
// not lose the info when a Page is replaced.
651
//
652
// If it is an ADD_PAGE_WITH_SCORE, then we
653
// go ahead and replace the old one.
654
//
655
// Either way, from now on we treat it
656
// as an ADD_PAGE
657
//
658
Page editItemPage = editItem.getPage();
659
660                             if (curInstruction == ADD_PAGE) {
661                                 editItemPage.setScore(readerVal.getScore(), readerVal.getNextScore());
662                             }
663
664                             piwriter.appendInstructionInfo(futureEdits, editItemPage, ADD_PAGE, NullWritable.get());
665
666                             //
667
// 2. We write the edit-page to *this* table.
668
//
669
newDb.append(editItemPage.getURL(), editItemPage);
670
671                             //
672
// 3. We want the ADD in the next step (the
673
// MD5-driven table) to be a "replacing add".
674
// But that won't happen if the readerItem and
675
// the editItem Pages are not identical.
676
// (In this scenario, that means their URLs
677
// are the same, but their MD5s are different.)
678
// So, we need to explicitly handle that
679
// case by issuing a DELETE for the now-obsolete
680
// item.
681
if (editItemPage.compareTo(readerVal) != 0) {
682                                 pagesByMD5Edits++;
683                                 piwriter.appendInstructionInfo(futureEdits, readerVal, DEL_PAGE, NullWritable.get());
684                             }
685
686                             itemsWritten++;
687
688                             // "Delete" the readerVal by skipping it.
689
hasEntries = db.next(readerKey, readerVal);
690                         } else {
691                             // ADD_PAGE_IFN_PRESENT. We only add IF_NOT
692
// present. And it was present! So, we treat
693
// this case like we treat a no-op.
694
// Just move to the next edit.
695
}
696                         // In either case, we process the edit.
697
hasEdits = edits.next(editItem);
698
699                     } else if (comparison > 0) {
700                         // We have inserted a Page that's before some
701
// entry in the existing database. So, we just
702
// need to write down the Page from the Edit file.
703
// It's like the above case, except we don't tell
704
// the future-edits to delete anything.
705
//
706
// 1. Write the item down for the future.
707
pagesByMD5Edits++;
708
709                         //
710
// If this is an ADD_PAGE_IFN_PRESENT, then
711
// we may also have a Link we have to take care of!
712
//
713
if (curInstruction == ADD_PAGE_IFN_PRESENT) {
714                             Link editLink = editItem.getLink();
715                             if (editLink != null) {
716                                 addLink(editLink);
717                             }
718                         }
719                         piwriter.appendInstructionInfo(futureEdits, editItem.getPage(), ADD_PAGE, NullWritable.get());
720
721                         //
722
// 2. Write the edit-page to *this* table
723
newDb.append(editItem.getPage().getURL(), editItem.getPage());
724                         itemsWritten++;
725
726                         // Process the edit
727
hasEdits = edits.next(editItem);
728                     }
729                 } else if (curInstruction == DEL_PAGE) {
730                     if (comparison < 0) {
731                         // Write the readerKey, just passing it along.
732
// We don't process the edit yet.
733
newDb.append(readerKey, readerVal);
734                         itemsWritten++;
735                         hasEntries = db.next(readerKey, readerVal);
736                     } else if (comparison == 0) {
737                         // Delete it! We can only delete one item
738
// at a time, as all URLs are unique.
739
// 1. Tell the future-edits what page will need to
740
// be deleted.
741
pagesByMD5Edits++;
742                         piwriter.appendInstructionInfo(futureEdits, readerVal, DEL_PAGE, NullWritable.get());
743
744                         //
745
// 2. "Delete" the entry by skipping the Reader
746
// key.
747
hasEntries = db.next(readerKey, readerVal);
748
749                         // Process the edit
750
hasEdits = edits.next(editItem);
751                     } else if (comparison > 0) {
752                         // Ignore it. We tried to delete an item that's
753
// not here.
754
hasEdits = edits.next(editItem);
755                     }
756                 }
757             }
758
759             // Now we have only edits. No more preexisting items!
760
while (! hasEntries && hasEdits) {
761                 int curInstruction = editItem.getInstruction();
762                 if (curInstruction == ADD_PAGE ||
763                     curInstruction == ADD_PAGE_WITH_SCORE ||
764                     curInstruction == ADD_PAGE_IFN_PRESENT) {
765                     // No more reader entries, so ADD_PAGE_IFN_PRESENT
766
// is treated like a simple ADD_PAGE.
767

768                     // 1. Tell the future edits-list about this new item
769
pagesByMD5Edits++;
770                     
771                     //
772
// If this is an ADD_PAGE_IFN_PRESENT, then
773
// we may also have a Link we have to take care of!
774
//
775
if (curInstruction == ADD_PAGE_IFN_PRESENT) {
776                         Link editLink = editItem.getLink();
777                         if (editLink != null) {
778                             addLink(editLink);
779                         }
780                     }
781                     piwriter.appendInstructionInfo(futureEdits, editItem.getPage(), ADD_PAGE, NullWritable.get());
782
783                     // 2. Write the edit page to this table.
784
newDb.append(editItem.getPage().getURL(), editItem.getPage());
785                     itemsWritten++;
786                 } else if (curInstruction == DEL_PAGE) {
787                     // Ignore it. We tried to delete an item
788
// that's not here.
789
}
790
791                 // Either way, we always process the edit.
792
hasEdits = edits.next(editItem);
793             }
794
795             // Now we have only preexisting items. We just copy
796
// them to the new file, in order.
797
while (hasEntries && ! hasEdits) {
798                 newDb.append(readerKey, readerVal);
799                 itemsWritten++;
800                 hasEntries = db.next(readerKey, readerVal);
801             }
802         }
803     }
804
805     /***
806      * The PagesByMD5Processor is used during close() time for
807      * the pagesByMD5 table. We instantiate one of these, and it
808      * takes care of the entire shutdown process.
809      */

810     private class PagesByMD5Processor extends CloseProcessor {
811         /**
812          */

813         PagesByMD5Processor(MapFile.Reader db, SequenceFile.Writer editWriter) {
814             super(PAGES_BY_MD5, db, editWriter, new SequenceFile.Sorter(fs, new PageInstruction.PageComparator(),