WebDBInjector


1   /* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
2   /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3   
4   package net.nutch.db;
5   
6   import java.io.*;
7   import java.net.*;
8   import java.util.*;
9   import java.util.logging.*;
10  import java.net.MalformedURLException  ;
11  import java.util.regex.*;
12  
13  import javax.xml.parsers.*;
14  import org.xml.sax.*;
15  import org.xml.sax.helpers.*;
16  import org.apache.xerces.util.XMLChar;
17  
18  import net.nutch.io.*;
19  import net.nutch.fs.*;
20  import net.nutch.net.*;
21  import net.nutch.util.*;
22  import net.nutch.pagedb.*;
23  import net.nutch.linkdb.*;
24  import net.nutch.util.NutchConf;
25  
26  /*********************************************
27   * This class takes a flat file of URLs and adds
28   * them as entries into a pagedb.  Useful for 
29   * bootstrapping the system.
30   *
31   * @author Mike Cafarella
32   * @author Doug Cutting
33   *********************************************/
34  public class WebDBInjector {
35      private static final String   DMOZ_PAGENAME = "http://www.dmoz.org/";
36  
37      private static final byte DEFAULT_INTERVAL =
38        (byte)NutchConf.getInt("db.default.fetch.interval", 30);
39  
40      private static final float NEW_INJECTED_PAGE_SCORE =
41        NutchConf.getFloat("db.score.injected", 2.0f);
42  
43      public static final Logger LOG = LogFormatter.getLogger("net.nutch.db.WebDBInjector");
44  
45      /**
46       * This filter fixes characters that might offend our parser.
47       * This lets us be tolerant of errors that might appear in the input XML.
48       */
49      private static class XMLCharFilter extends FilterReader {
50        private boolean lastBad = false;
51  
52        public XMLCharFilter(Reader reader) {
53          super(reader);
54        }
55  
56        public int read() throws IOException {
57          int c = in.read();
58          int value = c;
59          if (c != -1 && !(XMLChar.isValid(c)))     // fix invalid characters
60            value = 'X';
61          else if (lastBad && c == '<') {           // fix mis-matched brackets
62            in.mark(1);
63            if (in.read() != '/')
64              value = 'X';
65            in.reset();
66          }
67          lastBad = (c == 65533);
68  
69          return value;
70        }
71  
72        public int read(char[] cbuf, int off, int len)
73          throws IOException {
74          int n = in.read(cbuf, off, len);
75          if (n != -1) {
76            for (int i = 0; i < n; i++) {
77              char c = cbuf[off+i];
78              char value = c;
79              if (!(XMLChar.isValid(c)))            // fix invalid characters
80                value = 'X';
81              else if (lastBad && c == '<') {       // fix mis-matched brackets
82                if (i != n-1 && cbuf[off+i+1] != '/')
83                  value = 'X';
84              }
85              lastBad = (c == 65533);
86              cbuf[off+i] = value;
87            }
88          }
89          return n;
90        }
91      }
92  
93  
94      /**
95       * The RDFProcessor receives tag messages during a parse
96       * of RDF XML data.  We build whatever structures we need
97       * from these messages.
98       */
99      class RDFProcessor extends DefaultHandler {
100         String   curURL = null, curSection = null;
101         boolean titlePending = false, descPending = false, insideAdultSection = false;
102         Pattern topicPattern = null; 
103         StringBuffer   title = new StringBuffer  (), desc = new StringBuffer  ();
104         XMLReader reader;
105         int subsetDenom;
106         int hashSkew;
107         boolean includeAdult, includeDmozDesc;
108         MD5Hash srcDmozID;
109         long srcDmozDomainID;
110         Locator location;
111 
112         /**
113          * Pass in an XMLReader, plus a flag as to whether we 
114          * should include adult material.
115          */
116         public RDFProcessor(XMLReader reader, int subsetDenom, boolean includeAdult, boolean includeDmozDesc, int skew, Pattern topicPattern) throws IOException {
117             this.reader = reader;
118             this.subsetDenom = subsetDenom;
119             this.includeAdult = includeAdult;
120             this.includeDmozDesc = includeDmozDesc;
121             this.topicPattern = topicPattern;
122 
123             // We create a Page entry for the "Dmoz" page, from
124             // which all descriptive links originate.  The name
125             // of this page is always the same, stored in 
126             // DMOZ_PAGENAME.  The MD5 is generated over the current
127             // timestamp.  Until this page is deleted, the descriptive
128             // links will always be kept.
129             //
130             // If the DMOZ page is updated with new content, you 
131             // *could* update these links, if you really wanted to.
132             // Just run inject again!  This will replace the old
133             // Dmoz Page, because we always keep the same name.
134             // That obsolete Page will be deleted, and all its 
135             // outlinks (the descriptive ones) garbage-collected.
136             // 
137             // Then we just proceed to add the new descriptive 
138             // links, with the brand-new page's src MD5.
139             //
140             this.srcDmozID = MD5Hash.digest(DMOZ_PAGENAME + "_" + nextFetch);
141             Page dmozPage = new Page(DMOZ_PAGENAME, srcDmozID);
142             dmozPage.setNextFetchTime(Long.MAX_VALUE);
143             dbWriter.addPageIfNotPresent(dmozPage);
144 
145             this.srcDmozDomainID = MD5Hash.digest(new URL(DMOZ_PAGENAME).getHost()).halfDigest();
146 
147             this.hashSkew = skew != 0 ? skew : new Random().nextInt();
148         }
149 
150         //
151         // Interface ContentHandler
152         //
153 
154         /**
155          * Start of an XML elt
156          */
157         public void startElement(String   namespaceURI, String   localName, String   qName, Attributes atts) throws SAXException {
158             if ("Topic".equals(qName)) {
159                 curSection = atts.getValue("r:id");
160             } else if ("ExternalPage".equals(qName)) {
161                 // Porn filter
162                 if ((! includeAdult) && curSection.startsWith("Top/Adult")) {
163                     return;
164                 }
165           
166                 if (topicPattern != null && !topicPattern.matcher(curSection).matches()) {
167                    return;
168                 }
169 
170                 // Subset denominator filter.  
171                 // Only emit with a chance of 1/denominator.
172                 String   url = atts.getValue("about");
173                 int hashValue = MD5Hash.digest(url).hashCode();
174                 hashValue = Math.abs(hashValue ^ hashSkew);
175                 if ((hashValue % subsetDenom) != 0) {
176                     return;
177                 }
178 
179                 // We actually claim the URL!
180                 curURL = url;
181             } else if (curURL != null && "d:Title".equals(qName)) {
182                 titlePending = true;
183             } else if (curURL != null && "d:Description".equals(qName)) {
184                 descPending = true;
185             }
186         }
187 
188         /**
189          * The contents of an XML elt
190          */
191         public void characters(char ch[], int start, int length) {
192             if (titlePending) {
193                 title.append(ch, start, length);
194             } else if (descPending) {
195                 desc.append(ch, start, length);
196             }
197         }
198 
199         /**
200          * Termination of XML elt
201          */
202         public void endElement(String   namespaceURI, String   localName, String   qName) throws SAXException {
203             if (curURL != null) {
204                 if ("ExternalPage".equals(qName)) {
205                     //
206                     // Inc the number of pages, insert the page, and 
207                     // possibly print status.
208                     //
209                     try {
210                       // First, manufacture the Page entry for the
211                       // given DMOZ listing.
212                       if (addPage(curURL)) {
213 
214                         // Second, add a link from the DMOZ page TO the
215                         // just-added target Page.  The anchor text should 
216                         // be the merged Title and Desc that we get from 
217                         // the DMOZ listing.  For testing reasons, the 
218                         // caller may choose to disallow this.
219                         if (includeDmozDesc) {
220                           String   fullDesc = title + " " + desc;
221                           Link descLink = new Link(srcDmozID, srcDmozDomainID, curURL, fullDesc);
222                           dbWriter.addLink(descLink);
223                         }
224                         pages++;
225                       }
226 
227                     } catch (MalformedURLException   e) {
228                         LOG.fine("skipping " + curURL + ":" + e);
229                     } catch (IOException ie) {
230                         LOG.severe("problem adding url " + curURL + ": " + ie);
231                     }
232                     printStatusBar(2000, 50000);
233 
234                     //
235                     // Clear out the link text.  This is what
236                     // you would use for adding to the linkdb.
237                     //
238                     if (title.length() > 0) {
239                         title.delete(0, title.length());
240                     }
241                     if (desc.length() > 0) {
242                         desc.delete(0, desc.length());
243                     }
244 
245                     // Null out the URL.
246                     curURL = null;
247                 } else if ("d:Title".equals(qName)) {
248                     titlePending = false;
249                 } else if ("d:Description".equals(qName)) {
250                     descPending = false;
251                 }
252             }
253         }
254 
255         /**
256          * When parsing begins
257          */
258         public void startDocument() {
259             LOG.info("Begin parse");
260         }
261 
262         /**
263          * When parsing ends
264          */
265         public void endDocument() {
266             LOG.info("Completed parse.  Added " + pages + " pages.");
267         }
268 
269         /**
270          * From time to time the Parser will set the "current location"
271          * by calling this function.  It's useful for emitting locations
272          * for error messages.
273          */
274         public void setDocumentLocator(Locator locator) {
275             location = locator;
276         }
277 
278 
279         //
280         // Interface ErrorHandler
281         //
282 
283         /**
284          * Emit the exception message
285          */
286         public void error(SAXParseException spe) {
287             LOG.severe("Error: " + spe.toString() + ": " + spe.getMessage());
288             spe.printStackTrace(System.out);
289         }
290 
291         /**
292          * Emit the exception message, with line numbers
293          */
294         public void fatalError(SAXParseException spe) {
295             LOG.severe("Fatal error: " + spe.toString() + ": " + spe.getMessage());
296             LOG.severe("Last known line is " + location.getLineNumber() + ", column " + location.getColumnNumber());
297             spe.printStackTrace(System.out);
298         }
299         
300         /**
301          * Emit exception warning message
302          */
303         public void warning(SAXParseException spe) {
304             LOG.warning("Warning: " + spe.toString() + ": " + spe.getMessage());
305             spe.printStackTrace(System.out);
306         }
307     }
308 
309     private IWebDBWriter dbWriter;
310 
311     /**
312      * WebDBInjector takes a reference to a WebDBWriter that it should add to.
313      */
314     public WebDBInjector(IWebDBWriter dbWriter) {
315         this.dbWriter = dbWriter;
316     }
317 
318     /**
319      * Close dbWriter and save changes
320      */
321     public void close() throws IOException {
322         dbWriter.close();
323     }
324 
325     /**
326      * Utility to present small status bar
327      */
328     public void printStatusBar(int small, int big){
329         if ((pages % small ) == 0) {
330             System.out.print(".");
331         }
332         if ((pages % big ) == 0) {
333             printStatus();
334         }
335     }
336 
337     long startTime = System.currentTimeMillis();
338     long pages = 0;
339     long nextFetch = System.currentTimeMillis();
340 
341     /**
342      * Utility to present performance stats
343      */
344     public void printStatus(){
345         long elapsed = (System.currentTimeMillis() - this.startTime); 
346         if ( this.pages == 0) {
347         } else {
348             LOG.info("\t" + this.pages + "\t" + 
349                      (int)((1000 *  pages)/elapsed) + " pages/second\t" );
350         }
351     }
352 
353     /**
354      * Iterate through all the items in this flat text file and
355      * add them to the db.
356      */
357     public void injectURLFile(File urlList) throws IOException {
358         nextFetch = urlList.lastModified();
359         BufferedReader reader = new BufferedReader(new FileReader(urlList));
360         try {
361             String   curStr = null; 
362             LOG.info("Starting URL processing");
363             while ((curStr = reader.readLine()) != null) {
364                 String   url = curStr.trim();
365                 if (addPage(url))
366                   this.pages++;
367                 printStatusBar(2000,50000);
368             }
369             LOG.info("Added " + pages + " pages");
370         } catch (Exception   e) {
371           LOG.severe("error while injecting:" + e);
372           e.printStackTrace();
373         } finally {
374           reader.close();
375         }
376     }
377 
378     /**
379      * Iterate through all the items in this structured DMOZ file.
380      * Add each URL to the web db.
381      */
382     public void injectDmozFile(File dmozFile, int subsetDenom, boolean includeAdult, boolean includeDmozDesc, int skew, Pattern topicPattern) throws IOException, SAXException, ParserConfigurationException {
383         nextFetch = dmozFile.lastModified();
384 
385         SAXParserFactory parserFactory = SAXParserFactory.newInstance();
386         SAXParser parser = parserFactory.newSAXParser();
387         XMLReader reader = parser.getXMLReader();
388 
389         // Create our own processor to receive SAX events
390         RDFProcessor rp =
391           new RDFProcessor(reader, subsetDenom, includeAdult, includeDmozDesc, skew, topicPattern);
392         reader.setContentHandler(rp);
393         reader.setErrorHandler(rp);
394         LOG.info("skew = " + rp.hashSkew);
395 
396         //
397         // Open filtered text stream.  The UTF8Filter makes sure that
398         // only appropriate XML-approved UTF8 characters are received.
399         // Any non-conforming characters are silently skipped.
400         //
401         XMLCharFilter in = new XMLCharFilter(new BufferedReader(new InputStreamReader(new BufferedInputStream(new FileInputStream(dmozFile)), "UTF-8")));
402         try {
403             InputSource is = new InputSource(in);
404             reader.parse(is);
405         } catch (Exception   e) {
406             LOG.severe(e.toString());
407             e.printStackTrace(System.out);
408             System.exit(0);
409         } finally {
410             in.close();
411         }
412     }
413 
414     private boolean addPage(String   url) throws IOException {
415       url = URLFilterFactory.getFilter().filter(url);
416       if (url != null) {
417         try {
418           Page page = new Page(url, NEW_INJECTED_PAGE_SCORE, nextFetch);
419           dbWriter.addPageIfNotPresent(page);
420           return true;
421         } catch (MalformedURLException   e) {
422           LOG.warning("bad url: "+url);
423         }
424       }
425       return false;
426     }
427 
428     private static void addTopicsFromFile(String   topicFile, Vector topics) throws IOException {
429       BufferedReader in = null;
430       try {
431         in = new BufferedReader(new InputStreamReader(new FileInputStream(topicFile), "UTF-8"));
432         String   line = null;
433         while ((line = in.readLine()) != null) {
434           topics.addElement(new String  (line));
435         }
436       } 
437       catch (Exception   e) {
438         LOG.severe(e.toString());
439         e.printStackTrace(System.out);
440         System.exit(0);
441       } finally {
442        in.close();
443       }
444     }
445     
446 
447     /**
448      * Command-line access.  User may add URLs via a flat text file
449      * or the structured DMOZ file.  By default, we ignore Adult
450      * material (as categorized by DMOZ).
451      */
452     public static void main(String   argv[]) throws Exception   {
453       if (argv.length < 3) {
454         System.out.println("Usage: WebDBInjector (-local | -ndfs <namenode:port>) <db_dir> (-urlfile <url_file> | -dmozfile <dmoz_file>) [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-noDmozDesc] [-topicFile <topic list file>] [-topic <topic> [-topic <topic> [...]]]");
455         return;
456       }
457 
458       //
459       // Parse the command line, figure out what kind of
460       // URL file we need to load
461       //
462       int subsetDenom = 1;
463       int skew = 0;
464       String   command = null, loadfile = null;
465       boolean includeAdult = false, includeDmozDesc = true;
466       Pattern topicPattern = null; 
467       Vector topics = new Vector(); 
468 
469       int i = 0;
470       NutchFileSystem nfs = NutchFileSystem.parseArgs(argv, i);
471       try {
472           File root = new File(argv[i++]);
473 
474           for (; i < argv.length; i++) {
475               if ("-urlfile".equals(argv[i]) || 
476                   "-dmozfile".equals(argv[i])) {
477                   command = argv[i];
478                   loadfile = argv[i+1];
479                   i++;
480               } else if ("-includeAdultMaterial".equals(argv[i])) {
481                   includeAdult = true;
482               } else if ("-noDmozDesc".equals(argv[i])) {
483                   includeDmozDesc = false;
484               } else if ("-subset".equals(argv[i])) {
485                   subsetDenom = Integer.parseInt(argv[i+1]);
486                   i++;
487               } else if ("-topic".equals(argv[i])) {
488                   topics.addElement(argv[i+1]); 
489                   i++;
490               } else if ("-topicFile".equals(argv[i])) {
491                   addTopicsFromFile(argv[i+1], topics);
492                   i++;
493               } else if ("-skew".equals(argv[i])) {
494                   skew = Integer.parseInt(argv[i+1]);
495                   i++;
496               }
497           }
498 
499           //
500           // Create the webdbWriter, the injector, and then inject the
501           // right kind of URL file.
502           //
503           IWebDBWriter writer = new WebDBWriter(nfs, root);
504           WebDBInjector injector = new WebDBInjector(writer);
505           try {
506               if ("-urlfile".equals(command)) {
507                   if (!topics.isEmpty()) {
508                       System.out.println("You can't select URLs based on a topic when usin a URL-file");
509                   }
510                   injector.injectURLFile(new File(loadfile));
511               } else if ("-dmozfile".equals(command)) {
512                   if (!topics.isEmpty()) {
513                       String   regExp = new String  ("^("); 
514                       int j = 0;
515                       for ( ; j < topics.size() - 1; ++j) {
516                           regExp = regExp.concat((String  ) topics.get(j));
517                           regExp = regExp.concat("|");
518                       }
519                       regExp = regExp.concat((String  ) topics.get(j));
520                       regExp = regExp.concat(").*"); 
521                       LOG.info("Topic selection pattern = " + regExp);
522                       topicPattern = Pattern.compile(regExp); 
523                   }
524                   injector.injectDmozFile(new File(loadfile), subsetDenom, includeAdult, includeDmozDesc, skew, topicPattern);
525               } else {
526                   System.out.println("No command indicated.");
527                   return;
528               }
529           } finally {
530               injector.close();
531           }
532       } finally {
533           nfs.close();
534       }
535     }
536 }
537
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags