CCIndexingFilter


1   /* Copyright (c) 2004 The Nutch Organization.  All rights reserved.   */
2   /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3   
4   package org.creativecommons.nutch;
5   
6   import org.apache.lucene.document.Document;
7   import org.apache.lucene.document.Field;
8   
9   import net.nutch.parse.Parse;
10  
11  import net.nutch.indexer.IndexingFilter;
12  import net.nutch.indexer.IndexingException;
13  
14  import net.nutch.fetcher.FetcherOutput;
15  import net.nutch.pagedb.FetchListEntry;
16  
17  import java.util.logging.Logger  ;
18  import net.nutch.util.LogFormatter;
19  
20  import java.util.*;
21  import java.net.URL  ;
22  import java.net.MalformedURLException  ;
23  
24  /** Adds basic searchable fields to a document. */
25  public class CCIndexingFilter implements IndexingFilter {
26    public static final Logger   LOG
27      = LogFormatter.getLogger(CCIndexingFilter.class.getName());
28  
29    /** The name of the document field we use. */
30    public static String   FIELD = "cc";
31  
32    public Document filter(Document doc, Parse parse, FetcherOutput fo)
33      throws IndexingException {
34      
35      // index the license
36      String   licenseUrl = parse.getData().get("License-Url");
37      if (licenseUrl != null) {
38        LOG.info("CC: indexing "+licenseUrl+" for: "+fo.getUrl());
39  
40        // add the entire license as cc:license=xxx
41        addFeature(doc, "license="+licenseUrl);
42  
43        // index license attributes extracted of the license url
44        addUrlFeatures(doc, licenseUrl);
45      }
46  
47      // index the license location as cc:meta=xxx
48      String   licenseLocation = parse.getData().get("License-Location");
49      if (licenseLocation != null) {
50        addFeature(doc, "meta="+licenseLocation);
51      }
52  
53      // index the work type cc:type=xxx
54      String   workType = parse.getData().get("Work-Type");
55      if (workType != null) {
56        addFeature(doc, workType);
57      }
58  
59      return doc;
60    }
61  
62    /** Add the features represented by a license URL.  Urls are of the form
63     * "http://creativecommons.org/licenses/xx-xx/xx/xx", where "xx" names a
64     * license feature. */
65    public void addUrlFeatures(Document doc, String   urlString) {
66      try {
67        URL   url = new URL  (urlString);
68  
69        // tokenize the path of the url, breaking at slashes and dashes
70        StringTokenizer names = new StringTokenizer(url.getPath(), "/-");
71  
72        if (names.hasMoreTokens())
73          names.nextToken();                        // throw away "licenses"
74  
75        // add a feature per component after "licenses"
76        while (names.hasMoreTokens()) {
77          String   feature = names.nextToken();
78          addFeature(doc, feature);
79        }
80      } catch (MalformedURLException   e) {
81        LOG.warning("CC: failed to parse url: "+urlString+" : "+e);
82      }
83    }
84    
85    private void addFeature(Document doc, String   feature) {
86      doc.add(Field.Keyword(FIELD, feature));
87    }
88  
89  }
90
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Free Books Free Magazines
Popular Tags