KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > creativecommons > nutch > CCIndexingFilter


1 /* Copyright (c) 2004 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package org.creativecommons.nutch;
5
6 import org.apache.lucene.document.Document;
7 import org.apache.lucene.document.Field;
8
9 import net.nutch.parse.Parse;
10
11 import net.nutch.indexer.IndexingFilter;
12 import net.nutch.indexer.IndexingException;
13
14 import net.nutch.fetcher.FetcherOutput;
15 import net.nutch.pagedb.FetchListEntry;
16
17 import java.util.logging.Logger JavaDoc;
18 import net.nutch.util.LogFormatter;
19
20 import java.util.*;
21 import java.net.URL JavaDoc;
22 import java.net.MalformedURLException JavaDoc;
23
24 /** Adds basic searchable fields to a document. */
25 public class CCIndexingFilter implements IndexingFilter {
26   public static final Logger JavaDoc LOG
27     = LogFormatter.getLogger(CCIndexingFilter.class.getName());
28
29   /** The name of the document field we use. */
30   public static String JavaDoc FIELD = "cc";
31
32   public Document filter(Document doc, Parse parse, FetcherOutput fo)
33     throws IndexingException {
34     
35     // index the license
36
String JavaDoc licenseUrl = parse.getData().get("License-Url");
37     if (licenseUrl != null) {
38       LOG.info("CC: indexing "+licenseUrl+" for: "+fo.getUrl());
39
40       // add the entire license as cc:license=xxx
41
addFeature(doc, "license="+licenseUrl);
42
43       // index license attributes extracted of the license url
44
addUrlFeatures(doc, licenseUrl);
45     }
46
47     // index the license location as cc:meta=xxx
48
String JavaDoc licenseLocation = parse.getData().get("License-Location");
49     if (licenseLocation != null) {
50       addFeature(doc, "meta="+licenseLocation);
51     }
52
53     // index the work type cc:type=xxx
54
String JavaDoc workType = parse.getData().get("Work-Type");
55     if (workType != null) {
56       addFeature(doc, workType);
57     }
58
59     return doc;
60   }
61
62   /** Add the features represented by a license URL. Urls are of the form
63    * "http://creativecommons.org/licenses/xx-xx/xx/xx", where "xx" names a
64    * license feature. */

65   public void addUrlFeatures(Document doc, String JavaDoc urlString) {
66     try {
67       URL JavaDoc url = new URL JavaDoc(urlString);
68
69       // tokenize the path of the url, breaking at slashes and dashes
70
StringTokenizer names = new StringTokenizer(url.getPath(), "/-");
71
72       if (names.hasMoreTokens())
73         names.nextToken(); // throw away "licenses"
74

75       // add a feature per component after "licenses"
76
while (names.hasMoreTokens()) {
77         String JavaDoc feature = names.nextToken();
78         addFeature(doc, feature);
79       }
80     } catch (MalformedURLException JavaDoc e) {
81       LOG.warning("CC: failed to parse url: "+urlString+" : "+e);
82     }
83   }
84   
85   private void addFeature(Document doc, String JavaDoc feature) {
86     doc.add(Field.Keyword(FIELD, feature));
87   }
88
89 }
90
Popular Tags