package org.creativecommons.nutch;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import net.nutch.parse.Parse;

import net.nutch.indexer.IndexingFilter;
import net.nutch.indexer.IndexingException;

import net.nutch.fetcher.FetcherOutput;
import net.nutch.pagedb.FetchListEntry;

import java.util.logging.Logger;
import net.nutch.util.LogFormatter;

import java.util.*;
import java.net.URL;
import java.net.MalformedURLException;

/**
 * Indexing filter that copies license-related parse metadata into the
 * Lucene document as keyword fields, all stored under the field name
 * held in {@link #FIELD}.
 */
public class CCIndexingFilter implements IndexingFilter {

  public static final Logger LOG =
      LogFormatter.getLogger(CCIndexingFilter.class.getName());

  /** Name of the document field that receives every feature value. */
  public static String FIELD = "cc";

  /**
   * Adds one keyword field per license feature found in the parse metadata.
   *
   * <p>Three metadata keys are consulted, in this order:
   * <ul>
   *   <li>{@code License-Url}: indexed as {@code license=<url>}, followed by
   *       the features derived from the URL's path
   *       (see {@link #addUrlFeatures(Document, String)});</li>
   *   <li>{@code License-Location}: indexed as {@code meta=<location>};</li>
   *   <li>{@code Work-Type}: indexed verbatim.</li>
   * </ul>
   * A key whose value is {@code null} contributes nothing.
   *
   * @param doc   document being built; fields are appended to it
   * @param parse parse result whose metadata is read
   * @param fo    fetcher output; only its URL is used, for the log message
   * @return the same {@code doc} instance that was passed in
   * @throws IndexingException declared by the filter interface; nothing in
   *         this method throws it directly
   */
  public Document filter(Document doc, Parse parse, FetcherOutput fo)
      throws IndexingException {

    String license = parse.getData().get("License-Url");
    if (license != null) {
      LOG.info("CC: indexing "+license+" for: "+fo.getUrl());
      // The full URL first, then its individual path components.
      addFeature(doc, "license="+license);
      addUrlFeatures(doc, license);
    }

    String location = parse.getData().get("License-Location");
    if (location != null) {
      addFeature(doc, "meta="+location);
    }

    String type = parse.getData().get("Work-Type");
    if (type != null) {
      addFeature(doc, type);
    }

    return doc;
  }

  /**
   * Splits the path of {@code urlString} on {@code '/'} and {@code '-'} and
   * adds every resulting token except the first as a feature of {@code doc}.
   *
   * <p>If {@code urlString} cannot be parsed as a URL, a warning is logged
   * and the document is left unchanged.
   *
   * @param doc       document that receives the features
   * @param urlString URL whose path components are indexed
   */
  public void addUrlFeatures(Document doc, String urlString) {
    try {
      String path = new URL(urlString).getPath();
      StringTokenizer tokens = new StringTokenizer(path, "/-");

      // The leading path token is consumed without being indexed
      // (presumably a fixed prefix directory -- confirm against real URLs).
      boolean leading = true;
      while (tokens.hasMoreTokens()) {
        String token = tokens.nextToken();
        if (leading) {
          leading = false;
          continue;
        }
        addFeature(doc, token);
      }
    } catch (MalformedURLException e) {
      LOG.warning("CC: failed to parse url: "+urlString+" : "+e);
    }
  }

  /** Appends {@code feature} to {@code doc} as a keyword field named {@link #FIELD}. */
  private void addFeature(Document doc, String feature) {
    Field keyword = Field.Keyword(FIELD, feature);
    doc.add(keyword);
  }

}
| Popular Tags
|