CCParseFilter


1   /* Copyright (c) 2004 The Nutch Organization.  All rights reserved.   */
2   /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3   
4   package org.creativecommons.nutch;
5   
6   import net.nutch.parse.*;
7   import net.nutch.protocol.Content;
8   import net.nutch.util.NutchConf;
9   
10  import java.util.*;
11  import java.io.*;
12  import java.net.*;
13  import javax.xml.parsers.*;
14  import org.xml.sax.InputSource  ;
15  import org.w3c.dom.*;
16  
17  import java.util.logging.Logger  ;
18  import net.nutch.util.LogFormatter;
19  
20  /** Adds metadata identifying the Creative Commons license used, if any. */
21  public class CCParseFilter implements HtmlParseFilter {
22    public static final Logger   LOG
23      = LogFormatter.getLogger(CCParseFilter.class.getName());
24  
25    private static final boolean EXCLUDE_UNLICENSED =
26      NutchConf.getBoolean("creativecommons.exclude.unlicensed", false);
27  
28    /** Walks DOM tree, looking for RDF in comments and licenses in anchors.*/
29    public static class Walker {
30      private URL base;                             // base url of page
31      private String   rdfLicense;                    // subject url found, if any
32      private URL relLicense;                       // license url found, if any
33      private URL anchorLicense;                    // anchor url found, if any
34      private String   workType;                      // work type URI
35  
36      private Walker(URL base) {
37        this.base = base;
38      }
39  
40      /** Scan the document adding attributes to metadata.*/
41      public static void walk(Node doc, URL base, Properties metadata)
42        throws ParseException {
43  
44        // walk the DOM tree, scanning for license data
45        Walker walker = new Walker(base);
46        walker.walk(doc);
47  
48        // interpret results of walk
49        String   licenseUrl = null;
50        String   licenseLocation = null;
51        if (walker.rdfLicense != null) {            // 1st choice: subject in RDF
52          licenseLocation = "rdf";
53          licenseUrl = walker.rdfLicense;
54        } else if (walker.relLicense != null) {     // 2nd: anchor w/ rel=license
55          licenseLocation = "rel";
56          licenseUrl = walker.relLicense.toString();
57        } else if (walker.anchorLicense != null) {  // 3rd: anchor w/ CC license
58          licenseLocation = "a";
59          licenseUrl = walker.anchorLicense.toString();
60        } else if (EXCLUDE_UNLICENSED) {
61          throw new ParseException("No CC license.  Excluding.");
62        }
63  
64        // add license to metadata
65        if (licenseUrl != null) {
66          LOG.info("CC: found "+licenseUrl+" in "+licenseLocation+" of "+base);
67          metadata.put("License-Url", licenseUrl);
68          metadata.put("License-Location", licenseLocation);
69        }
70  
71        if (walker.workType != null) {
72          LOG.info("CC: found "+walker.workType+" in "+base);
73          metadata.put("Work-Type", walker.workType);
74        }
75  
76      }
77  
78      /** Scan the document looking for RDF in comments and license elements.*/
79      private void walk(Node node) {
80        
81        // check element nodes for license URL
82        if (node instanceof Element) {
83          findLicenseUrl((Element)node);
84        }
85  
86        // check comment nodes for license RDF
87        if (node instanceof Comment) {
88          findRdf(((Comment)node).getData());
89        }
90  
91        // recursively walk child nodes
92        NodeList children = node.getChildNodes();
93        for (int i = 0; children != null && i < children.getLength(); i++ ) {
94          walk(children.item(i));
95        }
96      }
97  
98      /** Extract license url from element, if any.  Thse are the href attribute
99       * of anchor elements with rel="license".  These must also point to
100      * http://creativecommons.org/licenses/. */
101     private void findLicenseUrl(Element element) {
102       // only look in Anchor elements
103       if (!"a".equalsIgnoreCase(element.getTagName()))
104         return;
105 
106       // require an href
107       String   href = element.getAttribute("href");
108       if (href == null)
109         return;
110       
111       try {
112         URL url = new URL(base, href);            // resolve the url
113 
114         // check that it's a CC license URL
115         if ("http".equalsIgnoreCase(url.getProtocol()) &&
116             "creativecommons.org".equalsIgnoreCase(url.getHost()) &&
117             url.getPath() != null &&
118             url.getPath().startsWith("/licenses/") &&
119             url.getPath().length() > "/licenses/".length()) {
120 
121           // check rel="license"
122           String   rel = element.getAttribute("rel");
123           if (rel != null && "license".equals(rel) && this.relLicense == null) {
124             this.relLicense = url;                   // found rel license
125           } else if (this.anchorLicense == null) {
126             this.anchorLicense = url;             // found anchor license
127           }
128         }
129       } catch (MalformedURLException e) {         // ignore malformed urls
130       }
131     }
132 
133    /** Configure a namespace aware XML parser. */
134     private static final DocumentBuilderFactory FACTORY
135       = DocumentBuilderFactory.newInstance();
136     static {
137       FACTORY.setNamespaceAware(true);
138     }
139 
140     /** Creative Commons' namespace URI. */
141     private static final String   CC_NS = "http://web.resource.org/cc/";
142     
143     /** Dublin Core namespace URI. */
144     private static final String   DC_NS = "http://purl.org/dc/elements/1.1/";
145     
146     /** RDF syntax namespace URI. */
147     private static final String   RDF_NS
148       = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
149 
150     private void findRdf(String   comment) {
151       // first check for likely RDF in comment
152       int rdfPosition = comment.indexOf("RDF");
153       if (rdfPosition < 0)
154         return;                                   // no RDF, abort
155       int nsPosition = comment.indexOf(CC_NS);
156       if (nsPosition < 0)
157         return;                                   // no RDF, abort
158 
159       // try to parse the XML
160       Document doc;
161       try {
162         DocumentBuilder parser = FACTORY.newDocumentBuilder();
163         doc = parser.parse(new InputSource  (new StringReader(comment)));
164       } catch (Exception   e) {
165         LOG.warning("CC: Failed to parse RDF in "+base+": "+e);
166         //e.printStackTrace();
167         return;
168       }
169 
170       // check that root is rdf:RDF
171       NodeList roots = doc.getElementsByTagNameNS(RDF_NS, "RDF");
172       if (roots.getLength() != 1) {
173         LOG.warning("CC: No RDF root in "+base);
174         return;
175       }
176       Element rdf = (Element)roots.item(0);
177 
178       // get cc:License nodes inside rdf:RDF
179       NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, "License");
180       for (int i = 0; i < licenses.getLength(); i++) {
181 
182         Element l = (Element)licenses.item(i);
183 
184         // license is rdf:about= attribute from cc:License
185         this.rdfLicense = l.getAttributeNodeNS(RDF_NS,"about").getValue();
186 
187         // walk predicates of cc:License
188         NodeList predicates = l.getChildNodes();
189         for (int j = 0; j < predicates.getLength(); j++) {
190           Node predicateNode = predicates.item(j);
191           if (!(predicateNode instanceof Element))
192             continue;
193           Element predicateElement = (Element)predicateNode;
194 
195           // extract predicates of cc:xxx predicates
196           if (!CC_NS.equals(predicateElement.getNamespaceURI())) {
197             continue;
198           }
199           String   predicate = predicateElement.getLocalName();
200 
201           // object is rdf:resource from cc:xxx predicates
202           String   object =
203             predicateElement.getAttributeNodeNS(RDF_NS, "resource").getValue();
204         
205           // add object and predicate to metadata
206           // metadata.put(object, predicate);
207           // LOG.info("CC: found: "+predicate+"="+object);
208         }
209       }
210 
211       // get cc:Work nodes from rdf:RDF
212       NodeList works = rdf.getElementsByTagNameNS(CC_NS, "Work");
213       for (int i = 0; i < works.getLength(); i++) {
214         Element l = (Element)works.item(i);
215         
216         // get dc:type nodes from cc:Work
217         NodeList types = rdf.getElementsByTagNameNS(DC_NS, "type");
218         for (int j = 0; j < types.getLength(); j++) {
219           Element type = (Element)types.item(j);
220           String   workUri = 
221             type.getAttributeNodeNS(RDF_NS, "resource").getValue();
222           this.workType = (String  )WORK_TYPE_NAMES.get(workUri);
223           break;
224         }
225       }
226     }
227   }
228 
229   private static final HashMap WORK_TYPE_NAMES = new HashMap();
230   static {
231     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/MovingImage", "video");
232     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", "image");
233     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", "audio");
234     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text");
235     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive", "interactive");
236     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", "software");
237     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image");
238   }
239 
240   /** Adds metadata or otherwise modifies a parse of an HTML document, given
241    * the DOM tree of a page. */
242   public Parse filter(Content content, Parse parse, DocumentFragment doc)
243     throws ParseException {
244 
245     // construct base url
246     URL base;
247     try {
248       base = new URL(content.getBaseUrl());
249     } catch (MalformedURLException e) {
250       throw new ParseException(e);
251     }
252 
253     // extract license metadata
254     Walker.walk(doc, base, parse.getData().getMetadata());
255 
256     return parse;
257   }
258 
259 }
260
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Free Books Free Magazines
Popular Tags