KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > creativecommons > nutch > CCParseFilter


1 /* Copyright (c) 2004 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package org.creativecommons.nutch;
5
6 import net.nutch.parse.*;
7 import net.nutch.protocol.Content;
8 import net.nutch.util.NutchConf;
9
10 import java.util.*;
11 import java.io.*;
12 import java.net.*;
13 import javax.xml.parsers.*;
14 import org.xml.sax.InputSource JavaDoc;
15 import org.w3c.dom.*;
16
17 import java.util.logging.Logger JavaDoc;
18 import net.nutch.util.LogFormatter;
19
20 /** Adds metadata identifying the Creative Commons license used, if any. */
21 public class CCParseFilter implements HtmlParseFilter {
22   public static final Logger JavaDoc LOG
23     = LogFormatter.getLogger(CCParseFilter.class.getName());
24
25   private static final boolean EXCLUDE_UNLICENSED =
26     NutchConf.getBoolean("creativecommons.exclude.unlicensed", false);
27
28   /** Walks DOM tree, looking for RDF in comments and licenses in anchors.*/
29   public static class Walker {
30     private URL base; // base url of page
31
private String JavaDoc rdfLicense; // subject url found, if any
32
private URL relLicense; // license url found, if any
33
private URL anchorLicense; // anchor url found, if any
34
private String JavaDoc workType; // work type URI
35

36     private Walker(URL base) {
37       this.base = base;
38     }
39
40     /** Scan the document adding attributes to metadata.*/
41     public static void walk(Node doc, URL base, Properties metadata)
42       throws ParseException {
43
44       // walk the DOM tree, scanning for license data
45
Walker walker = new Walker(base);
46       walker.walk(doc);
47
48       // interpret results of walk
49
String JavaDoc licenseUrl = null;
50       String JavaDoc licenseLocation = null;
51       if (walker.rdfLicense != null) { // 1st choice: subject in RDF
52
licenseLocation = "rdf";
53         licenseUrl = walker.rdfLicense;
54       } else if (walker.relLicense != null) { // 2nd: anchor w/ rel=license
55
licenseLocation = "rel";
56         licenseUrl = walker.relLicense.toString();
57       } else if (walker.anchorLicense != null) { // 3rd: anchor w/ CC license
58
licenseLocation = "a";
59         licenseUrl = walker.anchorLicense.toString();
60       } else if (EXCLUDE_UNLICENSED) {
61         throw new ParseException("No CC license. Excluding.");
62       }
63
64       // add license to metadata
65
if (licenseUrl != null) {
66         LOG.info("CC: found "+licenseUrl+" in "+licenseLocation+" of "+base);
67         metadata.put("License-Url", licenseUrl);
68         metadata.put("License-Location", licenseLocation);
69       }
70
71       if (walker.workType != null) {
72         LOG.info("CC: found "+walker.workType+" in "+base);
73         metadata.put("Work-Type", walker.workType);
74       }
75
76     }
77
78     /** Scan the document looking for RDF in comments and license elements.*/
79     private void walk(Node node) {
80       
81       // check element nodes for license URL
82
if (node instanceof Element) {
83         findLicenseUrl((Element)node);
84       }
85
86       // check comment nodes for license RDF
87
if (node instanceof Comment) {
88         findRdf(((Comment)node).getData());
89       }
90
91       // recursively walk child nodes
92
NodeList children = node.getChildNodes();
93       for (int i = 0; children != null && i < children.getLength(); i++ ) {
94         walk(children.item(i));
95       }
96     }
97
98     /** Extract license url from element, if any. Thse are the href attribute
99      * of anchor elements with rel="license". These must also point to
100      * http://creativecommons.org/licenses/. */

101     private void findLicenseUrl(Element element) {
102       // only look in Anchor elements
103
if (!"a".equalsIgnoreCase(element.getTagName()))
104         return;
105
106       // require an href
107
String JavaDoc href = element.getAttribute("href");
108       if (href == null)
109         return;
110       
111       try {
112         URL url = new URL(base, href); // resolve the url
113

114         // check that it's a CC license URL
115
if ("http".equalsIgnoreCase(url.getProtocol()) &&
116             "creativecommons.org".equalsIgnoreCase(url.getHost()) &&
117             url.getPath() != null &&
118             url.getPath().startsWith("/licenses/") &&
119             url.getPath().length() > "/licenses/".length()) {
120
121           // check rel="license"
122
String JavaDoc rel = element.getAttribute("rel");
123           if (rel != null && "license".equals(rel) && this.relLicense == null) {
124             this.relLicense = url; // found rel license
125
} else if (this.anchorLicense == null) {
126             this.anchorLicense = url; // found anchor license
127
}
128         }
129       } catch (MalformedURLException e) { // ignore malformed urls
130
}
131     }
132
133    /** Configure a namespace aware XML parser. */
134     private static final DocumentBuilderFactory FACTORY
135       = DocumentBuilderFactory.newInstance();
136     static {
137       FACTORY.setNamespaceAware(true);
138     }
139
140     /** Creative Commons' namespace URI. */
141     private static final String JavaDoc CC_NS = "http://web.resource.org/cc/";
142     
143     /** Dublin Core namespace URI. */
144     private static final String JavaDoc DC_NS = "http://purl.org/dc/elements/1.1/";
145     
146     /** RDF syntax namespace URI. */
147     private static final String JavaDoc RDF_NS
148       = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
149
150     private void findRdf(String JavaDoc comment) {
151       // first check for likely RDF in comment
152
int rdfPosition = comment.indexOf("RDF");
153       if (rdfPosition < 0)
154         return; // no RDF, abort
155
int nsPosition = comment.indexOf(CC_NS);
156       if (nsPosition < 0)
157         return; // no RDF, abort
158

159       // try to parse the XML
160
Document doc;
161       try {
162         DocumentBuilder parser = FACTORY.newDocumentBuilder();
163         doc = parser.parse(new InputSource JavaDoc(new StringReader(comment)));
164       } catch (Exception JavaDoc e) {
165         LOG.warning("CC: Failed to parse RDF in "+base+": "+e);
166         //e.printStackTrace();
167
return;
168       }
169
170       // check that root is rdf:RDF
171
NodeList roots = doc.getElementsByTagNameNS(RDF_NS, "RDF");
172       if (roots.getLength() != 1) {
173         LOG.warning("CC: No RDF root in "+base);
174         return;
175       }
176       Element rdf = (Element)roots.item(0);
177
178       // get cc:License nodes inside rdf:RDF
179
NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, "License");
180       for (int i = 0; i < licenses.getLength(); i++) {
181
182         Element l = (Element)licenses.item(i);
183
184         // license is rdf:about= attribute from cc:License
185
this.rdfLicense = l.getAttributeNodeNS(RDF_NS,"about").getValue();
186
187         // walk predicates of cc:License
188
NodeList predicates = l.getChildNodes();
189         for (int j = 0; j < predicates.getLength(); j++) {
190           Node predicateNode = predicates.item(j);
191           if (!(predicateNode instanceof Element))
192             continue;
193           Element predicateElement = (Element)predicateNode;
194
195           // extract predicates of cc:xxx predicates
196
if (!CC_NS.equals(predicateElement.getNamespaceURI())) {
197             continue;
198           }
199           String JavaDoc predicate = predicateElement.getLocalName();
200
201           // object is rdf:resource from cc:xxx predicates
202
String JavaDoc object =
203             predicateElement.getAttributeNodeNS(RDF_NS, "resource").getValue();
204         
205           // add object and predicate to metadata
206
// metadata.put(object, predicate);
207
// LOG.info("CC: found: "+predicate+"="+object);
208
}
209       }
210
211       // get cc:Work nodes from rdf:RDF
212
NodeList works = rdf.getElementsByTagNameNS(CC_NS, "Work");
213       for (int i = 0; i < works.getLength(); i++) {
214         Element l = (Element)works.item(i);
215         
216         // get dc:type nodes from cc:Work
217
NodeList types = rdf.getElementsByTagNameNS(DC_NS, "type");
218         for (int j = 0; j < types.getLength(); j++) {
219           Element type = (Element)types.item(j);
220           String JavaDoc workUri =
221             type.getAttributeNodeNS(RDF_NS, "resource").getValue();
222           this.workType = (String JavaDoc)WORK_TYPE_NAMES.get(workUri);
223           break;
224         }
225       }
226     }
227   }
228
229   private static final HashMap WORK_TYPE_NAMES = new HashMap();
230   static {
231     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/MovingImage", "video");
232     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", "image");
233     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", "audio");
234     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text");
235     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive", "interactive");
236     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", "software");
237     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image");
238   }
239
240   /** Adds metadata or otherwise modifies a parse of an HTML document, given
241    * the DOM tree of a page. */

242   public Parse filter(Content content, Parse parse, DocumentFragment doc)
243     throws ParseException {
244
245     // construct base url
246
URL base;
247     try {
248       base = new URL(content.getBaseUrl());
249     } catch (MalformedURLException e) {
250       throw new ParseException(e);
251     }
252
253     // extract license metadata
254
Walker.walk(doc, base, parse.getData().getMetadata());
255
256     return parse;
257   }
258
259 }
260
Popular Tags