1 2 3 4 package org.creativecommons.nutch; 5 6 import net.nutch.parse.*; 7 import net.nutch.protocol.Content; 8 import net.nutch.util.NutchConf; 9 10 import java.util.*; 11 import java.io.*; 12 import java.net.*; 13 import javax.xml.parsers.*; 14 import org.xml.sax.InputSource ; 15 import org.w3c.dom.*; 16 17 import java.util.logging.Logger ; 18 import net.nutch.util.LogFormatter; 19 20 21 public class CCParseFilter implements HtmlParseFilter { 22 public static final Logger LOG 23 = LogFormatter.getLogger(CCParseFilter.class.getName()); 24 25 private static final boolean EXCLUDE_UNLICENSED = 26 NutchConf.getBoolean("creativecommons.exclude.unlicensed", false); 27 28 29 public static class Walker { 30 private URL base; private String rdfLicense; private URL relLicense; private URL anchorLicense; private String workType; 36 private Walker(URL base) { 37 this.base = base; 38 } 39 40 41 public static void walk(Node doc, URL base, Properties metadata) 42 throws ParseException { 43 44 Walker walker = new Walker(base); 46 walker.walk(doc); 47 48 String licenseUrl = null; 50 String licenseLocation = null; 51 if (walker.rdfLicense != null) { licenseLocation = "rdf"; 53 licenseUrl = walker.rdfLicense; 54 } else if (walker.relLicense != null) { licenseLocation = "rel"; 56 licenseUrl = walker.relLicense.toString(); 57 } else if (walker.anchorLicense != null) { licenseLocation = "a"; 59 licenseUrl = walker.anchorLicense.toString(); 60 } else if (EXCLUDE_UNLICENSED) { 61 throw new ParseException("No CC license. Excluding."); 62 } 63 64 if (licenseUrl != null) { 66 LOG.info("CC: found "+licenseUrl+" in "+licenseLocation+" of "+base); 67 metadata.put("License-Url", licenseUrl); 68 metadata.put("License-Location", licenseLocation); 69 } 70 71 if (walker.workType != null) { 72 LOG.info("CC: found "+walker.workType+" in "+base); 73 metadata.put("Work-Type", walker.workType); 74 } 75 76 } 77 78 79 private void walk(Node node) { 80 81 if (node instanceof Element) { 83 findLicenseUrl((Element)node); 84 } 85 86 if (node instanceof Comment) { 88 findRdf(((Comment)node).getData()); 89 } 90 91 NodeList children = node.getChildNodes(); 93 for (int i = 0; children != null && i < children.getLength(); i++ ) { 94 walk(children.item(i)); 95 } 96 } 97 98 101 private void findLicenseUrl(Element element) { 102 if (!"a".equalsIgnoreCase(element.getTagName())) 104 return; 105 106 String href = element.getAttribute("href"); 108 if (href == null) 109 return; 110 111 try { 112 URL url = new URL(base, href); 114 if ("http".equalsIgnoreCase(url.getProtocol()) && 116 "creativecommons.org".equalsIgnoreCase(url.getHost()) && 117 url.getPath() != null && 118 url.getPath().startsWith("/licenses/") && 119 url.getPath().length() > "/licenses/".length()) { 120 121 String rel = element.getAttribute("rel"); 123 if (rel != null && "license".equals(rel) && this.relLicense == null) { 124 this.relLicense = url; } else if (this.anchorLicense == null) { 126 this.anchorLicense = url; } 128 } 129 } catch (MalformedURLException e) { } 131 } 132 133 134 private static final DocumentBuilderFactory FACTORY 135 = DocumentBuilderFactory.newInstance(); 136 static { 137 FACTORY.setNamespaceAware(true); 138 } 139 140 141 private static final String CC_NS = "http://web.resource.org/cc/"; 142 143 144 private static final String DC_NS = "http://purl.org/dc/elements/1.1/"; 145 146 147 private static final String RDF_NS 148 = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"; 149 150 private void findRdf(String comment) { 151 int rdfPosition = comment.indexOf("RDF"); 153 if (rdfPosition < 0) 154 return; int nsPosition = comment.indexOf(CC_NS); 156 if (nsPosition < 0) 157 return; 159 Document doc; 161 try { 162 DocumentBuilder parser = FACTORY.newDocumentBuilder(); 163 doc = parser.parse(new InputSource (new StringReader(comment))); 164 } catch (Exception e) { 165 LOG.warning("CC: Failed to parse RDF in "+base+": "+e); 166 return; 168 } 169 170 NodeList roots = doc.getElementsByTagNameNS(RDF_NS, "RDF"); 172 if (roots.getLength() != 1) { 173 LOG.warning("CC: No RDF root in "+base); 174 return; 175 } 176 Element rdf = (Element)roots.item(0); 177 178 NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, "License"); 180 for (int i = 0; i < licenses.getLength(); i++) { 181 182 Element l = (Element)licenses.item(i); 183 184 this.rdfLicense = l.getAttributeNodeNS(RDF_NS,"about").getValue(); 186 187 NodeList predicates = l.getChildNodes(); 189 for (int j = 0; j < predicates.getLength(); j++) { 190 Node predicateNode = predicates.item(j); 191 if (!(predicateNode instanceof Element)) 192 continue; 193 Element predicateElement = (Element)predicateNode; 194 195 if (!CC_NS.equals(predicateElement.getNamespaceURI())) { 197 continue; 198 } 199 String predicate = predicateElement.getLocalName(); 200 201 String object = 203 predicateElement.getAttributeNodeNS(RDF_NS, "resource").getValue(); 204 205 } 209 } 210 211 NodeList works = rdf.getElementsByTagNameNS(CC_NS, "Work"); 213 for (int i = 0; i < works.getLength(); i++) { 214 Element l = (Element)works.item(i); 215 216 NodeList types = rdf.getElementsByTagNameNS(DC_NS, "type"); 218 for (int j = 0; j < types.getLength(); j++) { 219 Element type = (Element)types.item(j); 220 String workUri = 221 type.getAttributeNodeNS(RDF_NS, "resource").getValue(); 222 this.workType = (String )WORK_TYPE_NAMES.get(workUri); 223 break; 224 } 225 } 226 } 227 } 228 229 private static final HashMap WORK_TYPE_NAMES = new HashMap(); 230 static { 231 WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/MovingImage", "video"); 232 WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", "image"); 233 WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", "audio"); 234 WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text"); 235 WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive", "interactive"); 236 WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", "software"); 237 WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image"); 238 } 239 240 242 public Parse filter(Content content, Parse parse, DocumentFragment doc) 243 throws ParseException { 244 245 URL base; 247 try { 248 base = new URL(content.getBaseUrl()); 249 } catch (MalformedURLException e) { 250 throw new ParseException(e); 251 } 252 253 Walker.walk(doc, base, parse.getData().getMetadata()); 255 256 return parse; 257 } 258 259 } 260
| Popular Tags
|