1 2 3 package net.nutch.analysis.lang; 4 import net.nutch.parse.Parse; 5 import net.nutch.parse.HtmlParseFilter; 6 import net.nutch.parse.ParseException; 7 import net.nutch.protocol.Content; 8 import org.w3c.dom.*; 9 10 import java.util.logging.Logger ; 11 import net.nutch.util.LogFormatter; 12 13 16 public class HTMLLanguageParser implements HtmlParseFilter { 17 public static final String META_LANG_NAME="X-meta-lang"; 18 public static final Logger LOG = LogFormatter 19 .getLogger(HTMLLanguageParser.class.getName()); 20 21 28 public Parse filter(Content content, Parse parse, DocumentFragment doc) 29 throws ParseException { 30 String lang = findLanguage(doc); 31 32 if (lang != null) { 33 parse.getData().getMetadata().put(META_LANG_NAME, lang); 34 } 35 36 return parse; 37 } 38 39 private String findLanguage(Node node) { 40 String lang = null; 41 42 if (node.getNodeType() == Node.ELEMENT_NODE) { 43 44 lang = ((Element) node).getAttribute("lang"); 46 if (lang != null && lang.length()>1) { 47 return lang; 48 } 49 if ("meta".equalsIgnoreCase(node.getNodeName())) { 50 51 NamedNodeMap attrs=node.getAttributes(); 52 53 for(int i=0;i<attrs.getLength();i++){ 55 Node attrnode=attrs.item(i); 56 if("name".equalsIgnoreCase(attrnode.getNodeName())){ 57 if("dc.language".equalsIgnoreCase(attrnode.getNodeValue())){ 58 Node valueattr=attrs.getNamedItem("content"); 59 lang = (valueattr!=null)?valueattr.getNodeValue():null; 60 } 61 } 62 } 63 64 for(int i=0;i<attrs.getLength();i++){ 66 Node attrnode=attrs.item(i); 67 if("http-equiv".equalsIgnoreCase(attrnode.getNodeName())){ 68 if("content-language".equals(attrnode.getNodeValue().toLowerCase())){ 69 Node valueattr=attrs.getNamedItem("content"); 70 lang = (valueattr!=null)?valueattr.getNodeValue():null; 71 } 72 } 73 } 74 } 75 } 76 77 NodeList children = node.getChildNodes(); 79 for (int i = 0; children != null && i < children.getLength(); i++) { 80 lang = findLanguage(children.item(i)); 81 if(lang != null && lang.length()>1) return lang; 82 } 83 84 return lang; 85 } 86 } 87 | Popular Tags |