KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > analysis > lang > HTMLLanguageParser


1 /* Copyright (c) 2004 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3 package net.nutch.analysis.lang;
4 import net.nutch.parse.Parse;
5 import net.nutch.parse.HtmlParseFilter;
6 import net.nutch.parse.ParseException;
7 import net.nutch.protocol.Content;
8 import org.w3c.dom.*;
9
10 import java.util.logging.Logger JavaDoc;
11 import net.nutch.util.LogFormatter;
12
13 /** Adds metadata identifying language of document if found
14  * We could also run statistical analysis here but we'd miss all other formats
15  */

16 public class HTMLLanguageParser implements HtmlParseFilter {
17   public static final String JavaDoc META_LANG_NAME="X-meta-lang";
18   public static final Logger JavaDoc LOG = LogFormatter
19     .getLogger(HTMLLanguageParser.class.getName());
20
21   /**
22    * Scan the HTML document looking at possible indications of content language<br>
23    * <li>1. html lang attribute (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1)
24    * <li>2. meta dc.language (http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml#language)
25    * <li>3. meta http-equiv (content-language) (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2)
26    * <br>Only the first occurence of language is stored.
27    */

28   public Parse filter(Content content, Parse parse, DocumentFragment doc)
29     throws ParseException {
30     String JavaDoc lang = findLanguage(doc);
31
32     if (lang != null) {
33       parse.getData().getMetadata().put(META_LANG_NAME, lang);
34     }
35                 
36     return parse;
37   }
38         
39   private String JavaDoc findLanguage(Node node) {
40     String JavaDoc lang = null;
41
42     if (node.getNodeType() == Node.ELEMENT_NODE) {
43                         
44       //lang attribute
45
lang = ((Element) node).getAttribute("lang");
46       if (lang != null && lang.length()>1) {
47         return lang;
48       }
49       if ("meta".equalsIgnoreCase(node.getNodeName())) {
50
51         NamedNodeMap attrs=node.getAttributes();
52
53         //dc.language
54
for(int i=0;i<attrs.getLength();i++){
55           Node attrnode=attrs.item(i);
56           if("name".equalsIgnoreCase(attrnode.getNodeName())){
57             if("dc.language".equalsIgnoreCase(attrnode.getNodeValue())){
58               Node valueattr=attrs.getNamedItem("content");
59               lang = (valueattr!=null)?valueattr.getNodeValue():null;
60             }
61           }
62         }
63         
64         //http-equiv content-language
65
for(int i=0;i<attrs.getLength();i++){
66           Node attrnode=attrs.item(i);
67           if("http-equiv".equalsIgnoreCase(attrnode.getNodeName())){
68             if("content-language".equals(attrnode.getNodeValue().toLowerCase())){
69               Node valueattr=attrs.getNamedItem("content");
70               lang = (valueattr!=null)?valueattr.getNodeValue():null;
71             }
72           }
73         }
74       }
75     }
76                 
77     //recurse
78
NodeList children = node.getChildNodes();
79     for (int i = 0; children != null && i < children.getLength(); i++) {
80       lang = findLanguage(children.item(i));
81       if(lang != null && lang.length()>1) return lang;
82     }
83
84     return lang;
85   }
86 }
87
Popular Tags