package net.nutch.parse.html;

import java.util.Properties;
import java.util.ArrayList;
import java.util.logging.*;
import java.net.URL;
import java.net.MalformedURLException;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.regex.*;

import org.cyberneko.html.parsers.*;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.w3c.dom.*;
import org.w3c.dom.html.*;
import org.apache.html.dom.*;

import net.nutch.fetcher.FetcherOutput;
import net.nutch.protocol.Content;
import net.nutch.util.*;
import net.nutch.parse.*;
import net.nutch.parse.html.RobotsMetaProcessor.*;

/**
 * HTML {@link Parser} implementation built on the NekoHTML DOM fragment
 * parser.  Responsibilities visible here: detect the character encoding
 * (Content-Type header, then in-page meta tag, then configured default),
 * parse the page into a DOM fragment, honor robots META directives
 * (noindex/nofollow), and extract text, title and outlinks.
 */
public class HtmlParser implements Parser {
  public static final Logger LOG =
    LogFormatter.getLogger("net.nutch.parse.html");

  /** Only the first CHUNK_SIZE bytes are scanned when sniffing the charset. */
  private static final int CHUNK_SIZE = 2000;

  // Matches a <meta ... http-equiv="content-type" ...> tag; group(1) is the
  // tag's attribute text.
  private static Pattern metaPattern =
    Pattern.compile("<meta\\s+([^>]*http-equiv=\"?content-type\"?[^>]*)>",
                    Pattern.CASE_INSENSITIVE);
  // Extracts the charset name from the matched meta tag's attributes.
  private static Pattern charsetPattern =
    Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)",
                    Pattern.CASE_INSENSITIVE);

  /**
   * Scans the first {@link #CHUNK_SIZE} bytes of {@code content} for an
   * HTML {@code <meta http-equiv="Content-Type" ...>} declaration and
   * returns the charset name it advertises.
   *
   * @param content raw page bytes
   * @return the declared charset name, or {@code null} if none was found
   */
  private static String sniffCharacterEncoding(byte[] content) {
    int length = Math.min(content.length, CHUNK_SIZE);

    // Decode the prefix as ISO-8859-1: every byte maps to exactly one char,
    // which is enough to locate the ASCII meta tag regardless of the page's
    // real encoding.  (Replaces the deprecated String(byte[],int,int,int)
    // "hibyte" constructor, which with hibyte==0 did the same thing.)
    String str = new String(content, 0, length, StandardCharsets.ISO_8859_1);

    Matcher metaMatcher = metaPattern.matcher(str);
    String encoding = null;
    if (metaMatcher.find()) {
      Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1));
      if (charsetMatcher.find())
        encoding = charsetMatcher.group(1);   // group(1) is already a String
    }

    return encoding;
  }

  /** Fallback encoding used when neither the header nor a meta tag declares one. */
  private static String defaultCharEncoding =
    NutchConf.get("parser.character.encoding.default", "windows-1252");

  /**
   * Parses an HTML {@link Content} object into a {@link Parse}.
   *
   * @param content fetched page (bytes, URL, content type, metadata)
   * @return the parse result, post-processed by {@link HtmlParseFilters}
   * @throws ParseException on malformed base URL, non-HTML content type,
   *         or any I/O / DOM / SAX failure while parsing
   */
  public Parse getParse(Content content) throws ParseException {
    DOMFragmentParser parser = new DOMFragmentParser();

    try {
      // Keep HTML comments in the DOM so downstream parse filters can see
      // them; if this parser build doesn't support the feature, proceed
      // without it (best-effort, hence the deliberately empty catch).
      parser.setFeature("http://apache.org/xml/features/include-comments",
                        true);
    } catch (SAXException ignored) {}

    RobotsMetaIndicator robotsMeta = new RobotsMetaIndicator();

    URL base;
    try {
      base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
      throw new ParseException(e);
    }

    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    Properties metadata = new Properties();

    // Reject content the server explicitly labeled as something other than
    // HTML; an empty content type is given the benefit of the doubt.
    String contentType = content.getContentType();
    if (!"".equals(contentType) && !contentType.startsWith("text/html"))
      throw new ParseException("Content-Type not text/html: " + contentType);

    HTMLDocumentImpl impl = new HTMLDocumentImpl();
    impl.setErrorChecking(false);
    DocumentFragment root = impl.createDocumentFragment();
    try {
      byte[] contentInOctets = content.getContent();
      InputSource input =
        new InputSource(new ByteArrayInputStream(contentInOctets));

      // Encoding detection, stage 1: charset declared in the Content-Type
      // header.  "OriginalCharEncoding" records what was declared;
      // "CharEncodingForConversion" records what we actually use, after
      // alias resolution.
      String encoding = StringUtil.parseCharacterEncoding(contentType);
      if (encoding != null) {
        metadata.put("OriginalCharEncoding", encoding);
        if ((encoding = StringUtil.resolveEncodingAlias(encoding)) != null) {
          input.setEncoding(encoding);
          metadata.put("CharEncodingForConversion", encoding);
          LOG.fine(base + ": setting encoding to " + encoding);
        }
      }

      // Stage 2: sniff a <meta http-equiv=...> declaration from the bytes.
      if (encoding == null) {
        encoding = sniffCharacterEncoding(contentInOctets);
        if (encoding != null) {
          metadata.put("OriginalCharEncoding", encoding);
          if ((encoding = StringUtil.resolveEncodingAlias(encoding)) != null) {
            input.setEncoding(encoding);
            metadata.put("CharEncodingForConversion", encoding);
            LOG.fine(base + ": setting encoding to " + encoding);
          }
        }
      }

      // Stage 3: fall back to the configured default.
      if (encoding == null) {
        metadata.put("CharEncodingForConversion", defaultCharEncoding);
        input.setEncoding(defaultCharEncoding);
        LOG.fine(base + ": falling back to " + defaultCharEncoding);
      }

      parser.parse(input, root);
    } catch (IOException e) {
      throw new ParseException(e);
    } catch (DOMException e) {
      throw new ParseException(e);
    } catch (SAXException e) {
      throw new ParseException(e);
    }

    RobotsMetaProcessor.getRobotsMetaDirectives(robotsMeta, root, base);

    // Respect robots META "noindex": skip text/title extraction entirely.
    if (!robotsMeta.getNoIndex()) {
      StringBuffer sb = new StringBuffer();
      DOMContentUtils.getText(sb, root);
      text = sb.toString();
      sb.setLength(0);
      DOMContentUtils.getTitle(sb, root);
      title = sb.toString().trim();
    }

    // Respect robots META "nofollow": skip outlink extraction.  A <base>
    // tag in the page, if present, overrides the fetch URL for resolving
    // relative links.
    if (!robotsMeta.getNoFollow()) {
      ArrayList l = new ArrayList();
      URL baseTag = DOMContentUtils.getBase(root);
      DOMContentUtils.getOutlinks(baseTag != null ? baseTag : base, l, root);
      outlinks = (Outlink[]) l.toArray(new Outlink[l.size()]);
      LOG.fine("found " + outlinks.length + " outlinks in " + content.getUrl());
    }

    // robots META "nocache" currently has no effect here (empty branch kept
    // as in the original; the hook point is preserved for future use).
    if (!robotsMeta.getNoCache()) {
    }

    metadata.putAll(content.getMetadata());

    ParseData parseData = new ParseData(title, outlinks, metadata);
    Parse parse = new ParseImpl(text, parseData);

    // Give registered HtmlParseFilters a chance to post-process the result.
    return HtmlParseFilters.filter(content, parse, root);
  }
}