KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > parse > html > HtmlParser


1 /* Copyright (c) 2004 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.parse.html;
5
6 import java.util.Properties JavaDoc;
7 import java.util.ArrayList JavaDoc;
8 import java.util.logging.*;
9 import java.net.URL JavaDoc;
10 import java.net.MalformedURLException JavaDoc;
11 import java.io.ByteArrayInputStream JavaDoc;
12 import java.io.IOException JavaDoc;
13 import java.util.regex.*;
14
15 import org.cyberneko.html.parsers.*;
16 import org.xml.sax.InputSource JavaDoc;
17 import org.xml.sax.SAXException JavaDoc;
18 import org.w3c.dom.*;
19 import org.w3c.dom.html.*;
20 import org.apache.html.dom.*;
21
22 import net.nutch.fetcher.FetcherOutput;
23 import net.nutch.protocol.Content;
24 import net.nutch.util.*;
25 import net.nutch.parse.*;
26 import net.nutch.parse.html.RobotsMetaProcessor.*;
27
28
29 public class HtmlParser implements Parser {
30   public static final Logger LOG =
31     LogFormatter.getLogger("net.nutch.parse.html");
32
33   // I used 1000 bytes at first, but found that some documents have
34
// meta tag well past the first 1000 bytes.
35
// (e.g. http://cn.promo.yahoo.com/customcare/music.html)
36
private static final int CHUNK_SIZE = 2000;
37   private static Pattern metaPattern =
38     Pattern.compile("<meta\\s+([^>]*http-equiv=\"?content-type\"?[^>]*)>",
39                     Pattern.CASE_INSENSITIVE);
40   private static Pattern charsetPattern =
41     Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)",
42                     Pattern.CASE_INSENSITIVE);
43
44   /**
45    * Given a <code>byte[]</code> representing an html file of an
46    * <em>unknown</em> encoding, read out 'charset' parameter in the meta tag
47    * from the first <code>CHUNK_SIZE</code> bytes.
48    * If there's no meta tag for Content-Type or no charset is specified,
49    * <code>null</code> is returned. <br />
50    * FIXME: non-byte oriented character encodings (UTF-16, UTF-32)
51    * can't be handled with this.
52    * We need to do something similar to what's done by mozilla
53    * (http://lxr.mozilla.org/seamonkey/source/parser/htmlparser/src/nsParser.cpp#1993).
54    * See also http://www.w3.org/TR/REC-xml/#sec-guessing
55    * <br />
56    *
57    * @param content <code>byte[]</code> representation of an html file
58    */

59
60   private static String JavaDoc sniffCharacterEncoding(byte[] content) {
61     int length = content.length < CHUNK_SIZE ?
62                  content.length : CHUNK_SIZE;
63
64     // We don't care about non-ASCII parts so that it's sufficient
65
// to just inflate each byte to a 16-bit value by padding.
66
// For instance, the sequence {0x41, 0x82, 0xb7} will be turned into
67
// {U+0041, U+0082, U+00B7}.
68
String JavaDoc str = new String JavaDoc(content, 0, 0, length);
69
70     Matcher metaMatcher = metaPattern.matcher(str);
71     String JavaDoc encoding = null;
72     if (metaMatcher.find()) {
73       Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1));
74       if (charsetMatcher.find())
75         encoding = new String JavaDoc(charsetMatcher.group(1));
76     }
77
78     return encoding;
79   }
80
81
82   private static String JavaDoc defaultCharEncoding =
83     NutchConf.get("parser.character.encoding.default", "windows-1252");
84
85   public Parse getParse(Content content) throws ParseException {
86     DOMFragmentParser parser = new DOMFragmentParser();
87     
88     // some plugins, e.g., creativecommons, need to examine html comments
89
try {
90       parser.setFeature("http://apache.org/xml/features/include-comments",
91                         true);
92     } catch (SAXException JavaDoc e) {}
93
94     RobotsMetaIndicator robotsMeta = new RobotsMetaIndicator();
95
96     URL JavaDoc base;
97     try {
98       base = new URL JavaDoc(content.getBaseUrl());
99     } catch (MalformedURLException JavaDoc e) {
100       throw new ParseException(e);
101     }
102
103     String JavaDoc text = "";
104     String JavaDoc title = "";
105     Outlink[] outlinks = new Outlink[0];
106     Properties JavaDoc metadata = new Properties JavaDoc();
107
108     // check that contentType is one we can handle
109
String JavaDoc contentType = content.getContentType();
110     if (!"".equals(contentType) && !contentType.startsWith("text/html"))
111       throw new ParseException("Content-Type not text/html: " + contentType);
112     
113     // parse the content
114
HTMLDocumentImpl impl = new HTMLDocumentImpl();
115     impl.setErrorChecking(false);
116     DocumentFragment root = impl.createDocumentFragment();
117     try {
118       byte[] contentInOctets = content.getContent();
119       InputSource JavaDoc input =
120         new InputSource JavaDoc(new ByteArrayInputStream JavaDoc(contentInOctets));
121       String JavaDoc encoding = StringUtil.parseCharacterEncoding(contentType);
122       if (encoding!=null) {
123         metadata.put("OriginalCharEncoding", encoding);
124         if ((encoding = StringUtil.resolveEncodingAlias(encoding)) != null) {
125       input.setEncoding(encoding);
126           metadata.put("CharEncodingForConversion", encoding);
127           LOG.fine(base + ": setting encoding to " + encoding);
128         }
129       }
130
131       // sniff out 'charset' value from the beginning of a document
132
if (encoding == null) {
133         encoding = sniffCharacterEncoding(contentInOctets);
134         if (encoding!=null) {
135           metadata.put("OriginalCharEncoding", encoding);
136           if ((encoding = StringUtil.resolveEncodingAlias(encoding)) != null) {
137         input.setEncoding(encoding);
138             metadata.put("CharEncodingForConversion", encoding);
139             LOG.fine(base + ": setting encoding to " + encoding);
140           }
141         }
142       }
143
144       if (encoding == null) {
145         // fallback encoding.
146
// FIXME : In addition to the global fallback value,
147
// we should make it possible to specify fallback encodings for each ccTLD.
148
// (e.g. se: windows-1252, kr: x-windows-949, cn: gb18030, tw: big5
149
// doesn't work for jp because euc-jp and shift_jis have about the
150
// same share)
151

152         metadata.put("CharEncodingForConversion", defaultCharEncoding);
153         input.setEncoding(defaultCharEncoding);
154         LOG.fine(base + ": falling back to " + defaultCharEncoding);
155       }
156
157       parser.parse(input, root);
158     } catch (IOException JavaDoc e) {
159       throw new ParseException(e);
160     } catch (DOMException e) {
161       throw new ParseException(e);
162     } catch (SAXException JavaDoc e) {
163       throw new ParseException(e);
164     }
165       
166     // get meta directives
167
RobotsMetaProcessor.getRobotsMetaDirectives(robotsMeta, root, base);
168       
169     // check meta directives
170
if (!robotsMeta.getNoIndex()) { // okay to index
171
StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
172       DOMContentUtils.getText(sb, root); // extract text
173
text = sb.toString();
174       sb.setLength(0);
175       DOMContentUtils.getTitle(sb, root); // extract title
176
title = sb.toString().trim();
177     }
178       
179     if (!robotsMeta.getNoFollow()) { // okay to follow links
180
ArrayList JavaDoc l = new ArrayList JavaDoc(); // extract outlinks
181
URL JavaDoc baseTag = DOMContentUtils.getBase(root);
182       DOMContentUtils.getOutlinks(baseTag!=null?baseTag:base, l, root);
183       outlinks = (Outlink[])l.toArray(new Outlink[l.size()]);
184       LOG.fine("found "+outlinks.length+" outlinks in "+content.getUrl());
185     }
186     
187     if (!robotsMeta.getNoCache()) { // okay to cache
188
// ??? FIXME ???
189
}
190     
191     // copy content metadata through
192
metadata.putAll(content.getMetadata());
193
194     ParseData parseData = new ParseData(title, outlinks, metadata);
195     Parse parse = new ParseImpl(text, parseData);
196
197     // run filters on parse
198
return HtmlParseFilters.filter(content, parse, root);
199   }
200 }
201
Popular Tags