package net.nutch.parse.html;

import java.util.Properties;
import java.util.ArrayList;
import java.util.logging.*;
import java.net.URL;
import java.net.MalformedURLException;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.regex.*;

import org.cyberneko.html.parsers.*;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.w3c.dom.*;
import org.w3c.dom.html.*;
import org.apache.html.dom.*;

import net.nutch.fetcher.FetcherOutput;
import net.nutch.protocol.Content;
import net.nutch.util.*;
import net.nutch.parse.*;
import net.nutch.parse.html.RobotsMetaProcessor.*;

/**
 * HTML {@link Parser} implementation built on the NekoHTML DOM fragment
 * parser.  Responsibilities visible here: detect the character encoding
 * (Content-Type header, then in-page meta tag, then configured default),
 * parse the page into a DOM fragment, honor robots META directives
 * (noindex/nofollow), and extract text, title and outlinks.
 */
public class HtmlParser implements Parser {
  public static final Logger LOG =
    LogFormatter.getLogger("net.nutch.parse.html");

  /** Only the first CHUNK_SIZE bytes are scanned when sniffing the charset. */
  private static final int CHUNK_SIZE = 2000;

  // Matches a <meta ... http-equiv="content-type" ...> tag; group(1) is the
  // tag's attribute text.
  private static Pattern metaPattern =
    Pattern.compile("<meta\\s+([^>]*http-equiv=\"?content-type\"?[^>]*)>",
                    Pattern.CASE_INSENSITIVE);
  // Extracts the charset name from the matched meta tag's attributes.
  private static Pattern charsetPattern =
    Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)",
                    Pattern.CASE_INSENSITIVE);

  /**
   * Scans the first {@link #CHUNK_SIZE} bytes of {@code content} for an
   * HTML {@code <meta http-equiv="Content-Type" ...>} declaration and
   * returns the charset name it advertises.
   *
   * @param content raw page bytes
   * @return the declared charset name, or {@code null} if none was found
   */
  private static String sniffCharacterEncoding(byte[] content) {
    int length = Math.min(content.length, CHUNK_SIZE);

    // Decode the prefix as ISO-8859-1: every byte maps to exactly one char,
    // which is enough to locate the ASCII meta tag regardless of the page's
    // real encoding.  (Replaces the deprecated String(byte[],int,int,int)
    // "hibyte" constructor, which with hibyte==0 did the same thing.)
    String str = new String(content, 0, length, StandardCharsets.ISO_8859_1);

    Matcher metaMatcher = metaPattern.matcher(str);
    String encoding = null;
    if (metaMatcher.find()) {
      Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1));
      if (charsetMatcher.find())
        encoding = charsetMatcher.group(1);   // group(1) is already a String
    }

    return encoding;
  }

  /** Fallback encoding used when neither the header nor a meta tag declares one. */
  private static String defaultCharEncoding =
    NutchConf.get("parser.character.encoding.default", "windows-1252");

  /**
   * Parses an HTML {@link Content} object into a {@link Parse}.
   *
   * @param content fetched page (bytes, URL, content type, metadata)
   * @return the parse result, post-processed by {@link HtmlParseFilters}
   * @throws ParseException on malformed base URL, non-HTML content type,
   *         or any I/O / DOM / SAX failure while parsing
   */
  public Parse getParse(Content content) throws ParseException {
    DOMFragmentParser parser = new DOMFragmentParser();

    try {
      // Keep HTML comments in the DOM so downstream parse filters can see
      // them; if this parser build doesn't support the feature, proceed
      // without it (best-effort, hence the deliberately empty catch).
      parser.setFeature("http://apache.org/xml/features/include-comments",
                        true);
    } catch (SAXException ignored) {}

    RobotsMetaIndicator robotsMeta = new RobotsMetaIndicator();

    URL base;
    try {
      base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
      throw new ParseException(e);
    }

    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    Properties metadata = new Properties();

    // Reject content the server explicitly labeled as something other than
    // HTML; an empty content type is given the benefit of the doubt.
    String contentType = content.getContentType();
    if (!"".equals(contentType) && !contentType.startsWith("text/html"))
      throw new ParseException("Content-Type not text/html: " + contentType);

    HTMLDocumentImpl impl = new HTMLDocumentImpl();
    impl.setErrorChecking(false);
    DocumentFragment root = impl.createDocumentFragment();
    try {
      byte[] contentInOctets = content.getContent();
      InputSource input =
        new InputSource(new ByteArrayInputStream(contentInOctets));

      // Encoding detection, stage 1: charset declared in the Content-Type
      // header.  "OriginalCharEncoding" records what was declared;
      // "CharEncodingForConversion" records what we actually use, after
      // alias resolution.
      String encoding = StringUtil.parseCharacterEncoding(contentType);
      if (encoding != null) {
        metadata.put("OriginalCharEncoding", encoding);
        if ((encoding = StringUtil.resolveEncodingAlias(encoding)) != null) {
          input.setEncoding(encoding);
          metadata.put("CharEncodingForConversion", encoding);
          LOG.fine(base + ": setting encoding to " + encoding);
        }
      }

      // Stage 2: sniff a <meta http-equiv=...> declaration from the bytes.
      if (encoding == null) {
        encoding = sniffCharacterEncoding(contentInOctets);
        if (encoding != null) {
          metadata.put("OriginalCharEncoding", encoding);
          if ((encoding = StringUtil.resolveEncodingAlias(encoding)) != null) {
            input.setEncoding(encoding);
            metadata.put("CharEncodingForConversion", encoding);
            LOG.fine(base + ": setting encoding to " + encoding);
          }
        }
      }

      // Stage 3: fall back to the configured default.
      if (encoding == null) {
        metadata.put("CharEncodingForConversion", defaultCharEncoding);
        input.setEncoding(defaultCharEncoding);
        LOG.fine(base + ": falling back to " + defaultCharEncoding);
      }

      parser.parse(input, root);
    } catch (IOException e) {
      throw new ParseException(e);
    } catch (DOMException e) {
      throw new ParseException(e);
    } catch (SAXException e) {
      throw new ParseException(e);
    }

    RobotsMetaProcessor.getRobotsMetaDirectives(robotsMeta, root, base);

    // Respect robots META "noindex": skip text/title extraction entirely.
    if (!robotsMeta.getNoIndex()) {
      StringBuffer sb = new StringBuffer();
      DOMContentUtils.getText(sb, root);
      text = sb.toString();
      sb.setLength(0);
      DOMContentUtils.getTitle(sb, root);
      title = sb.toString().trim();
    }

    // Respect robots META "nofollow": skip outlink extraction.  A <base>
    // tag in the page, if present, overrides the fetch URL for resolving
    // relative links.
    if (!robotsMeta.getNoFollow()) {
      ArrayList l = new ArrayList();
      URL baseTag = DOMContentUtils.getBase(root);
      DOMContentUtils.getOutlinks(baseTag != null ? baseTag : base, l, root);
      outlinks = (Outlink[]) l.toArray(new Outlink[l.size()]);
      LOG.fine("found " + outlinks.length + " outlinks in " + content.getUrl());
    }

    // robots META "nocache" currently has no effect here (empty branch kept
    // as in the original; the hook point is preserved for future use).
    if (!robotsMeta.getNoCache()) {
    }

    metadata.putAll(content.getMetadata());

    ParseData parseData = new ParseData(title, outlinks, metadata);
    Parse parse = new ParseImpl(text, parseData);

    // Give registered HtmlParseFilters a chance to post-process the result.
    return HtmlParseFilters.filter(content, parse, root);
  }
}