package org.eclipse.help.internal.search;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.Locale;
import java.util.StringTokenizer;

import org.apache.lucene.demo.html.HTMLParser;

/**
 * Opens an HTML document from a URL and exposes its title, summary and
 * content reader by delegating to the Lucene demo {@link HTMLParser}.
 * <p>
 * Before parsing, the document is scanned for a
 * <code>&lt;META http-equiv="content-type" content="...; charset=..."&gt;</code>
 * declaration so that the stream can be decoded with the declared charset.
 */
public class HTMLDocParser {
    /**
     * Limit handed to {@link ASCIIReader} while looking for the charset
     * declaration. NOTE(review): presumably the maximum number of bytes of
     * the document that are inspected -- confirm against ASCIIReader.
     */
    public static final int MAX_OFFSET = 2048;

    // Element and attribute names looked for (compared case-insensitively).
    final static String ELEMENT_META = "META";
    final static String ELEMENT_BODY = "body";
    final static String ELEMENT_HEAD = "head";
    final static String ATTRIBUTE_HTTP = "http-equiv";
    final static String ATTRIBUTE_HTTP_VALUE = "content-type";
    final static String ATTRIBUTE_CONTENT = "content";

    // States of the element-level scanner.
    final static int STATE_ELEMENT_START = 0;
    final static int STATE_ELEMENT_AFTER_LT = 1;
    final static int STATE_ELEMENT_AFTER_LT_SLASH = 2;
    final static int STATE_ELEMENT_META = 3;

    // States of the http-equiv attribute scanner (inside a META element).
    final static int STATE_HTTP_START = 0;
    final static int STATE_HTTP_AFTER_NAME = 1;
    final static int STATE_HTTP_AFTER_EQ = 2;
    final static int STATE_HTTP_DONE = 3;

    // States of the content attribute scanner (inside a META element).
    final static int STATE_CONTENT_START = 0;
    final static int STATE_CONTENT_AFTER_NAME = 1;
    final static int STATE_CONTENT_AFTER_EQ = 2;
    final static int STATE_CONTENT_DONE = 3;

    private HTMLParser htmlParser;
    private InputStream inputStream = null;

    /**
     * Opens the document at the given URL and prepares the HTML parser.
     * <p>
     * The URL is opened twice: once to detect the declared charset and once
     * for the actual parse. If no charset is declared, or the declared one is
     * not supported, the platform default encoding is used.
     *
     * @param url location of the HTML document
     * @throws IOException if the URL cannot be opened
     */
    public void openDocument(URL url) throws IOException {
        String encoding;
        InputStream probeStream = url.openStream();
        try {
            encoding = getCharsetFromHTML(probeStream);
        } finally {
            // Always release the probe stream, even if detection fails.
            try {
                probeStream.close();
            } catch (IOException ignored) {
                // best effort: the stream was only used for sniffing
            }
        }

        inputStream = url.openStream();
        InputStreamReader reader = null;
        if (encoding != null) {
            try {
                reader = new InputStreamReader(inputStream, encoding);
            } catch (UnsupportedEncodingException uee) {
                // Declared charset is unknown to this JVM; use the default.
                reader = null;
            }
        }
        if (reader == null) {
            reader = new InputStreamReader(inputStream);
        }
        htmlParser = new HTMLParser(reader);
    }

    /**
     * Closes the stream opened by {@link #openDocument(URL)}, if any.
     * Errors while closing are ignored.
     */
    public void closeDocument() {
        if (inputStream != null) {
            try {
                inputStream.close();
            } catch (IOException ignored) {
                // best effort: nothing useful can be done on close failure
            }
        }
    }

    /**
     * Returns the document title as reported by the HTML parser.
     *
     * @return the title, or an empty string if the calling thread was
     *         interrupted while waiting for the parser
     * @throws IOException if the parser fails to read the document
     * @throws NullPointerException if no document has been opened
     */
    public String getTitle() throws IOException {
        if (htmlParser == null) {
            throw new NullPointerException();
        }
        try {
            return htmlParser.getTitle();
        } catch (InterruptedException ie) {
            // Preserve the interrupt for callers further up the stack.
            Thread.currentThread().interrupt();
            return "";
        }
    }

    /**
     * Returns the document summary as reported by the HTML parser.
     *
     * @param title the document title; not used by this implementation
     * @return the summary, or an empty string if the calling thread was
     *         interrupted while waiting for the parser
     * @throws IOException if the parser fails to read the document
     * @throws NullPointerException if no document has been opened
     */
    public String getSummary(String title) throws IOException {
        if (htmlParser == null) {
            throw new NullPointerException();
        }
        try {
            return htmlParser.getSummary();
        } catch (InterruptedException ie) {
            // Preserve the interrupt for callers further up the stack.
            Thread.currentThread().interrupt();
            return "";
        }
    }

    /**
     * Returns a reader over the document content produced by the HTML parser.
     *
     * @return the content reader
     * @throws IOException if the parser fails to read the document
     * @throws NullPointerException if no document has been opened
     */
    public Reader getContentReader() throws IOException {
        if (htmlParser == null) {
            throw new NullPointerException();
        }
        return htmlParser.getReader();
    }

    /**
     * Scans the beginning of an HTML stream for a charset declared in a
     * META http-equiv="content-type" element.
     * <p>
     * The reader wrapped around the stream is closed before returning.
     *
     * @param is stream positioned at the start of the HTML document
     * @return the declared charset name, or <code>null</code> if none found
     */
    public static String getCharsetFromHTML(InputStream is) {
        Reader asciiReader = new ASCIIReader(is, MAX_OFFSET);
        try {
            StreamTokenizer tokenizer = new StreamTokenizer(asciiReader);

            // Keep the original case; names are compared case-insensitively.
            tokenizer.lowerCaseMode(false);

            // By default StreamTokenizer treats ' as a string quote and / as
            // a comment start; both must be plain characters for HTML.
            tokenizer.ordinaryChar('\'');
            tokenizer.ordinaryChar('/');

            return getCharsetFromHTMLTokens(tokenizer);
        } finally {
            try {
                asciiReader.close();
            } catch (IOException ignored) {
                // best effort: the reader was only used for sniffing
            }
        }
    }

    /**
     * Walks the token stream looking for a META element that carries both
     * <code>http-equiv="content-type"</code> and a quoted
     * <code>content</code> attribute, in either order, and extracts the
     * charset from the content value.
     * <p>
     * Scanning stops, returning <code>null</code>, at the opening BODY tag,
     * at the closing HEAD tag, at end of input, or on an I/O error.
     *
     * @param tokenizer tokenizer over the HTML text
     * @return the charset from the first complete declaration found (which
     *         may be <code>null</code> if its content value names no
     *         charset), or <code>null</code> if no declaration is found
     */
    public static String getCharsetFromHTMLTokens(StreamTokenizer tokenizer) {
        String contentValue = null;

        int stateContent = STATE_CONTENT_START;
        int stateElement = STATE_ELEMENT_START;
        int stateHttp = STATE_HTTP_START;

        try {
            for (int token = tokenizer.nextToken();
                    token != StreamTokenizer.TT_EOF;
                    token = tokenizer.nextToken()) {

                switch (stateElement) {
                    case STATE_ELEMENT_START :
                        if (token == '<') {
                            stateElement = STATE_ELEMENT_AFTER_LT;
                        }
                        break;

                    case STATE_ELEMENT_AFTER_LT :
                        if (token == StreamTokenizer.TT_WORD) {
                            if (ELEMENT_META.equalsIgnoreCase(tokenizer.sval)) {
                                // Entering a META element: start afresh.
                                stateElement = STATE_ELEMENT_META;
                                stateHttp = STATE_HTTP_START;
                                stateContent = STATE_CONTENT_START;
                                contentValue = null;
                            } else if (ELEMENT_BODY
                                    .equalsIgnoreCase(tokenizer.sval)) {
                                // Body reached; no declaration in the head.
                                return null;
                            } else {
                                stateElement = STATE_ELEMENT_START;
                            }
                        } else if (token == '/') {
                            stateElement = STATE_ELEMENT_AFTER_LT_SLASH;
                        } else {
                            stateElement = STATE_ELEMENT_START;
                        }
                        break;

                    case STATE_ELEMENT_AFTER_LT_SLASH :
                        if (token == StreamTokenizer.TT_WORD
                                && ELEMENT_HEAD
                                        .equalsIgnoreCase(tokenizer.sval)) {
                            // End of head; no declaration was found.
                            return null;
                        }
                        stateElement = STATE_ELEMENT_START;
                        break;

                    default : // STATE_ELEMENT_META: scanning attributes
                        switch (token) {
                            case '>' :
                                stateElement = STATE_ELEMENT_START;
                                break;

                            case StreamTokenizer.TT_WORD :
                                if (ATTRIBUTE_HTTP
                                        .equalsIgnoreCase(tokenizer.sval)) {
                                    stateHttp = STATE_HTTP_AFTER_NAME;
                                } else if (ATTRIBUTE_CONTENT
                                        .equalsIgnoreCase(tokenizer.sval)) {
                                    stateContent = STATE_CONTENT_AFTER_NAME;
                                } else if (stateHttp == STATE_HTTP_AFTER_EQ
                                        && ATTRIBUTE_HTTP_VALUE
                                                .equalsIgnoreCase(tokenizer.sval)) {
                                    // Unquoted http-equiv=content-type
                                    stateHttp = STATE_HTTP_DONE;
                                } else {
                                    if (stateHttp != STATE_HTTP_DONE) {
                                        stateHttp = STATE_HTTP_START;
                                    }
                                    if (stateContent != STATE_CONTENT_DONE) {
                                        stateContent = STATE_CONTENT_START;
                                    }
                                }
                                break;

                            case '=' :
                                if (stateHttp == STATE_HTTP_AFTER_NAME) {
                                    stateHttp = STATE_HTTP_AFTER_EQ;
                                } else if (stateContent == STATE_CONTENT_AFTER_NAME) {
                                    stateContent = STATE_CONTENT_AFTER_EQ;
                                } else {
                                    if (stateHttp != STATE_HTTP_DONE) {
                                        stateHttp = STATE_HTTP_START;
                                    }
                                    if (stateContent != STATE_CONTENT_DONE) {
                                        stateContent = STATE_CONTENT_START;
                                    }
                                }
                                break;

                            case '\"' :
                                if (stateHttp == STATE_HTTP_AFTER_EQ) {
                                    // Quoted value of http-equiv. The value
                                    // is consumed either way, so do not stay
                                    // in AFTER_EQ when it does not match;
                                    // otherwise the next quoted string would
                                    // be misread as the http-equiv value.
                                    if (ATTRIBUTE_HTTP_VALUE
                                            .equalsIgnoreCase(tokenizer.sval)) {
                                        stateHttp = STATE_HTTP_DONE;
                                    } else {
                                        stateHttp = STATE_HTTP_START;
                                    }
                                } else if (stateContent == STATE_CONTENT_AFTER_EQ) {
                                    stateContent = STATE_CONTENT_DONE;
                                    contentValue = tokenizer.sval;
                                } else {
                                    // Quoted value of some other attribute.
                                    // Keep attributes already recognised so
                                    // an unrelated attribute between
                                    // http-equiv and content does not
                                    // discard the match.
                                    if (stateHttp != STATE_HTTP_DONE) {
                                        stateHttp = STATE_HTTP_START;
                                    }
                                    if (stateContent != STATE_CONTENT_DONE) {
                                        stateContent = STATE_CONTENT_START;
                                    }
                                }
                                break;

                            default :
                                if (stateHttp != STATE_HTTP_DONE) {
                                    stateHttp = STATE_HTTP_START;
                                }
                                if (stateContent != STATE_CONTENT_DONE) {
                                    stateContent = STATE_CONTENT_START;
                                }
                                break;
                        }
                        break;
                }

                if (contentValue != null && stateHttp == STATE_HTTP_DONE
                        && stateContent == STATE_CONTENT_DONE) {
                    // Both attributes of the declaration have been seen.
                    return getCharsetFromHTTP(contentValue);
                }
            }
        } catch (IOException ioe) {
            return null;
        }
        return null;
    }

    /**
     * Extracts the charset from the value of a Content-Type declaration,
     * e.g. <code>"text/html; charset=utf-8"</code> yields
     * <code>"utf-8"</code>.
     *
     * @param contentValue semicolon-separated content-type value; may be
     *        <code>null</code>
     * @return the value of the first non-empty <code>charset=</code>
     *         parameter, or <code>null</code> if there is none
     */
    public static String getCharsetFromHTTP(String contentValue) {
        if (contentValue == null) {
            return null;
        }
        final String prefix = "charset=";
        StringTokenizer parameters = new StringTokenizer(contentValue, ";");
        while (parameters.hasMoreTokens()) {
            String parameter = parameters.nextToken().trim();
            // Fixed locale so the match does not depend on the default one.
            if (parameter.toLowerCase(Locale.ENGLISH).startsWith(prefix)) {
                String charset = parameter.substring(prefix.length()).trim();
                if (charset.length() > 0) {
                    return charset;
                }
            }
        }
        return null;
    }
}