KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > eclipse > help > internal > search > HTMLDocParser


1 /*******************************************************************************
2  * Copyright (c) 2000, 2007 IBM Corporation and others.
3  * All rights reserved. This program and the accompanying materials
4  * are made available under the terms of the Eclipse Public License v1.0
5  * which accompanies this distribution, and is available at
6  * http://www.eclipse.org/legal/epl-v10.html
7  *
8  * Contributors:
9  * IBM Corporation - initial API and implementation
10  *******************************************************************************/

11 package org.eclipse.help.internal.search;
12
13 import java.io.IOException JavaDoc;
14 import java.io.InputStream JavaDoc;
15 import java.io.InputStreamReader JavaDoc;
16 import java.io.Reader JavaDoc;
17 import java.io.StreamTokenizer JavaDoc;
18 import java.io.UnsupportedEncodingException JavaDoc;
19 import java.net.URL JavaDoc;
20 import java.util.Locale JavaDoc;
21 import java.util.StringTokenizer JavaDoc;
22
23 import org.apache.lucene.demo.html.HTMLParser;
24
25 /**
26  * Parser HTML documents. Extracts document encoding from header, and delegates
27  * to lucene HTML parser for extraction of title, summary, and content.
28  */

29 public class HTMLDocParser {
30     // maximum number of characters that will be searched
31
// from the beginning of HTML document to charset declaration
32
public static final int MAX_OFFSET = 2048;
33
34     // elements, atributes and values contstants
35
final static String JavaDoc ELEMENT_META = "META"; //$NON-NLS-1$
36
final static String JavaDoc ELEMENT_BODY = "body"; //$NON-NLS-1$
37
final static String JavaDoc ELEMENT_HEAD = "head"; //$NON-NLS-1$
38
final static String JavaDoc ATTRIBUTE_HTTP = "http-equiv"; //$NON-NLS-1$
39
final static String JavaDoc ATTRIBUTE_HTTP_VALUE = "content-type"; //$NON-NLS-1$
40
final static String JavaDoc ATTRIBUTE_CONTENT = "content"; //$NON-NLS-1$
41

42     // states for parsing elements
43
final static int STATE_ELEMENT_START = 0;
44     final static int STATE_ELEMENT_AFTER_LT = 1;
45     final static int STATE_ELEMENT_AFTER_LT_SLASH = 2;
46     final static int STATE_ELEMENT_META = 3;
47     // states for parsing HTTP-EQUIV attribute
48
final static int STATE_HTTP_START = 0;
49     final static int STATE_HTTP_AFTER_NAME = 1;
50     final static int STATE_HTTP_AFTER_EQ = 2;
51     final static int STATE_HTTP_DONE = 3;
52     // states for parsing CONTENT attribute
53
final static int STATE_CONTENT_START = 0;
54     final static int STATE_CONTENT_AFTER_NAME = 1;
55     final static int STATE_CONTENT_AFTER_EQ = 2;
56     final static int STATE_CONTENT_DONE = 3;
57
58     private HTMLParser htmlParser;
59     private InputStream JavaDoc inputStream = null;
60     /**
61      * @param url
62      * @throws IOException
63      */

64     public void openDocument(URL JavaDoc url) throws IOException JavaDoc {
65         inputStream = url.openStream();
66
67         String JavaDoc encoding = getCharsetFromHTML(inputStream);
68         try {
69             inputStream.close();
70         } catch (IOException JavaDoc closeIOE) {
71         }
72         inputStream = url.openStream();
73         if (encoding != null) {
74             try {
75                 htmlParser = new HTMLParser(new InputStreamReader JavaDoc(inputStream,
76                         encoding));
77
78             }
79             catch (UnsupportedEncodingException JavaDoc uee) {
80                 htmlParser = new HTMLParser(new InputStreamReader JavaDoc(inputStream));
81             }
82         }
83         else {
84             htmlParser = new HTMLParser(new InputStreamReader JavaDoc(inputStream));
85         }
86     }
87     /**
88      * Releases resources (closes streams)
89      */

90     public void closeDocument() {
91         if (inputStream != null) {
92             try {
93                 inputStream.close();
94             } catch (IOException JavaDoc closeIOE) {
95             }
96         }
97     }
98     public String JavaDoc getTitle() throws IOException JavaDoc {
99         if (htmlParser == null) {
100             throw new NullPointerException JavaDoc();
101         }
102         try {
103             return htmlParser.getTitle();
104         } catch (InterruptedException JavaDoc ie) {
105             return ""; //$NON-NLS-1$
106
}
107     }
108     public String JavaDoc getSummary(String JavaDoc title) throws IOException JavaDoc {
109         try {
110             return htmlParser.getSummary();
111         } catch (InterruptedException JavaDoc ie) {
112             return ""; //$NON-NLS-1$
113
}
114     }
115     
116     public Reader JavaDoc getContentReader() throws IOException JavaDoc {
117         if (htmlParser == null) {
118             throw new NullPointerException JavaDoc();
119         }
120         return htmlParser.getReader();
121     }
122     /**
123      * Private. Parses HTML to extract document encoding specified in HTTP
124      * equivalent META tag in the document header. Example of such META tag is
125      * <META HTTP-EQUIV="content-type" CONTENT="text/html; charset=UTF-8">
126      *
127      * @return String or null if encoding not found
128      */

129     public static String JavaDoc getCharsetFromHTML(InputStream JavaDoc is) {
130         // Set up an ascii reader for the document (documents should not use
131
// other characters before encoding is defined)
132
Reader JavaDoc asciiReader = new ASCIIReader(is, MAX_OFFSET);
133         StreamTokenizer JavaDoc tokenizer = new StreamTokenizer JavaDoc(asciiReader);
134
135         // tokenizer.eolIsSignificant(false);// default false
136
// tokenizer.slashSlashComments(false); // default false
137
// tokenizer.slashStarComments(false);// default false
138
tokenizer.lowerCaseMode(false);
139
140         // tokenizer.quoteChar('\"'); // default quote char
141
tokenizer.ordinaryChar('\''); // default quote char
142
tokenizer.ordinaryChar('/'); // default comment character
143

144         String JavaDoc charset = getCharsetFromHTMLTokens(tokenizer);
145         if (asciiReader != null) {
146             try {
147                 asciiReader.close();
148             } catch (IOException JavaDoc ioe) {
149             }
150         }
151         return charset;
152     }
153     public static String JavaDoc getCharsetFromHTMLTokens(StreamTokenizer JavaDoc tokenizer) {
154         // keeps track of content attribute attribute until parsing
155
// of the meta tag is complete
156
String JavaDoc contentValue = null;
157
158         // initialize states
159
int stateContent = STATE_HTTP_START;
160         int stateElement = STATE_ELEMENT_START;
161         int stateHttp = STATE_HTTP_START;
162
163         try {
164             // in the worst case, process tokens until end of file
165
for (int token = tokenizer.nextToken(); token != StreamTokenizer.TT_EOF; token = tokenizer
166                     .nextToken()) {
167                 // debug tokens
168
// if (token == StreamTokenizer.TT_WORD) {
169
// System.out.println("word =" + tokenizer.sval);
170
// } else if (token == StreamTokenizer.TT_NUMBER) {
171
// System.out.println("number =" + tokenizer.nval);
172
// } else if (token == StreamTokenizer.TT_EOL) {
173
// System.out.println("endofline=");
174
// } else if ((char) token == '\"') {
175
// System.out.println("\" =" + tokenizer.sval);
176
//
177
// } else {
178
// System.out.println("else =" + (char) token);
179
// }
180

181                 // process input based depending on current state
182
switch (stateElement) {
183                     case STATE_ELEMENT_START :
184                         if (token == '<') {
185                             stateElement = STATE_ELEMENT_AFTER_LT;
186                         } // else do nothing, cannot be beginning of META tag
187
break;
188                     case STATE_ELEMENT_AFTER_LT :
189                         if (token == StreamTokenizer.TT_WORD) {
190                             // some element opened
191
if (ELEMENT_META.equalsIgnoreCase(tokenizer.sval)) {
192                                 // META element opened
193
stateElement = STATE_ELEMENT_META;
194                                 // initialize state of attributes
195
stateHttp = STATE_HTTP_START;
196                                 stateContent = STATE_CONTENT_START;
197                                 contentValue = null;
198                             } else if (ELEMENT_BODY
199                                     .equalsIgnoreCase(tokenizer.sval)) {
200                                 // body element opened, we are too far, stop
201
// processing input
202
return null;
203                             } else {
204                                 // some other element opened, start from initial
205
// state
206
stateElement = STATE_ELEMENT_START;
207                             }
208                         } else if (token == '/') {
209                             // can be begging of head closing
210
stateElement = STATE_ELEMENT_AFTER_LT_SLASH;
211                         } else {
212                             // not an element opened, could be openning of
213
// declaration
214
// or element closing e.t.c.
215
stateElement = STATE_ELEMENT_START;
216                         }
217                         break;
218                     case STATE_ELEMENT_AFTER_LT_SLASH :
219                         if (token == StreamTokenizer.TT_WORD
220                                 && ELEMENT_HEAD
221                                         .equalsIgnoreCase(tokenizer.sval)) {
222                             // head element closed, we are too far, stop
223
// processing input
224
return null;
225                         }
226                         stateElement = STATE_ELEMENT_START;
227                         break;
228                     default : // STATE_META_IN :
229
switch (token) {
230                             case '>' :
231                                 // no longer inside META, start from initial
232
// state
233
stateElement = STATE_ELEMENT_START;
234                                 break;
235                             case StreamTokenizer.TT_WORD :
236                                 // string inside META tag, can be attribute name
237
if (ATTRIBUTE_HTTP
238                                         .equalsIgnoreCase(tokenizer.sval)) {
239                                     // found HTTP-EQUIV attribute name
240
stateHttp = STATE_HTTP_AFTER_NAME;
241                                 } else if (ATTRIBUTE_CONTENT
242                                         .equalsIgnoreCase(tokenizer.sval)) {
243                                     // found CONTENT attribute name
244
stateContent = STATE_CONTENT_AFTER_NAME;
245                                 } else if (stateHttp == STATE_HTTP_AFTER_EQ
246                                         && ATTRIBUTE_HTTP_VALUE
247                                                 .equalsIgnoreCase(tokenizer.sval)) {
248                                     // value of HTTP-EQUIV attribute (unquoted)
249
// we found <META ...
250
// HTTP-EQUIV=content-type
251
stateHttp = STATE_HTTP_DONE;
252                                 } else {
253                                     // some other attribute name or string,
254
// reset states of seeked attributes,
255
// unless successfully processed earlier
256
if (stateHttp != STATE_HTTP_DONE) {
257                                         stateHttp = STATE_HTTP_START;
258                                     }
259                                     if (stateContent != STATE_CONTENT_DONE) {
260                                         stateContent = STATE_CONTENT_START;
261                                     }
262                                 }
263                                 break;
264                             case '=' :
265                                 // = inside META tag, can separate interesing us
266
// attribute names from values
267
if (stateHttp == STATE_HTTP_AFTER_NAME) {
268                                     // we have HTTP-EQUIV=
269
stateHttp = STATE_HTTP_AFTER_EQ;
270                                 } else if (stateContent == STATE_CONTENT_AFTER_NAME) {
271                                     // we have CONTENT=
272
stateContent = STATE_CONTENT_AFTER_EQ;
273                                 } else {
274                                     // equal sign after some other attribute
275
// name or string,
276
// reset states of seeked attributes,
277
// unless successfully processed earlier
278
if (stateHttp != STATE_HTTP_DONE) {
279                                         stateHttp = STATE_HTTP_START;
280                                     }
281                                     if (stateContent != STATE_CONTENT_DONE) {
282                                         stateContent = STATE_CONTENT_START;
283                                     }
284                                 }
285                                 break;
286                             case '\"' :
287                                 // quoted string inside META tag, can be
288
// attribute value
289
if (stateHttp == STATE_HTTP_AFTER_EQ) {
290                                     // value of HTTP-EQUIV attribute
291
if (ATTRIBUTE_HTTP_VALUE
292                                             .equalsIgnoreCase(tokenizer.sval)) {
293                                         // we found <META ...
294
// HTTP-EQUIV="content-type"
295
stateHttp = STATE_HTTP_DONE;
296                                     }
297                                 } else if (stateContent == STATE_CONTENT_AFTER_EQ) {
298                                     // value of CONTENT attribute
299
stateContent = STATE_CONTENT_DONE;
300                                     // save the value of the attribute
301
// if attribue HTTP-EQUIV="content-type" is
302
// found
303
// in the same META tag, this value might
304
// have
305
// Content-type entity header
306
contentValue = tokenizer.sval;
307                                 } else {
308                                     // value for the attribute is missing
309
// reset states of seeked attributes
310
stateHttp = STATE_HTTP_START;
311                                     stateContent = STATE_CONTENT_START;
312                                 }
313                                 break;
314                             default :
315                                 // other unexpected token inside META tag
316
// reset states of seeked attributes,
317
// unless successfully processed earlier
318
if (stateHttp != STATE_HTTP_DONE) {
319                                     stateHttp = STATE_HTTP_START;
320                                 }
321                                 if (stateContent != STATE_CONTENT_DONE) {
322                                     stateContent = STATE_CONTENT_START;
323                                 }
324                                 break;
325                         }
326                         break;
327                 }
328                 if (contentValue != null && stateHttp == STATE_HTTP_DONE
329                         && stateContent == STATE_CONTENT_DONE) {
330                     // <META HTTP-EQUIV="content-type" CONTENT="*******"
331
// parse vale of content attribute to extract encoding
332
return getCharsetFromHTTP(contentValue);
333                 }
334
335             }
336         } catch (IOException JavaDoc ioe) {
337             return null;
338         }
339         // end of file
340
return null;
341     }
342     /**
343      * Parses HTTP1.1 Content-Type entity-header field for example,
344      * Content-Type: text/html; charset=ISO-8859-4, and extracts charset
345      * parameter value of the media sub type.
346      *
347      * @return value of charset parameter, for example ISO-8859-4 or null if
348      * parameter does not exist
349      */

350     public static String JavaDoc getCharsetFromHTTP(String JavaDoc contentValue) {
351         StringTokenizer JavaDoc t = new StringTokenizer JavaDoc(contentValue, ";"); //$NON-NLS-1$
352
while (t.hasMoreTokens()) {
353             String JavaDoc parameter = t.nextToken().trim();
354             if (parameter.toLowerCase(Locale.ENGLISH).startsWith("charset=")) { //$NON-NLS-1$
355
String JavaDoc charset = parameter
356                         .substring("charset=".length()).trim(); //$NON-NLS-1$
357
if (charset.length() > 0) {
358                     return charset;
359                 }
360             }
361         }
362         return null;
363     }
364 }
365
Popular Tags