KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > blandware > atleap > common > parsers > html > HTMLPlainTextExtractor


1 /*
2  * Copyright 2005 Blandware (http://www.blandware.com)
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */

16 package com.blandware.atleap.common.parsers.html;
17
18
19 import com.blandware.atleap.common.parsers.exception.PlainTextExtractorException;
20 import com.blandware.atleap.common.parsers.SpecificPlainTextExtractor;
21 import com.blandware.atleap.common.Constants;
22
23 import javax.swing.text.ChangedCharSetException JavaDoc;
24 import java.io.*;
25 import java.util.StringTokenizer JavaDoc;
26 import java.util.List JavaDoc;
27
28 /**
29  * <p>
30  * An extractor that extracts a plain text from HTML. That extractor can accept
31  * not only a full HTML-documents with &lt;html&gt;, &lt;body&gt; and other
32  * tags that are needed, but it can accept just a piece of HTML text.
33  * </p>
34  * <p>
35  * Additionally, this class can extract inline resources (&lt;IMG&gt; tag
36  * <b>src</b> attribute, &lt;LINK&gt; tag <b>src</b> and <b>href</b> attributes
37  * and <b>background</b> attribute for all tags) with
38  * <code>extractInlineRecources()</code> and references (<b>src</b>, <b>href</b>
39  * and <b>background</b> attributes) with <code>extractAllRefs()</code>.
40  * </p>
41  *
42  * @see SpecificPlainTextExtractor
43  * @author Roman Puchkovskiy <a HREF="mailto:roman.puchkovskiy@blandware.com">
44  * &lt;roman.puchkovskiy@blandware.com&gt;</a>
45  * @version $Revision: 1.6 $ $Date: 2006/03/26 11:47:32 $
46  */

47 public class HTMLPlainTextExtractor implements SpecificPlainTextExtractor {
48
49     protected String JavaDoc usedEncoding = null;
50
51     /**
52      * Constructs new HTMLPlainTextExtractor instance
53      */

54     public HTMLPlainTextExtractor() {
55     }
56
57     /**
58      * Extracts a plain text from an HTML.
59      *
60      * @param input the input stream that supplies an HTML for extraction
61      * @param output the writer that will accept the extracted text
62      * @param encoding If encoding is specified and extractor didn't find
63      * encoding info in HTML, given encoding will be used to extract text.
64      * If it's <code>null</code> and extractor didn't find extractor info in
65      * document, default encoding (<code>Constants.DEFAULT_ENCODING</code>) will
66      * be used. If extractor found such info, this parameter is ignored.
67      * @throws PlainTextExtractorException throwed on exception raised during
68      * extracting
69      */

70     public void extract(InputStream input, Writer output, String JavaDoc encoding)
71             throws PlainTextExtractorException {
72         HTMLParser parser = null;
73         Reader reader = null;
74
75         try {
76             if (encoding == null || encoding.trim().length() == 0) {
77                 encoding = Constants.DEFAULT_ENCODING;
78             }
79             usedEncoding = encoding;
80
81             RewindableInputStreamWrapper rewindableInputStreamWrapper
82                     = new RewindableInputStreamWrapper(input);
83             reader = new InputStreamReader(rewindableInputStreamWrapper, encoding);
84             parser = new PlainTextParser(reader, output, true,
85                                          rewindableInputStreamWrapper);
86             try {
87                 // If no exception is thrown here, then the document has been parsed
88
// using given (or default) encoding
89
parser.parse();
90             } catch (ChangedCharSetException JavaDoc e) {
91                 // There's some META in docuemnt that specifies the encoding
92
String JavaDoc inputEncodingSpec = e.getCharSetSpec();
93                 String JavaDoc inputEncoding = (e.keyEqualsCharSet())
94                                        ? inputEncodingSpec
95                                        : getEncodingFromSpec(inputEncodingSpec);
96                 usedEncoding = inputEncoding;
97                 reader = new BufferedReader(new InputStreamReader(new PrefilledInputStream(rewindableInputStreamWrapper),
98                                                                   inputEncoding));
99                 parser = new PlainTextParser(reader, output, false, null);
100                 parser.parse();
101             }
102         } catch (Exception JavaDoc e) {
103             throw new PlainTextExtractorException(e);
104         }
105     }
106
107     /**
108      * Gets encoding from content-type spec like 'text/html; charset=koi8-r'.
109      * @param spec the content-type specification
110      * @return the encoding
111      */

112     private String JavaDoc getEncodingFromSpec(String JavaDoc spec) {
113         String JavaDoc inputEncoding;
114         StringTokenizer JavaDoc tokenizer = new StringTokenizer JavaDoc(spec, " \t;=");
115         int tokenCount = tokenizer.countTokens();
116         String JavaDoc[] tokens = new String JavaDoc[tokenCount];
117         for (int i = 0; i < tokenCount; i++) {
118             tokens[i] = tokenizer.nextToken();
119         }
120         if (tokenCount >= 3 && "charset".equalsIgnoreCase(tokens[1])) {
121             inputEncoding = tokens[2];
122         } else {
123             inputEncoding = Constants.DEFAULT_ENCODING;
124         }
125         return inputEncoding;
126     }
127
128     /**
129      * Extracts inline resources. Those are &lt;IMG&gt; tag
130      * <b>src</b> attribute, &lt;LINK&gt; tag <b>src</b> and <b>href</b> attributes
131      * and <b>background</b> attribute for all tags.
132      *
133      * @param input the input stream that supplies an HTML for extraction
134      * @param encoding the encoding of the input stream
135      * @return list of inline resources
136      * @throws PlainTextExtractorException
137      */

138     public List JavaDoc extractInlineResources(InputStream input, String JavaDoc encoding)
139             throws PlainTextExtractorException {
140         try {
141             Reader reader = new InputStreamReader(input, encoding);
142             InlineResourcesParser parser = new InlineResourcesParser(reader);
143             parser.parse();
144             return parser.getExtractedResources();
145         } catch (IOException e) {
146             throw new PlainTextExtractorException(e);
147         } catch (ParseException e) {
148             throw new PlainTextExtractorException(e);
149         }
150     }
151
152     /**
153      * Extracts all references. Those are <b>src</b>, <b>href</b>
154      * and <b>background</b> attributes.
155      *
156      * @param input the input stream that supplies an HTML for extraction
157      * @param encoding the encoding of the input stream
158      * @return list of all references
159      * @throws PlainTextExtractorException
160      */

161     public List JavaDoc extractAllRefs(InputStream input, String JavaDoc encoding)
162             throws PlainTextExtractorException {
163         try {
164             Reader reader = new InputStreamReader(input, encoding);
165             RefsParser parser = new RefsParser(reader);
166             parser.parse();
167             return parser.getExtractedRefs();
168         } catch (IOException e) {
169             throw new PlainTextExtractorException(e);
170         } catch (ParseException e) {
171             throw new PlainTextExtractorException(e);
172         }
173     }
174
175     /**
176      * @see com.blandware.atleap.common.parsers.SpecificPlainTextExtractor#getUsedEncoding()
177      */

178     public String JavaDoc getUsedEncoding() {
179         return usedEncoding;
180     }
181 }
Popular Tags