HTMLPlainTextExtractor


1   /*
2    *  Copyright 2005 Blandware (http://www.blandware.com)
3    *
4    *  Licensed under the Apache License, Version 2.0 (the "License");
5    *  you may not use this file except in compliance with the License.
6    *  You may obtain a copy of the License at
7    *
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    *
10   *  Unless required by applicable law or agreed to in writing, software
11   *  distributed under the License is distributed on an "AS IS" BASIS,
12   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   *  See the License for the specific language governing permissions and
14   *  limitations under the License.
15   */
16  package com.blandware.atleap.common.parsers.html;
17  
18  
19  import com.blandware.atleap.common.parsers.exception.PlainTextExtractorException;
20  import com.blandware.atleap.common.parsers.SpecificPlainTextExtractor;
21  import com.blandware.atleap.common.Constants;
22  
23  import javax.swing.text.ChangedCharSetException  ;
24  import java.io.*;
25  import java.util.StringTokenizer  ;
26  import java.util.List  ;
27  
28  /**
29   * <p>
30   * An extractor that extracts plain text from HTML. It accepts not only
31   * complete HTML documents containing &lt;html&gt;, &lt;body&gt; and the other
32   * required tags, but also arbitrary fragments of HTML text.
33   * </p>
34   * <p>
35   * Additionally, this class can extract inline resources (&lt;IMG&gt; tag
36   * <b>src</b> attribute, &lt;LINK&gt; tag <b>src</b> and <b>href</b> attributes
37   * and <b>background</b> attribute for all tags) with
38   * <code>extractInlineResources()</code> and references (<b>src</b>, <b>href</b>
39   * and <b>background</b> attributes) with <code>extractAllRefs()</code>.
40   * </p>
41   *
42   * @see SpecificPlainTextExtractor
43   * @author Roman Puchkovskiy <a HREF="mailto:roman.puchkovskiy@blandware.com">
44   * &lt;roman.puchkovskiy@blandware.com&gt;</a>
45   * @version $Revision: 1.6 $ $Date: 2006/03/26 11:47:32 $
46   */
47  public class HTMLPlainTextExtractor implements SpecificPlainTextExtractor {
48  
49      protected String   usedEncoding = null;
50  
51      /**
52       * Constructs new HTMLPlainTextExtractor instance
53       */
54      public HTMLPlainTextExtractor() {
55      }
56  
57      /**
58       * Extracts a plain text from an HTML.
59       *
60       * @param input the input stream that supplies an HTML for extraction
61       * @param output the writer that will accept the extracted text
62       * @param encoding If encoding is specified and extractor didn't find
63       * encoding info in HTML, given encoding will be used to extract text.
64       * If it's <code>null</code> and extractor didn't find extractor info in
65       * document, default encoding (<code>Constants.DEFAULT_ENCODING</code>) will
66       * be used. If extractor found such info, this parameter is ignored.
67       * @throws PlainTextExtractorException throwed on exception raised during
68       * extracting
69       */
70      public void extract(InputStream input, Writer output, String   encoding)
71              throws PlainTextExtractorException {
72          HTMLParser parser = null;
73          Reader reader = null;
74  
75          try {
76              if (encoding == null || encoding.trim().length() == 0) {
77                  encoding = Constants.DEFAULT_ENCODING;
78              }
79              usedEncoding = encoding;
80  
81              RewindableInputStreamWrapper rewindableInputStreamWrapper
82                      = new RewindableInputStreamWrapper(input);
83              reader = new InputStreamReader(rewindableInputStreamWrapper, encoding);
84              parser = new PlainTextParser(reader, output, true,
85                                           rewindableInputStreamWrapper);
86              try {
87                  // If no exception is thrown here, then the document has been parsed
88                  // using given (or default) encoding
89                  parser.parse();
90              } catch (ChangedCharSetException   e) {
91                  // There's some META in docuemnt that specifies the encoding
92                  String   inputEncodingSpec = e.getCharSetSpec();
93                  String   inputEncoding = (e.keyEqualsCharSet())
94                                         ? inputEncodingSpec
95                                         : getEncodingFromSpec(inputEncodingSpec);
96                  usedEncoding = inputEncoding;
97                  reader = new BufferedReader(new InputStreamReader(new PrefilledInputStream(rewindableInputStreamWrapper),
98                                                                    inputEncoding));
99                  parser = new PlainTextParser(reader, output, false, null);
100                 parser.parse();
101             }
102         } catch (Exception   e) {
103             throw new PlainTextExtractorException(e);
104         }
105     }
106 
107     /**
108      * Gets encoding from content-type spec like 'text/html; charset=koi8-r'.
109      * @param spec the content-type specification
110      * @return the encoding
111      */
112     private String   getEncodingFromSpec(String   spec) {
113         String   inputEncoding;
114         StringTokenizer   tokenizer = new StringTokenizer  (spec, " \t;=");
115         int tokenCount = tokenizer.countTokens();
116         String  [] tokens = new String  [tokenCount];
117         for (int i = 0; i < tokenCount; i++) {
118             tokens[i] = tokenizer.nextToken();
119         }
120         if (tokenCount >= 3 && "charset".equalsIgnoreCase(tokens[1])) {
121             inputEncoding = tokens[2];
122         } else {
123             inputEncoding = Constants.DEFAULT_ENCODING;
124         }
125         return inputEncoding;
126     }
127 
128     /**
129      * Extracts inline resources. Those are &lt;IMG&gt; tag
130      * <b>src</b> attribute, &lt;LINK&gt; tag <b>src</b> and <b>href</b> attributes
131      * and <b>background</b> attribute for all tags.
132      *
133      * @param input the input stream that supplies an HTML for extraction
134      * @param encoding the encoding of the input stream
135      * @return list of inline resources
136      * @throws PlainTextExtractorException
137      */
138     public List   extractInlineResources(InputStream input, String   encoding)
139             throws PlainTextExtractorException {
140         try {
141             Reader reader = new InputStreamReader(input, encoding);
142             InlineResourcesParser parser = new InlineResourcesParser(reader);
143             parser.parse();
144             return parser.getExtractedResources();
145         } catch (IOException e) {
146             throw new PlainTextExtractorException(e);
147         } catch (ParseException e) {
148             throw new PlainTextExtractorException(e);
149         }
150     }
151 
152     /**
153      * Extracts all references. Those are <b>src</b>, <b>href</b>
154      * and <b>background</b> attributes.
155      *
156      * @param input the input stream that supplies an HTML for extraction
157      * @param encoding the encoding of the input stream
158      * @return list of all references
159      * @throws PlainTextExtractorException
160      */
161     public List   extractAllRefs(InputStream input, String   encoding)
162             throws PlainTextExtractorException {
163         try {
164             Reader reader = new InputStreamReader(input, encoding);
165             RefsParser parser = new RefsParser(reader);
166             parser.parse();
167             return parser.getExtractedRefs();
168         } catch (IOException e) {
169             throw new PlainTextExtractorException(e);
170         } catch (ParseException e) {
171             throw new PlainTextExtractorException(e);
172         }
173     }
174 
175     /**
176      * @see com.blandware.atleap.common.parsers.SpecificPlainTextExtractor#getUsedEncoding()
177      */
178     public String   getUsedEncoding() {
179         return usedEncoding;
180     }
181 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags