CmsHtmlExtractor


1   /*
2    * File   : $Source: /usr/local/cvs/opencms/src/org/opencms/util/CmsHtmlExtractor.java,v $
3    * Date   : $Date: 2006/03/27 14:52:41 $
4    * Version: $Revision: 1.10 $
5    *
6    * This library is part of OpenCms -
7    * the Open Source Content Management System
8    *
9    * Copyright (c) 2005 Alkacon Software GmbH (http://www.alkacon.com)
10   *
11   * This library is free software; you can redistribute it and/or
12   * modify it under the terms of the GNU Lesser General Public
13   * License as published by the Free Software Foundation; either
14   * version 2.1 of the License, or (at your option) any later version.
15   *
16   * This library is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19   * Lesser General Public License for more details.
20   *
21   * For further information about Alkacon Software GmbH, please see the
22   * company website: http://www.alkacon.com
23   *
24   * For further information about OpenCms, please see the
25   * project website: http://www.opencms.org
26   * 
27   * You should have received a copy of the GNU Lesser General Public
28   * License along with this library; if not, write to the Free Software
29   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
30   */
31  
32  package org.opencms.util;
33  
34  import org.opencms.staticexport.CmsLinkProcessor;
35  
36  import java.io.ByteArrayInputStream  ;
37  import java.io.InputStream  ;
38  import java.io.UnsupportedEncodingException  ;
39  
40  import org.htmlparser.Parser;
41  import org.htmlparser.beans.StringBean;
42  import org.htmlparser.lexer.Lexer;
43  import org.htmlparser.lexer.Page;
44  import org.htmlparser.util.ParserException;
45  
46  /**
47   * Extracts plain text from HTML.<p>
48   * 
49   * @author  Alexander Kandzior 
50   * 
51   * @version $Revision: 1.10 $ 
52   * 
53   * @since 6.0.0 
54   */
55  public final class CmsHtmlExtractor {
56  
57      /**
58       * Hides the public constructor.<p>
59       */
60      private CmsHtmlExtractor() {
61  
62          // hides the public constructor
63      }
64  
65      /**
66       * Extract the text from a HTML page.<p>
67       *
68       * @param in the html content input stream
69       * @param encoding the encoding of the content
70       *
71       * @return the extracted text from the page
72       * @throws ParserException if the parsing of the HTML failed
73       * @throws UnsupportedEncodingException if the given encoding is not supported
74       */
75      public static String   extractText(InputStream   in, String   encoding)
76      throws ParserException, UnsupportedEncodingException   {
77  
78          Parser parser = new Parser();
79          Lexer lexer = new Lexer();
80          Page page = new Page(in, encoding);
81          lexer.setPage(page);
82          parser.setLexer(lexer);
83  
84          StringBean stringBean = new StringBean();
85          parser.visitAllNodesWith(stringBean);
86  
87          return stringBean.getStrings();
88      }
89  
90      /**
91       * Extract the text from a HTML page.<p>
92       *
93       * @param content the html content
94       * @param encoding the encoding of the content
95       *
96       * @return the extracted text from the page
97       * @throws ParserException if the parsing of the HTML failed
98       * @throws UnsupportedEncodingException if the given encoding is not supported
99       */
100     public static String   extractText(String   content, String   encoding)
101     throws ParserException, UnsupportedEncodingException   {
102 
103         // we must make sure that the content passed to the parser always is 
104         // a "valid" HTML page, i.e. is surrounded by <html><body>...</body></html> 
105         // otherwise you will get strange results for some specific HTML constructs
106         StringBuffer   newContent = new StringBuffer  (content.length() + 32);
107 
108         newContent.append(CmsLinkProcessor.HTML_START);
109         newContent.append(content);
110         newContent.append(CmsLinkProcessor.HTML_END);
111 
112         // make sure the Lexer uses the right encoding
113         InputStream   in = new ByteArrayInputStream  (newContent.toString().getBytes(encoding));
114 
115         // use the stream based version to process the results
116         return extractText(in, encoding);
117     }
118 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags