TestCmsHtmlExtractor


1   /*
2    * File   : $Source: /usr/local/cvs/opencms/test/org/opencms/util/TestCmsHtmlExtractor.java,v $
3    * Date   : $Date: 2006/03/27 14:52:42 $
4    * Version: $Revision: 1.2 $
5    *
6    * This library is part of OpenCms -
7    * the Open Source Content Management System
8    *
9    * Copyright (c) 2005 Alkacon Software GmbH (http://www.alkacon.com)
10   *
11   * This library is free software; you can redistribute it and/or
12   * modify it under the terms of the GNU Lesser General Public
13   * License as published by the Free Software Foundation; either
14   * version 2.1 of the License, or (at your option) any later version.
15   *
16   * This library is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19   * Lesser General Public License for more details.
20   *
21   * For further information about Alkacon Software GmbH, please see the
22   * company website: http://www.alkacon.com
23   *
24   * For further information about OpenCms, please see the
25   * project website: http://www.opencms.org
26   * 
27   * You should have received a copy of the GNU Lesser General Public
28   * License along with this library; if not, write to the Free Software
29   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
30   */
31  
32  package org.opencms.util;
33  
34  import org.opencms.i18n.CmsEncoder;
35  
36  import junit.framework.TestCase;
37  
38  import org.htmlparser.Node;
39  import org.htmlparser.NodeFilter;
40  import org.htmlparser.Parser;
41  import org.htmlparser.filters.NodeClassFilter;
42  import org.htmlparser.nodes.TextNode;
43  
44  /** 
45   * Test case for <code>{@link org.opencms.util.CmsHtmlExtractor}</code>.<p>
46   * 
47   * @author Alexander Kandzior 
48   * 
49   * @version $Revision: 1.2 $
50   * 
51   * @since 6.2.0
52   */
53  public class TestCmsHtmlExtractor extends TestCase {
54  
55      private static final String   HTML_PAGE_1 = "<html><title>This is the title</title><body><h1>A headline</h1>This is a test.<br>"
56          + "This  is&nbsp;a <a HREF=\"http://www.opencms.org\">link</a> in a    paragraph.<p>Some more text here. "
57          + "This is a very long line, because this is long line, because this is long line, because this is long line, because this is long line. "
58          + "This is a very long line, because this is long line, because this is long line, because this is long line, because this is long line. "
59          + "<p>This is a paragraph.</p>"
60          + "This is a very long line, because this is long line, because this is long line, because this is long line, because this is long line. "
61          + "<div><p>This is a p in a div<p>This is another p in a div<p></div>"
62          + "<h2>Another headline <b>with some tag content</b></h2>"
63          + "<p>This is a paragraph.</p>"
64          + "This is a very long line, because this is long line, because this is long line, because this is long line, because this is long line. "
65          + "<div><p>This is a p in a div<p>This is another p in a div<p></div>"
66          + "</body></html>";
67          
68      /**
69       * Default JUnit constructor.<p>
70       * 
71       * @param arg0 JUnit parameters
72       */
73      public TestCmsHtmlExtractor(String   arg0) {
74  
75          super(arg0);
76      }
77  
78      /**
79       * Extracts plain text from a String that contains HTML.<p>
80       * 
81       * @param content the HTML content to extract the text from
82       * 
83       * @return the extracted plain text
84       * 
85       * @throws Exception in case something goes wrong
86       */
87      public static String   extractFromHtml2(String   content) throws Exception   {
88          
89          Parser parser = new Parser();
90          parser.setInputHTML(content);
91  
92          StringBean stringBean = new StringBean();
93          stringBean.setLinks(true);
94          stringBean.setCollapse(true);
95          
96          parser.visitAllNodesWith(stringBean);
97  
98          return stringBean.getStrings();
99      }
100 
101     /**
102      * Extracts plain text from a String that contains HTML.<p>
103      * 
104      * @param content the HTML content to extract the text from
105      * 
106      * @return the extracted plain text
107      * 
108      * @throws Exception in case something goes wrong
109      */
110     private String   extractFromHtml(String   content) throws Exception   {
111 
112         Parser myParser;
113         Node[] nodes = null;
114         myParser = Parser.createParser(content, null);
115 
116         NodeFilter filter = new NodeClassFilter(TextNode.class);
117         
118         nodes = myParser.extractAllNodesThatMatch(filter).toNodeArray();
119 
120         StringBuffer   result = new StringBuffer  ();
121 
122         for (int i = 0; i < nodes.length; i++) {
123             TextNode textnode = (TextNode)nodes[i];
124             String   line = textnode.toPlainTextString().trim();
125             result.append(line);
126         }
127 
128         return result.toString();
129     }
130 
131     /**
132      * Tests the HTML extractor.<p>
133      * 
134      * @throws Exception in case the test fails
135      */
136     public void testHtmlExtractor() throws Exception   {
137 
138         String   result;
139         
140         result = CmsHtmlExtractor.extractText(HTML_PAGE_1, CmsEncoder.ENCODING_ISO_8859_1);        
141         System.out.println(result + "\n\n");
142         
143         result = extractFromHtml(HTML_PAGE_1);
144         System.out.println(result + "\n\n");
145         
146         result = extractFromHtml2(HTML_PAGE_1);
147         System.out.println(result + "\n\n");
148     }
149 }
150
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags