KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > opencms > util > TestCmsHtmlExtractor


1 /*
2  * File : $Source: /usr/local/cvs/opencms/test/org/opencms/util/TestCmsHtmlExtractor.java,v $
3  * Date : $Date: 2006/03/27 14:52:42 $
4  * Version: $Revision: 1.2 $
5  *
6  * This library is part of OpenCms -
7  * the Open Source Content Management System
8  *
9  * Copyright (c) 2005 Alkacon Software GmbH (http://www.alkacon.com)
10  *
11  * This library is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public
13  * License as published by the Free Software Foundation; either
14  * version 2.1 of the License, or (at your option) any later version.
15  *
16  * This library is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  * Lesser General Public License for more details.
20  *
21  * For further information about Alkacon Software GmbH, please see the
22  * company website: http://www.alkacon.com
23  *
24  * For further information about OpenCms, please see the
25  * project website: http://www.opencms.org
26  *
27  * You should have received a copy of the GNU Lesser General Public
28  * License along with this library; if not, write to the Free Software
29  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30  */

31
32 package org.opencms.util;
33
34 import org.opencms.i18n.CmsEncoder;
35
36 import junit.framework.TestCase;
37
38 import org.htmlparser.Node;
39 import org.htmlparser.NodeFilter;
40 import org.htmlparser.Parser;
41 import org.htmlparser.filters.NodeClassFilter;
42 import org.htmlparser.nodes.TextNode;
43
44 /**
45  * Test case for <code>{@link org.opencms.util.CmsHtmlExtractor}</code>.<p>
46  *
47  * @author Alexander Kandzior
48  *
49  * @version $Revision: 1.2 $
50  *
51  * @since 6.2.0
52  */

53 public class TestCmsHtmlExtractor extends TestCase {
54
55     private static final String JavaDoc HTML_PAGE_1 = "<html><title>This is the title</title><body><h1>A headline</h1>This is a test.<br>"
56         + "This is&nbsp;a <a HREF=\"http://www.opencms.org\">link</a> in a paragraph.<p>Some more text here. "
57         + "This is a very long line, because this is long line, because this is long line, because this is long line, because this is long line. "
58         + "This is a very long line, because this is long line, because this is long line, because this is long line, because this is long line. "
59         + "<p>This is a paragraph.</p>"
60         + "This is a very long line, because this is long line, because this is long line, because this is long line, because this is long line. "
61         + "<div><p>This is a p in a div<p>This is another p in a div<p></div>"
62         + "<h2>Another headline <b>with some tag content</b></h2>"
63         + "<p>This is a paragraph.</p>"
64         + "This is a very long line, because this is long line, because this is long line, because this is long line, because this is long line. "
65         + "<div><p>This is a p in a div<p>This is another p in a div<p></div>"
66         + "</body></html>";
67         
68     /**
69      * Default JUnit constructor.<p>
70      *
71      * @param arg0 JUnit parameters
72      */

73     public TestCmsHtmlExtractor(String JavaDoc arg0) {
74
75         super(arg0);
76     }
77
78     /**
79      * Extracts plain text from a String that contains HTML.<p>
80      *
81      * @param content the HTML content to extract the text from
82      *
83      * @return the extracted plain text
84      *
85      * @throws Exception in case something goes wrong
86      */

87     public static String JavaDoc extractFromHtml2(String JavaDoc content) throws Exception JavaDoc {
88         
89         Parser parser = new Parser();
90         parser.setInputHTML(content);
91
92         StringBean stringBean = new StringBean();
93         stringBean.setLinks(true);
94         stringBean.setCollapse(true);
95         
96         parser.visitAllNodesWith(stringBean);
97
98         return stringBean.getStrings();
99     }
100
101     /**
102      * Extracts plain text from a String that contains HTML.<p>
103      *
104      * @param content the HTML content to extract the text from
105      *
106      * @return the extracted plain text
107      *
108      * @throws Exception in case something goes wrong
109      */

110     private String JavaDoc extractFromHtml(String JavaDoc content) throws Exception JavaDoc {
111
112         Parser myParser;
113         Node[] nodes = null;
114         myParser = Parser.createParser(content, null);
115
116         NodeFilter filter = new NodeClassFilter(TextNode.class);
117         
118         nodes = myParser.extractAllNodesThatMatch(filter).toNodeArray();
119
120         StringBuffer JavaDoc result = new StringBuffer JavaDoc();
121
122         for (int i = 0; i < nodes.length; i++) {
123             TextNode textnode = (TextNode)nodes[i];
124             String JavaDoc line = textnode.toPlainTextString().trim();
125             result.append(line);
126         }
127
128         return result.toString();
129     }
130
131     /**
132      * Tests the HTML extractor.<p>
133      *
134      * @throws Exception in case the test fails
135      */

136     public void testHtmlExtractor() throws Exception JavaDoc {
137
138         String JavaDoc result;
139         
140         result = CmsHtmlExtractor.extractText(HTML_PAGE_1, CmsEncoder.ENCODING_ISO_8859_1);
141         System.out.println(result + "\n\n");
142         
143         result = extractFromHtml(HTML_PAGE_1);
144         System.out.println(result + "\n\n");
145         
146         result = extractFromHtml2(HTML_PAGE_1);
147         System.out.println(result + "\n\n");
148     }
149 }
150
Popular Tags