1 31 32 package org.opencms.util; 33 34 import org.opencms.i18n.CmsEncoder; 35 36 import junit.framework.TestCase; 37 38 import org.htmlparser.Node; 39 import org.htmlparser.NodeFilter; 40 import org.htmlparser.Parser; 41 import org.htmlparser.filters.NodeClassFilter; 42 import org.htmlparser.nodes.TextNode; 43 44 53 public class TestCmsHtmlExtractor extends TestCase { 54 55 private static final String HTML_PAGE_1 = "<html><title>This is the title</title><body><h1>A headline</h1>This is a test.<br>" 56 + "This is a <a HREF=\"http://www.opencms.org\">link</a> in a paragraph.<p>Some more text here. " 57 + "This is a very long line, because this is long line, because this is long line, because this is long line, because this is long line. " 58 + "This is a very long line, because this is long line, because this is long line, because this is long line, because this is long line. " 59 + "<p>This is a paragraph.</p>" 60 + "This is a very long line, because this is long line, because this is long line, because this is long line, because this is long line. " 61 + "<div><p>This is a p in a div<p>This is another p in a div<p></div>" 62 + "<h2>Another headline <b>with some tag content</b></h2>" 63 + "<p>This is a paragraph.</p>" 64 + "This is a very long line, because this is long line, because this is long line, because this is long line, because this is long line. " 65 + "<div><p>This is a p in a div<p>This is another p in a div<p></div>" 66 + "</body></html>"; 67 68 73 public TestCmsHtmlExtractor(String arg0) { 74 75 super(arg0); 76 } 77 78 87 public static String extractFromHtml2(String content) throws Exception { 88 89 Parser parser = new Parser(); 90 parser.setInputHTML(content); 91 92 StringBean stringBean = new StringBean(); 93 stringBean.setLinks(true); 94 stringBean.setCollapse(true); 95 96 parser.visitAllNodesWith(stringBean); 97 98 return stringBean.getStrings(); 99 } 100 101 110 private String extractFromHtml(String content) throws Exception { 111 112 Parser myParser; 113 Node[] nodes = null; 114 myParser = Parser.createParser(content, null); 115 116 NodeFilter filter = new NodeClassFilter(TextNode.class); 117 118 nodes = myParser.extractAllNodesThatMatch(filter).toNodeArray(); 119 120 StringBuffer result = new StringBuffer (); 121 122 for (int i = 0; i < nodes.length; i++) { 123 TextNode textnode = (TextNode)nodes[i]; 124 String line = textnode.toPlainTextString().trim(); 125 result.append(line); 126 } 127 128 return result.toString(); 129 } 130 131 136 public void testHtmlExtractor() throws Exception { 137 138 String result; 139 140 result = CmsHtmlExtractor.extractText(HTML_PAGE_1, CmsEncoder.ENCODING_ISO_8859_1); 141 System.out.println(result + "\n\n"); 142 143 result = extractFromHtml(HTML_PAGE_1); 144 System.out.println(result + "\n\n"); 145 146 result = extractFromHtml2(HTML_PAGE_1); 147 System.out.println(result + "\n\n"); 148 } 149 } 150 | Popular Tags |