1 31 32 package org.opencms.util; 33 34 import org.opencms.staticexport.CmsLinkProcessor; 35 36 import java.io.ByteArrayInputStream ; 37 import java.io.InputStream ; 38 import java.io.UnsupportedEncodingException ; 39 40 import org.htmlparser.Parser; 41 import org.htmlparser.beans.StringBean; 42 import org.htmlparser.lexer.Lexer; 43 import org.htmlparser.lexer.Page; 44 import org.htmlparser.util.ParserException; 45 46 55 public final class CmsHtmlExtractor { 56 57 60 private CmsHtmlExtractor() { 61 62 } 64 65 75 public static String extractText(InputStream in, String encoding) 76 throws ParserException, UnsupportedEncodingException { 77 78 Parser parser = new Parser(); 79 Lexer lexer = new Lexer(); 80 Page page = new Page(in, encoding); 81 lexer.setPage(page); 82 parser.setLexer(lexer); 83 84 StringBean stringBean = new StringBean(); 85 parser.visitAllNodesWith(stringBean); 86 87 return stringBean.getStrings(); 88 } 89 90 100 public static String extractText(String content, String encoding) 101 throws ParserException, UnsupportedEncodingException { 102 103 StringBuffer newContent = new StringBuffer (content.length() + 32); 107 108 newContent.append(CmsLinkProcessor.HTML_START); 109 newContent.append(content); 110 newContent.append(CmsLinkProcessor.HTML_END); 111 112 InputStream in = new ByteArrayInputStream (newContent.toString().getBytes(encoding)); 114 115 return extractText(in, encoding); 117 } 118 } | Popular Tags |