package org.alfresco.repo.content.transform;

import java.io.File;
import java.util.Map;

import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.util.TempFileProvider;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.htmlparser.beans.StringBean;

/**
 * Content transformer that extracts the text of an HTML document using the
 * HTML Parser library's {@link StringBean}.
 * <p>
 * Only the HTML to plain text conversion is supported; see
 * {@link #getReliability(String, String)}.
 */
public class HtmlParserContentTransformer extends AbstractContentTransformer
{
    // Currently unused within this class; kept so the declaration stays available.
    private static final Log logger = LogFactory.getLog(HtmlParserContentTransformer.class);

    /**
     * Reports how reliably this transformer converts between two mimetypes.
     *
     * @param sourceMimetype the mimetype of the content to be read
     * @param targetMimetype the mimetype of the content to be written
     * @return <code>1.0</code> when the source is {@link MimetypeMap#MIMETYPE_HTML} and the
     *         target is {@link MimetypeMap#MIMETYPE_TEXT_PLAIN}, otherwise <code>0.0</code>
     */
    public double getReliability(String sourceMimetype, String targetMimetype)
    {
        boolean htmlToPlainText =
                MimetypeMap.MIMETYPE_HTML.equals(sourceMimetype) &&
                MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype);
        if (!htmlToPlainText)
        {
            // any other source/target combination is not handled by this transformer
            return 0.0;
        }
        return 1.0;
    }

    /**
     * Copies the source content into a temporary <code>.html</code> file, extracts its
     * text with a {@link StringBean} and writes the resulting string to the writer.
     *
     * @param reader the source of the HTML content
     * @param writer the destination that receives the extracted text
     * @param options transformation options; not consulted by this implementation
     * @throws Exception if reading, parsing or writing the content fails
     */
    public void transformInternal(ContentReader reader, ContentWriter writer, Map<String, Object> options)
            throws Exception
    {
        // the extractor is pointed at a file location, so spool the content to a temp file first
        // NOTE(review): the temp file is not deleted here - presumably TempFileProvider
        // handles cleanup of the files it hands out; confirm before adding an explicit delete.
        File htmlFile = TempFileProvider.createTempFile("HtmlParserContentTransformer_", ".html");
        reader.getContent(htmlFile);

        // configure the extractor: whitespace collapsing, link output and
        // non-breaking-space replacement are all switched off
        StringBean extractor = new StringBean();
        extractor.setCollapse(false);
        extractor.setLinks(false);
        extractor.setReplaceNonBreakingSpaces(false);
        extractor.setURL(htmlFile.getAbsolutePath());

        // pull out the text and hand it to the writer
        String text = extractor.getStrings();
        writer.putContent(text);
    }
}