package org.alfresco.repo.content.transform;

import java.io.IOException;
import java.io.InputStream;
import java.util.Map;

import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.textmining.text.extraction.WordExtractor;

/**
 * Content transformer that converts content of mimetype
 * {@link MimetypeMap#MIMETYPE_WORD} into {@link MimetypeMap#MIMETYPE_TEXT_PLAIN}
 * by delegating text extraction to a {@link WordExtractor}.
 */
public class TextMiningContentTransformer extends AbstractContentTransformer
{
    /**
     * Fragment of the <code>IOException</code> message that is treated as
     * "nothing to extract" rather than as a failure.
     * NOTE(review): presumably raised for files that carry the Word mimetype
     * but are not real documents (e.g. editor temp files) - confirm.
     */
    private static final String UNREADABLE_HEADER_MESSAGE = "Unable to read entire header";

    /** Extractor used for every transformation performed by this instance. */
    private final WordExtractor wordExtractor;

    /**
     * Creates the transformer together with its own {@link WordExtractor}.
     */
    public TextMiningContentTransformer()
    {
        this.wordExtractor = new WordExtractor();
    }

    /**
     * Reports how reliably this transformer handles the given conversion.
     *
     * @param sourceMimetype the mimetype of the content to be read
     * @param targetMimetype the mimetype of the content to be written
     * @return <code>1.0</code> when the source is {@link MimetypeMap#MIMETYPE_WORD}
     *         and the target is {@link MimetypeMap#MIMETYPE_TEXT_PLAIN};
     *         <code>0.0</code> for every other combination
     */
    public double getReliability(String sourceMimetype, String targetMimetype)
    {
        boolean supported = MimetypeMap.MIMETYPE_WORD.equals(sourceMimetype)
                && MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype);
        return supported ? 1.0 : 0.0;
    }

    /**
     * Extracts the text from the reader's content stream and writes it to the
     * writer.
     * <p>
     * An <code>IOException</code> whose message contains
     * "Unable to read entire header" results in empty text being written.
     * Any other <code>IOException</code> is propagated to the caller instead of
     * being swallowed, so that a failed extraction is never followed by an
     * attempt to write <code>null</code> content.
     *
     * @param reader the source of the content to transform
     * @param writer the destination for the extracted text
     * @param options transformation options; not used by this implementation
     * @throws Exception if the content cannot be read or the text cannot be
     *         extracted
     */
    public void transformInternal(ContentReader reader, ContentWriter writer, Map<String, Object> options)
            throws Exception
    {
        InputStream is = null;
        String text = null;
        try
        {
            is = reader.getContentInputStream();
            text = wordExtractor.extractText(is);
        }
        catch (IOException e)
        {
            // getMessage() may legitimately be null, so guard before matching
            String message = e.getMessage();
            if (message != null && message.contains(UNREADABLE_HEADER_MESSAGE))
            {
                // treat the unreadable header as a document without any text
                text = "";
            }
            else
            {
                // unknown failure: surface it rather than writing null content
                throw e;
            }
        }
        finally
        {
            if (is != null)
            {
                is.close();
            }
        }
        writer.putContent(text);
    }
}