1 2 3 4 package net.nutch.parse.rtf; 5 6 import net.nutch.parse.*; 7 import net.nutch.parse.ParseException; 8 import net.nutch.protocol.Content; 9 10 import java.io.ByteArrayInputStream ; 11 import java.io.InputStreamReader ; 12 import java.io.Reader ; 13 import java.util.Properties ; 14 15 import com.etranslate.tm.processing.rtf.RTFParser; 16 17 21 public class RTFParseFactory implements Parser { 22 23 public Parse getParse(Content content) throws ParseException { 24 byte[] raw = content.getContent(); 25 Reader reader = new InputStreamReader (new ByteArrayInputStream (raw)); 26 RTFParserDelegateImpl delegate = new RTFParserDelegateImpl(); 27 RTFParser rtfParser = null; 28 rtfParser = RTFParser.createParser(reader); 29 rtfParser.setNewLine("\n"); 30 rtfParser.setDelegate(delegate); 31 32 try { 33 rtfParser.parse(); 34 } catch (com.etranslate.tm.processing.rtf.ParseException e) { 35 throw new ParseException("Exception parsing RTF document", e); 36 } 37 38 Properties metadata = new Properties (); 39 metadata.putAll(content.getMetadata()); 40 metadata.putAll(delegate.getMetaData()); 41 String title = metadata.getProperty("title"); 42 43 if(title != null){ 44 metadata.remove(title); 45 } else { 46 title = ""; 47 } 48 49 ParseData parseData = new ParseData(title, new Outlink[0], metadata); 50 51 return new ParseImpl(delegate.getText(), parseData); 52 } 53 54 55 } 56 | Popular Tags |