package net.nutch.parse.msword;

import net.nutch.protocol.Content;
import net.nutch.util.LogFormatter;
import net.nutch.parse.Parser;
import net.nutch.parse.Parse;
import net.nutch.parse.ParseData;
import net.nutch.parse.ParseImpl;
import net.nutch.parse.Outlink;
import net.nutch.parse.ParseException;

import java.util.Properties;

import java.io.ByteArrayInputStream;

/**
 * {@link Parser} implementation for content served as
 * <code>application/msword</code>.
 *
 * <p>The raw bytes of the content are handed to a {@link WordExtractor}
 * (resolved from this package) to obtain the document text and its
 * properties. The "Title" property becomes the parse title; every other
 * extracted property is merged into the content metadata.</p>
 */
public class MSWordParser implements Parser {

  /** Creates a parser; no configuration is needed. */
  public MSWordParser() {}

  /**
   * Extracts text, title and metadata from MS Word content.
   *
   * @param content the fetched content; a <code>null</code> content type is
   *        accepted, any non-null type must start with
   *        <code>application/msword</code>
   * @return a parse holding the extracted text (empty string if none), the
   *         title (empty string if none), no outlinks, and the content
   *         metadata overlaid with the extracted document properties
   * @throws ParseException if the content type is wrong, the content is
   *         shorter or longer than its <code>Content-Length</code> header
   *         states, that header is not a valid integer, or the extractor
   *         fails for any reason
   */
  public Parse getParse(Content content) throws ParseException {

    String contentType = content.getContentType();
    if (contentType != null && !contentType.startsWith("application/msword")) {
      throw new ParseException(
        "Content-Type not application/msword: " + contentType);
    }

    String text = null;
    String title = null;
    Properties properties = null;

    try {

      byte[] raw = content.getContent();

      ensureNotTruncated(content, raw);

      WordExtractor extractor = new WordExtractor();

      // Each extraction pass gets its own stream over the same bytes.
      text = extractor.extractText(new ByteArrayInputStream(raw));
      properties = extractor.extractProperties(new ByteArrayInputStream(raw));

    } catch (ParseException e) {
      // Already carries a specific message; pass it through untouched.
      throw e;
    } catch (FastSavedException e) {
      throw new ParseException(e);
    } catch (PasswordProtectedException e) {
      throw new ParseException(e);
    } catch (Exception e) {
      // Catch-all boundary for runtime failures inside the extractor.
      // The message is unchanged, but the original exception is now kept
      // as the cause so its stack trace is not lost.
      throw chained("Can't be handled as msword document. " + e, e);
    }

    // Start from the content metadata, then overlay document properties.
    Properties metadata = new Properties();
    metadata.putAll(content.getMetadata());

    if (properties != null) {
      // "Title" is reported through ParseData's title, not the metadata.
      title = properties.getProperty("Title");
      properties.remove("Title");
      metadata.putAll(properties);
    }

    if (text == null) {
      text = "";
    }

    if (title == null) {
      title = "";
    }

    // This parser does not collect outlinks.
    Outlink[] outlinks = new Outlink[0];

    ParseData parseData = new ParseData(title, outlinks, metadata);
    return new ParseImpl(text, parseData);
  }

  /**
   * Verifies that the number of bytes received matches the
   * <code>Content-Length</code> header, when that header is present.
   *
   * @param content source of the <code>Content-Length</code> value
   * @param raw the bytes actually received
   * @throws ParseException if the header is not a valid integer or does not
   *         equal <code>raw.length</code>
   */
  private static void ensureNotTruncated(Content content, byte[] raw)
    throws ParseException {

    String contentLength = content.get("Content-Length");
    if (contentLength == null) {
      return;
    }

    int expected;
    try {
      // Header values may carry surrounding whitespace; parseInt rejects it.
      expected = Integer.parseInt(contentLength.trim());
    } catch (NumberFormatException e) {
      // Report the real problem instead of letting the catch-all in
      // getParse blame the document itself.
      throw chained("Invalid Content-Length header: " + contentLength, e);
    }

    if (raw.length != expected) {
      throw new ParseException("Content truncated at " + raw.length
        + " bytes. Parser can't handle incomplete msword file.");
    }
  }

  /**
   * Builds a ParseException with the given message and attaches the cause.
   *
   * <p>Uses {@link Throwable#initCause(Throwable)} because only the
   * <code>(String)</code> and <code>(Throwable)</code> constructors of
   * ParseException are known to exist from this file.
   * NOTE(review): assumes <code>ParseException(String)</code> leaves the
   * cause uninitialized (plain <code>super(message)</code>); confirm, or
   * switch to a <code>(String, Throwable)</code> constructor if one exists.</p>
   *
   * @param message the exception message
   * @param cause the underlying failure
   * @return the new exception, ready to be thrown
   */
  private static ParseException chained(String message, Throwable cause) {
    ParseException wrapped = new ParseException(message);
    wrapped.initCause(cause);
    return wrapped;
  }

}