|                                                                                                              1
 2
 3
 4   package net.nutch.parse.msword;
 5
 6   import net.nutch.protocol.Content;
 7   import net.nutch.util.LogFormatter;
 8   import net.nutch.parse.Parser;
 9   import net.nutch.parse.Parse;
 10  import net.nutch.parse.ParseData;
 11  import net.nutch.parse.ParseImpl;
 12  import net.nutch.parse.Outlink;
 13  import net.nutch.parse.ParseException;
 14
 15  import java.util.Properties
  ; 16
 18  import java.io.ByteArrayInputStream
  ; 19
 20
 35
 36  public class MSWordParser implements Parser {
 37
 40    public MSWordParser () {}
 41
 42    public Parse getParse(Content content) throws ParseException {
 43
 44          String
  contentType = content.getContentType(); 46      if (contentType != null && !contentType.startsWith("application/msword"))
 47        throw new ParseException(
 48          "Content-Type not application/msword: "+contentType);
 49
 50      String
  text = null; 51      String
  title = null; 52      Properties
  properties = null; 53
 54      try {
 55
 56        byte[] raw = content.getContent();
 57
 58        String
  contentLength = content.get("Content-Length"); 59        if (contentLength != null
 60              && raw.length != Integer.parseInt(contentLength)) {
 61            throw new ParseException("Content truncated at "+raw.length
 62              +" bytes. Parser can't handle incomplete msword file.");
 63        }
 64
 65        WordExtractor extractor = new WordExtractor();
 66
 67              text = extractor.extractText(new ByteArrayInputStream
  (raw)); 69
 70              properties = extractor.extractProperties(new ByteArrayInputStream
  (raw)); 72
 73        extractor = null;
 74
 75      } catch (ParseException e) {
 76        throw e;
 77      } catch (FastSavedException e) {
 78        throw new ParseException(e);
 79      } catch (PasswordProtectedException e) {
 80        throw new ParseException(e);
 81      } catch (Exception
  e) {       throw new ParseException("Can't be handled as msword document. "+e); 83      } finally {
 84            }
 86
 87          Properties
  metadata = new Properties  (); 89      metadata.putAll(content.getMetadata());
 91      if(properties != null) {
 92        title = properties.getProperty("Title");
 93        properties.remove("Title");
 94        metadata.putAll(properties);
 95      }
 96
 97      if (text == null)
 98        text = "";
 99
 100     if (title == null)
 101       title = "";
 102
 103         Outlink[] outlinks = new Outlink[0];
 105
 106     ParseData parseData = new ParseData(title, outlinks, metadata);
 107     return new ParseImpl(text, parseData);
 108           }
 111
 112 }
 113
                                                                                                                                                                                                             |                                                                       
 
 
 
 
 
                                                                                   Popular Tags                                                                                                                                                                                              |