MSWordParser


1   /* Copyright (c) 2004 The Nutch Organization.  All rights reserved.   */
2   /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3   
4   package net.nutch.parse.msword;
5   
6   import net.nutch.protocol.Content;
7   import net.nutch.util.LogFormatter;
8   import net.nutch.parse.Parser;
9   import net.nutch.parse.Parse;
10  import net.nutch.parse.ParseData;
11  import net.nutch.parse.ParseImpl;
12  import net.nutch.parse.Outlink;
13  import net.nutch.parse.ParseException;
14  
15  import java.util.Properties  ;
16  //import java.util.logging.Logger;
17  
18  import java.io.ByteArrayInputStream  ;
19  
20  /**
21   * parser for mime type application/msword.
22   * It is based on org.apache.poi.*. We have to see how well it performs.
23   *
24   * @author John Xing
25   *
26   * Note on 20040614 by Xing:
27   * Some codes are stacked here for convenience (see inline comments).
28   * They may be moved to more appropriate places when new codebase
29   * stabilizes, especially after code for indexing is written.
30   *
31   * @author Andy Hedges
32   * code to extract all msword properties.
33   *
34   */
35  
36  public class MSWordParser implements Parser {
37  //  public static final Logger LOG =
38  //    LogFormatter.getLogger("net.nutch.parse.msword");
39  
40    public MSWordParser () {}
41  
42    public Parse getParse(Content content) throws ParseException {
43  
44      // check that contentType is one we can handle
45      String   contentType = content.getContentType();
46      if (contentType != null && !contentType.startsWith("application/msword"))
47        throw new ParseException(
48          "Content-Type not application/msword: "+contentType);
49  
50      String   text = null;
51      String   title = null;
52      Properties   properties = null;
53  
54      try {
55  
56        byte[] raw = content.getContent();
57  
58        String   contentLength = content.get("Content-Length");
59        if (contentLength != null
60              && raw.length != Integer.parseInt(contentLength)) {
61            throw new ParseException("Content truncated at "+raw.length
62              +" bytes. Parser can't handle incomplete msword file.");
63        }
64  
65        WordExtractor extractor = new WordExtractor();
66  
67        // collect text
68        text = extractor.extractText(new ByteArrayInputStream  (raw));
69  
70        // collect meta info
71        properties = extractor.extractProperties(new ByteArrayInputStream  (raw));
72  
73        extractor = null;
74  
75      } catch (ParseException e) {
76        throw e;
77      } catch (FastSavedException e) {
78        throw new ParseException(e);
79      } catch (PasswordProtectedException e) {
80        throw new ParseException(e);
81      } catch (Exception   e) { // run time exception
82        throw new ParseException("Can't be handled as msword document. "+e);
83      } finally {
84        // nothing so far
85      }
86  
87      // collect meta data
88      Properties   metadata = new Properties  ();
89      metadata.putAll(content.getMetadata()); // copy through
90  
91      if(properties != null) {
92        title = properties.getProperty("Title");
93        properties.remove("Title");
94        metadata.putAll(properties);
95      }
96  
97      if (text == null)
98        text = "";
99  
100     if (title == null)
101       title = "";
102 
103     // collect outlink
104     Outlink[] outlinks = new Outlink[0];
105 
106     ParseData parseData = new ParseData(title, outlinks, metadata);
107     return new ParseImpl(text, parseData);
108     // any filter?
109     //return HtmlParseFilters.filter(content, parse, root);
110   }
111 
112 }
113
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags