KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > parse > msword > MSWordParser


1 /* Copyright (c) 2004 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.parse.msword;
5
6 import net.nutch.protocol.Content;
7 import net.nutch.util.LogFormatter;
8 import net.nutch.parse.Parser;
9 import net.nutch.parse.Parse;
10 import net.nutch.parse.ParseData;
11 import net.nutch.parse.ParseImpl;
12 import net.nutch.parse.Outlink;
13 import net.nutch.parse.ParseException;
14
15 import java.util.Properties JavaDoc;
16 //import java.util.logging.Logger;
17

18 import java.io.ByteArrayInputStream JavaDoc;
19
20 /**
21  * parser for mime type application/msword.
22  * It is based on org.apache.poi.*. We have to see how well it performs.
23  *
24  * @author John Xing
25  *
26  * Note on 20040614 by Xing:
27  * Some codes are stacked here for convenience (see inline comments).
28  * They may be moved to more appropriate places when new codebase
29  * stabilizes, especially after code for indexing is written.
30  *
31  * @author Andy Hedges
32  * code to extract all msword properties.
33  *
34  */

35
36 public class MSWordParser implements Parser {
37 // public static final Logger LOG =
38
// LogFormatter.getLogger("net.nutch.parse.msword");
39

40   public MSWordParser () {}
41
42   public Parse getParse(Content content) throws ParseException {
43
44     // check that contentType is one we can handle
45
String JavaDoc contentType = content.getContentType();
46     if (contentType != null && !contentType.startsWith("application/msword"))
47       throw new ParseException(
48         "Content-Type not application/msword: "+contentType);
49
50     String JavaDoc text = null;
51     String JavaDoc title = null;
52     Properties JavaDoc properties = null;
53
54     try {
55
56       byte[] raw = content.getContent();
57
58       String JavaDoc contentLength = content.get("Content-Length");
59       if (contentLength != null
60             && raw.length != Integer.parseInt(contentLength)) {
61           throw new ParseException("Content truncated at "+raw.length
62             +" bytes. Parser can't handle incomplete msword file.");
63       }
64
65       WordExtractor extractor = new WordExtractor();
66
67       // collect text
68
text = extractor.extractText(new ByteArrayInputStream JavaDoc(raw));
69
70       // collect meta info
71
properties = extractor.extractProperties(new ByteArrayInputStream JavaDoc(raw));
72
73       extractor = null;
74
75     } catch (ParseException e) {
76       throw e;
77     } catch (FastSavedException e) {
78       throw new ParseException(e);
79     } catch (PasswordProtectedException e) {
80       throw new ParseException(e);
81     } catch (Exception JavaDoc e) { // run time exception
82
throw new ParseException("Can't be handled as msword document. "+e);
83     } finally {
84       // nothing so far
85
}
86
87     // collect meta data
88
Properties JavaDoc metadata = new Properties JavaDoc();
89     metadata.putAll(content.getMetadata()); // copy through
90

91     if(properties != null) {
92       title = properties.getProperty("Title");
93       properties.remove("Title");
94       metadata.putAll(properties);
95     }
96
97     if (text == null)
98       text = "";
99
100     if (title == null)
101       title = "";
102
103     // collect outlink
104
Outlink[] outlinks = new Outlink[0];
105
106     ParseData parseData = new ParseData(title, outlinks, metadata);
107     return new ParseImpl(text, parseData);
108     // any filter?
109
//return HtmlParseFilters.filter(content, parse, root);
110
}
111
112 }
113
Popular Tags