KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > contineo > core > text > parser > DOCParser


1 /*
2  * DOCParser.java
3  *
4  * Created on 4. November 2003, 20:10
5  */

6
7 package org.contineo.core.text.parser;
8
9 import java.io.File JavaDoc;
10 import java.io.FileInputStream JavaDoc;
11 import org.apache.log4j.Level;
12 import org.apache.log4j.Logger;
13 import org.contineo.core.LoggingManager;
14 import org.apache.poi.hwpf.HWPFDocument;
15
16 /**
17  * Parses a MS Word (*.doc, *.dot) file to extract the text contained in the
18  * file. This class uses the external library HWPF provided by the Apache
19  * Jakarta POI project. Even though this library provides features to extract
20  * the document author and version, we do not use those features, because the
21  * library is known to be buggy. The important part is to get the text content,
22  * not extracting the author, date, etc. is not essential.
23  * @author Michael Scholz
24  * @author Sebastian Stein
25  */

26 public class DOCParser implements Parser {
27
28     /**
29      * Holds the pure text content of the document.
30      * @uml.property name="content"
31      */

32     private StringBuffer JavaDoc content = new StringBuffer JavaDoc();
33
34     /**
35      * @uml.property name="logger"
36      * @uml.associationEnd
37      */

38     private Logger logger;
39
40     /** Creates a new instance of DOCParser
41      * @param file The MS Word (*.doc, *.dot) file to be parsed.
42      * */

43     public DOCParser(File JavaDoc file) {
44         logger = LoggingManager.getLogger(this.getClass());
45         init(file);
46     }
47
48     /**
49      * This function actually parses the doc file using the HWPF library.
50      * The text content is stored in the class member variable content.
51      * @param file The MS Word (*.doc, *.dot) file to be parsed.
52      */

53     protected void init(File JavaDoc file) {
54         try {
55             // for reading the MS Word file we use the deprecated HWPF library
56
// provided by Jakarta POI
57
FileInputStream JavaDoc in = new FileInputStream JavaDoc(file);
58             HWPFDocument doc = new HWPFDocument(in);
59
60             // this call returns the complete document text without any formatting
61
String JavaDoc docText = doc.getRange().text();
62             content.append(docText);
63             in.close();
64         }
65         catch (Exception JavaDoc ex) {
66             if (logger.isEnabledFor(Level.ERROR))
67                 logger.error(ex.getMessage());
68         }
69     }
70
71     /**
72      *
73      * @uml.property name="content"
74      */

75     public StringBuffer JavaDoc getContent() {
76         return content;
77     }
78
79     
80     public String JavaDoc getVersion() {
81         return "";
82     }
83
84     /* (non-Javadoc)
85      * @see org.contineo.core.text.parser.Parser#getAuthor()
86      */

87     public String JavaDoc getAuthor() {
88         return "";
89     }
90
91     /* (non-Javadoc)
92      * @see org.contineo.core.text.parser.Parser#getSourceDate()
93      */

94     public String JavaDoc getSourceDate() {
95         return "";
96     }
97
98     /* (non-Javadoc)
99      * @see org.contineo.core.text.parser.Parser#getKeywords()
100      */

101     public String JavaDoc getKeywords() {
102         return "";
103     }
104
105     /* (non-Javadoc)
106      * @see org.contineo.core.text.parser.Parser#getTitle()
107      */

108     public String JavaDoc getTitle() {
109         return "";
110     }
111 }
112
Popular Tags