KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > enhydra > snapper > parsers > WordParser


1 package org.enhydra.snapper.parsers;
2
3 import java.io.File JavaDoc;
4 import java.io.FileInputStream JavaDoc;
5 import java.io.InputStream JavaDoc;
6 import java.util.Collection JavaDoc;
7 import java.util.Iterator JavaDoc;
8 import java.util.Map JavaDoc;
9 import java.util.Set JavaDoc;
10
11
12 import org.enhydra.snapper.api.*;
13 import org.textmining.text.extraction.WordExtractor;
14
15 import org.apache.poi.poifs.eventfilesystem.POIFSReader;
16 import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
17 import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
18 //import org.apache.poi.hdf.extractor.WordDocument;
19
import org.apache.poi.hpsf.DocumentSummaryInformation;
20 import org.apache.poi.hpsf.PropertySetFactory;
21 import org.apache.poi.hpsf.SummaryInformation;
22
23
24 public class WordParser implements org.enhydra.snapper.api.Parser {
25       private String JavaDoc fileName;
26       private String JavaDoc parsedText;
27       public String JavaDoc title;
28       private String JavaDoc properties = "";
29       public Map JavaDoc customproperties;
30       
31       public String JavaDoc parse(InputStream JavaDoc is) throws java.io.IOException JavaDoc{ return "";}
32       
33       public void parse () {
34           FileInputStream JavaDoc fis1 = null, fis = null;
35         try {
36             
37             FileInputStream JavaDoc in = new FileInputStream JavaDoc(new File JavaDoc(fileName));
38             parsedText = new WordExtractor().extractText(in);
39             /*WordDocument wd = new WordDocument(in);
40             //text
41             StringWriter docTextWriter = new StringWriter();
42             wd.writeAllText(new PrintWriter(docTextWriter));
43             docTextWriter.close();
44             parsedText = docTextWriter.toString();
45             */

46             
47             in.close();
48             // summary info
49
POIFSReader r = new POIFSReader();
50             MyPOIFSReaderListener mypoi = new MyPOIFSReaderListener();
51             r.registerListener(mypoi,
52                 "\005SummaryInformation");
53             fis = new FileInputStream JavaDoc(fileName);
54             r.read(fis);
55             title = mypoi.getTitle();
56             fis.close();
57             fis = null;
58        
59             
60             POIFSReader r1 = new POIFSReader();
61             DocumentPOIFSReaderListener docpoi = new DocumentPOIFSReaderListener();
62             r1.registerListener(docpoi,
63                 "\005DocumentSummaryInformation");
64             
65             fis1 = new FileInputStream JavaDoc(fileName);
66             //
67

68             r1.read(fis1);
69             if (docpoi.getCustomProperties() == null){
70                 properties = "";
71                 return;
72             }
73             customproperties = docpoi.getCustomProperties();
74             createString(customproperties);
75             fis1.close();
76             fis1 = null;
77
78
79             
80             
81         } catch (Throwable JavaDoc e) {
82             
83             
84             try{
85                 
86                 ParserManager.logger.debug("***** File could not be parsed: " + fileName);
87             } catch (Exception JavaDoc ex) {
88                 System.out.println("***** File could not be parsed: " + fileName);
89                 }
90
91         }
92         
93         finally {
94             if (fis != null){
95                 try{
96                 fis.close();
97                 } catch (Exception JavaDoc ex){};
98                 fis = null;
99             }
100             if (fis1 != null){
101                 try{
102                 fis1.close();
103                 } catch (Exception JavaDoc ex){};
104                 fis1 = null;
105             }
106         }
107
108        
109        }
110       
111
112       public void setFileName(String JavaDoc fileName) {
113         this.fileName = fileName;
114       }
115
116       public String JavaDoc getParsedText() {
117         return parsedText;
118       }
119       
120       public String JavaDoc getTitle() {
121         return title;
122       }
123       
124       public String JavaDoc getProperties(){
125         return properties;
126       }
127       
128       private void createString( Map JavaDoc customproperties){
129         Collection JavaDoc values = customproperties.values();
130         Iterator JavaDoc valuesIterator = values.iterator();
131         properties = "";
132         Set JavaDoc keys = customproperties.keySet();
133         Iterator JavaDoc keysIterator = keys.iterator();
134         
135         for (Iterator JavaDoc it= customproperties.entrySet().iterator(); it.hasNext(); ) {
136             Map.Entry JavaDoc entry = (Map.Entry JavaDoc)it.next();
137             properties += entry.getKey().toString() + " = " + entry.getValue() + " , ";
138             }
139         
140         
141     }
142 }
143       
144       class MyPOIFSReaderListener implements POIFSReaderListener
145         {
146             String JavaDoc title;
147             public void processPOIFSReaderEvent(POIFSReaderEvent event)
148             {
149                 SummaryInformation si = null;
150                 try
151                 {
152                     si = (SummaryInformation)
153                     PropertySetFactory.create(event.getStream());
154                 }
155                 catch (Exception JavaDoc ex)
156                 {
157                     throw new RuntimeException JavaDoc
158                         ("Property set stream \"" + event.getPath() +
159                             event.getName() + "\": " + ex);
160                 }
161                 title = si.getTitle();
162             }
163             
164             public String JavaDoc getTitle(){
165                 return title;
166             }
167         }
168       
169       class DocumentPOIFSReaderListener implements POIFSReaderListener
170         { Map JavaDoc customproperties;
171             public void processPOIFSReaderEvent(POIFSReaderEvent event)
172             {
173                 DocumentSummaryInformation dsi = null;
174                 try
175                 {
176                     dsi = (DocumentSummaryInformation)
177                     PropertySetFactory.create(event.getStream());
178                 }
179                 catch (Exception JavaDoc ex)
180                 {
181                     throw new RuntimeException JavaDoc
182                         ("Property set stream \"" + event.getPath() +
183                             event.getName() + "\": " + ex);
184                 }
185                 customproperties = dsi.getCustomProperties();
186             }
187             
188             public Map JavaDoc getCustomProperties(){
189                 return customproperties;
190             }
191             
192            
193             
194         }
195       
Popular Tags