KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > parse > msword > WordExtractor


1 /* Copyright 2004 Ryan Ackley
2  *
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */

15 package net.nutch.parse.msword;
16
17 import org.apache.poi.hpsf.*;
18 import org.apache.poi.hwpf.model.*;
19 import org.apache.poi.hwpf.sprm.*;
20 import org.apache.poi.poifs.eventfilesystem.*;
21 import org.apache.poi.poifs.filesystem.*;
22 import org.apache.poi.util.LittleEndian;
23
24 import java.util.*;
25 import java.io.*;
26
27 /**
28  * This class extracts the text from a Word 6.0/95/97/2000/XP word doc
29  *
30  * @author Ryan Ackley
31  *
32  * @author Andy Hedges
33  * code to extract all msword properties.
34  *
35  */

36 public class WordExtractor
37 {
38
/**
 * Creates an extractor. The extractor keeps no state of its own, so
 * construction does nothing beyond the implicit superclass initialisation.
 */
public WordExtractor()
{
  super();
}
45
46   /**
47    * Gets the text from a Word document.
48    *
49    * @param in The InputStream representing the Word file.
50    */

51   public String JavaDoc extractText(InputStream in) throws Exception JavaDoc
52   {
53     ArrayList text = new ArrayList();
54     POIFSFileSystem fsys = new POIFSFileSystem(in);
55
56     // load our POIFS document streams.
57
DocumentEntry headerProps =
58         (DocumentEntry)fsys.getRoot().getEntry("WordDocument");
59     DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
60     byte[] header = new byte[headerProps.getSize()];
61
62
63     din.read(header);
64     din.close();
65
66     int info = LittleEndian.getShort(header, 0xa);
67     if ((info & 0x4) != 0)
68     {
69       throw new FastSavedException("Fast-saved files are unsupported at this time");
70     }
71     if ((info & 0x100) != 0)
72     {
73       throw new PasswordProtectedException("This document is password protected");
74     }
75
76     // determine the version of Word this document came from.
77
int nFib = LittleEndian.getShort(header, 0x2);
78     switch (nFib)
79     {
80       case 101:
81       case 102:
82       case 103:
83       case 104:
84         // this is a Word 6.0 doc send it to the extractor for that version.
85
Word6Extractor oldExtractor = new Word6Extractor();
86         return oldExtractor.extractText(header);
87     }
88
89     //Get the information we need from the header
90
boolean useTable1 = (info & 0x200) != 0;
91
92     //get the location of the piece table
93
int complexOffset = LittleEndian.getInt(header, 0x1a2);
94
95     // determine which table stream we must use.
96
String JavaDoc tableName = null;
97     if (useTable1)
98     {
99       tableName = "1Table";
100     }
101     else
102     {
103       tableName = "0Table";
104     }
105
106     DocumentEntry table = (DocumentEntry)fsys.getRoot().getEntry(tableName);
107     byte[] tableStream = new byte[table.getSize()];
108
109     din = fsys.createDocumentInputStream(tableName);
110
111     din.read(tableStream);
112     din.close();
113
114     int chpOffset = LittleEndian.getInt(header, 0xfa);
115     int chpSize = LittleEndian.getInt(header, 0xfe);
116     int fcMin = LittleEndian.getInt(header, 0x18);
117     CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);
118
119     // load our text pieces and our character runs
120
ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
121     TextPieceTable tpt = cft.getTextPieceTable();
122     List textPieces = tpt.getTextPieces();
123
124     // make the POIFS objects available for garbage collection
125
din = null;
126     fsys = null;
127     table = null;
128     headerProps = null;
129
130     List textRuns = cbt.getTextRuns();
131     Iterator runIt = textRuns.iterator();
132     Iterator textIt = textPieces.iterator();
133
134     TextPiece currentPiece = (TextPiece)textIt.next();
135     int currentTextStart = currentPiece.getStart();
136     int currentTextEnd = currentPiece.getEnd();
137
138     WordTextBuffer finalTextBuf = new WordTextBuffer();
139
140     // iterate through all text runs extract the text only if they haven't been
141
// deleted
142
while (runIt.hasNext())
143     {
144       CHPX chpx = (CHPX)runIt.next();
145       boolean deleted = isDeleted(chpx.getGrpprl());
146       if (deleted)
147       {
148         continue;
149       }
150
151       int runStart = chpx.getStart();
152       int runEnd = chpx.getEnd();
153
154       while (runStart >= currentTextEnd)
155       {
156         currentPiece = (TextPiece) textIt.next ();
157         currentTextStart = currentPiece.getStart ();
158         currentTextEnd = currentPiece.getEnd ();
159       }
160
161       if (runEnd < currentTextEnd)
162       {
163         String JavaDoc str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
164         finalTextBuf.append(str);
165       }
166       else if (runEnd > currentTextEnd)
167       {
168         while (runEnd > currentTextEnd)
169         {
170           String JavaDoc str = currentPiece.substring(runStart - currentTextStart,
171                                    currentTextEnd - currentTextStart);
172           finalTextBuf.append(str);
173           if (textIt.hasNext())
174           {
175             currentPiece = (TextPiece) textIt.next ();
176             currentTextStart = currentPiece.getStart ();
177             runStart = currentTextStart;
178             currentTextEnd = currentPiece.getEnd ();
179           }
180           else
181           {
182             return finalTextBuf.toString();
183           }
184         }
185         String JavaDoc str = currentPiece.substring(0, runEnd - currentTextStart);
186         finalTextBuf.append(str);
187       }
188       else
189       {
190         String JavaDoc str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
191         if (textIt.hasNext())
192         {
193           currentPiece = (TextPiece) textIt.next();
194           currentTextStart = currentPiece.getStart();
195           currentTextEnd = currentPiece.getEnd();
196         }
197         finalTextBuf.append(str);
198       }
199     }
200     return finalTextBuf.toString();
201   }
202
203   /**
204    * Used to determine if a run of text has been deleted.
205    *
206    * @param grpprl The list of sprms for a particular run of text.
207    * @return true if this run of text has been deleted.
208    */

209   private boolean isDeleted(byte[] grpprl)
210   {
211     SprmIterator iterator = new SprmIterator(grpprl,0);
212     while (iterator.hasNext())
213     {
214       SprmOperation op = iterator.next();
215       // 0 is the operation that signals a FDelRMark operation
216
if (op.getOperation() == 0 && op.getOperand() != 0)
217       {
218         return true;
219       }
220     }
221     return false;
222   }
223
224   public Properties extractProperties(InputStream in)
225                       throws IOException {
226
227     PropertiesBroker propertiesBroker = new PropertiesBroker();
228     POIFSReader reader = new POIFSReader();
229     reader.registerListener(new PropertiesReaderListener(propertiesBroker),
230                             "\005SummaryInformation");
231     reader.read(in);
232     return propertiesBroker.getProperties();
233   }
234
235   class PropertiesReaderListener
236     implements POIFSReaderListener {
237
238     private PropertiesBroker propertiesBroker;
239     private Properties metaData = new Properties();
240
241     public PropertiesReaderListener(PropertiesBroker propertiesBroker) {
242       this.propertiesBroker = propertiesBroker;
243     }
244
245     public void processPOIFSReaderEvent(POIFSReaderEvent event) {
246
247       SummaryInformation si = null;
248       Properties properties = new Properties();
249
250       try {
251         si = (SummaryInformation)PropertySetFactory.create(event.getStream());
252       } catch (Exception JavaDoc ex) {
253         properties = null;
254       }
255
256       Date tmp = null;
257
258       String JavaDoc title = si.getTitle();
259       String JavaDoc applicationName = si.getApplicationName();
260       String JavaDoc author = si.getAuthor();
261       int charCount = si.getCharCount();
262       String JavaDoc comments = si.getComments();
263       Date createDateTime = si.getCreateDateTime();
264       long editTime = si.getEditTime();
265       String JavaDoc keywords = si.getKeywords();
266       String JavaDoc lastAuthor = si.getLastAuthor();
267       Date lastPrinted = si.getLastPrinted();
268       Date lastSaveDateTime = si.getLastSaveDateTime();
269       int pageCount = si.getPageCount();
270       String JavaDoc revNumber = si.getRevNumber();
271       int security = si.getSecurity();
272       String JavaDoc subject = si.getSubject();
273       String JavaDoc template = si.getTemplate();
274       int wordCount = si.getWordCount();
275
276       /*Dates are being stored in millis since the epoch to aid
277       localization*/

278       if(title != null)
279         properties.setProperty("Title", title);
280       if(applicationName != null)
281         properties.setProperty("Application-Name", applicationName);
282       if(author != null)
283         properties.setProperty("Author", author);
284       if(charCount != 0)
285         properties.setProperty("Character Count", charCount + "");
286       if(comments != null)
287         properties.setProperty("Comments", comments);
288       if(createDateTime != null)
289         properties.setProperty("Creation-Date", createDateTime.getTime() + "");
290       if(editTime != 0)
291         properties.setProperty("Edit-Time", editTime + "");
292       if(keywords != null)
293         properties.setProperty("Keywords", keywords);
294       if(lastAuthor != null)
295         properties.setProperty("Last-Author", lastAuthor);
296       if(lastPrinted != null)
297         properties.setProperty("Last-Printed", lastPrinted.getTime() + "");
298       if(lastSaveDateTime != null)
299         properties.setProperty("Last-Save-Date", lastSaveDateTime.getTime() + "");
300       if(pageCount != 0)
301         properties.setProperty("Page-Count", pageCount + "");
302       if(revNumber != null)
303         properties.setProperty("Revision-Number", revNumber);
304       if(security != 0)
305         properties.setProperty("Security", security + "");
306       if(subject != null)
307         properties.setProperty("Subject", subject);
308       if(template != null)
309         properties.setProperty("Template", template);
310       if(wordCount != 0)
311         properties.setProperty("Word-Count", wordCount + "");
312       propertiesBroker.setProperties(properties);
313
314       //si.getThumbnail(); // can't think of a sensible way of turning this into a string.
315
}
316   }
317
/**
 * Hand-off point between the listener that produces the properties and the
 * caller that wants them. A caller waits a bounded time for a value to be
 * published.
 */
class PropertiesBroker {

  private Properties properties;
  private int timeoutMillis = 2 * 1000;

  /**
   * Returns the published properties, waiting up to the timeout for them to
   * arrive.
   *
   * <p>If the calling thread is interrupted while waiting, the wait ends
   * immediately and the thread's interrupt flag is left set so the caller
   * can react to it.
   *
   * @return the published properties, or null if none were published before
   *         the timeout elapsed or the wait was interrupted.
   */
  public synchronized Properties getProperties() {

    long deadline = System.currentTimeMillis() + timeoutMillis;
    long remaining = timeoutMillis;

    // loop guards against spurious wake-ups and publications of null
    while (properties == null && remaining > 0) {
      try {
        wait(remaining);
      } catch (InterruptedException e) {
        // do not swallow the interrupt: restore the flag and stop waiting
        Thread.currentThread().interrupt();
        break;
      }
      remaining = deadline - System.currentTimeMillis();
    }

    return properties;
  }

  /**
   * Publishes the properties and wakes every thread waiting in
   * {@link #getProperties()}.
   *
   * @param properties the value to publish.
   */
  public synchronized void setProperties(Properties properties) {
    this.properties = properties;
    notifyAll();
  }
}
346 }
347
348
Popular Tags