KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > textmining > text > extraction > WordExtractor


/* Copyright 2004 Ryan Ackley
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

15 package org.textmining.text.extraction;
16
17 import org.apache.poi.poifs.filesystem.*;
18 import org.apache.poi.util.LittleEndian;
19 import org.apache.poi.hwpf.model.*;
20 import org.textmining.text.extraction.sprm.*;
21
22 import java.util.*;
23 import java.io.*;
24
25 /**
26  * This class extracts the text from a Word 6.0/95/97/2000/XP word doc
27  *
28  * @author Ryan Ackley
29  */

30 public class WordExtractor
31 {
32
33   /**
34    * Constructor
35    */

36   public WordExtractor()
37   {
38   }
39
40   /**
41    * Gets the text from a Word document.
42    *
43    * @param in The InputStream representing the Word file.
44    */

45   public String JavaDoc extractText(InputStream in) throws Exception JavaDoc
46   {
47       WordTextBuffer finalTextBuf = null;
48       try{
49     ArrayList text = new ArrayList();
50     POIFSFileSystem fsys = new POIFSFileSystem(in);
51
52     // load our POIFS document streams.
53
DocumentEntry headerProps =
54         (DocumentEntry)fsys.getRoot().getEntry("WordDocument");
55     DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
56     byte[] header = new byte[headerProps.getSize()];
57
58
59     din.read(header);
60     din.close();
61
62     int info = LittleEndian.getShort(header, 0xa);
63     if ((info & 0x4) != 0)
64     {
65       throw new FastSavedException("Fast-saved files are unsupported at this time");
66     }
67     if ((info & 0x100) != 0)
68     {
69       throw new PasswordProtectedException("This document is password protected");
70     }
71
72     // determine the version of Word this document came from.
73
int nFib = LittleEndian.getShort(header, 0x2);
74     switch (nFib)
75     {
76       case 101:
77       case 102:
78       case 103:
79       case 104:
80         // this is a Word 6.0 doc send it to the extractor for that version.
81
Word6Extractor oldExtractor = new Word6Extractor();
82         return oldExtractor.extractText(header);
83     }
84
85     //Get the information we need from the header
86
boolean useTable1 = (info & 0x200) != 0;
87
88     //get the location of the piece table
89
int complexOffset = LittleEndian.getInt(header, 0x1a2);
90
91     // determine which table stream we must use.
92
String JavaDoc tableName = null;
93     if (useTable1)
94     {
95       tableName = "1Table";
96     }
97     else
98     {
99       tableName = "0Table";
100     }
101
102     DocumentEntry table = (DocumentEntry)fsys.getRoot().getEntry(tableName);
103     byte[] tableStream = new byte[table.getSize()];
104
105     din = fsys.createDocumentInputStream(tableName);
106
107     din.read(tableStream);
108     din.close();
109
110     int chpOffset = LittleEndian.getInt(header, 0xfa);
111     int chpSize = LittleEndian.getInt(header, 0xfe);
112     int fcMin = LittleEndian.getInt(header, 0x18);
113     CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);
114
115     // load our text pieces and our character runs
116
ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
117     TextPieceTable tpt = cft.getTextPieceTable();
118     List textPieces = tpt.getTextPieces();
119
120     // make the POIFS objects available for garbage collection
121
din = null;
122     fsys = null;
123     table = null;
124     headerProps = null;
125
126     List textRuns = cbt.getTextRuns();
127     Iterator runIt = textRuns.iterator();
128     Iterator textIt = textPieces.iterator();
129
130     TextPiece currentPiece = (TextPiece)textIt.next();
131     int currentTextStart = currentPiece.getStart();
132     int currentTextEnd = currentPiece.getEnd();
133
134     finalTextBuf = new WordTextBuffer();
135
136     // iterate through all text runs extract the text only if they haven't been
137
// deleted
138
while (runIt.hasNext())
139     {
140       CHPX chpx = (CHPX)runIt.next();
141       boolean deleted = isDeleted(chpx.getGrpprl());
142       if (deleted)
143       {
144         continue;
145       }
146
147       int runStart = chpx.getStart();
148       int runEnd = chpx.getEnd();
149
150       while (runStart >= currentTextEnd)
151       {
152         currentPiece = (TextPiece) textIt.next ();
153         currentTextStart = currentPiece.getStart ();
154         currentTextEnd = currentPiece.getEnd ();
155       }
156
157       if (runEnd < currentTextEnd)
158       {
159         String JavaDoc str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
160         finalTextBuf.append(str);
161       }
162       else if (runEnd > currentTextEnd)
163       {
164         while (runEnd > currentTextEnd)
165         {
166           String JavaDoc str = currentPiece.substring(runStart - currentTextStart,
167                                    currentTextEnd - currentTextStart);
168           finalTextBuf.append(str);
169           if (textIt.hasNext())
170           {
171             currentPiece = (TextPiece) textIt.next ();
172             currentTextStart = currentPiece.getStart ();
173             runStart = currentTextStart;
174             currentTextEnd = currentPiece.getEnd ();
175           }
176           else
177           {
178             return finalTextBuf.toString();
179           }
180         }
181         String JavaDoc str = currentPiece.substring(0, runEnd - currentTextStart);
182         finalTextBuf.append(str);
183       }
184       else
185       {
186         String JavaDoc str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
187         if (textIt.hasNext())
188         {
189           currentPiece = (TextPiece) textIt.next();
190           currentTextStart = currentPiece.getStart();
191           currentTextEnd = currentPiece.getEnd();
192         }
193         finalTextBuf.append(str);
194       }
195     }
196     return finalTextBuf.toString();
197     
198   
199       } catch (Throwable JavaDoc e){
200       return finalTextBuf.toString();
201     }
202   }
203
204   /**
205    * Used to determine if a run of text has been deleted.
206    *
207    * @param grpprl The list of sprms for a particular run of text.
208    * @return true if this run of text has been deleted.
209    */

210   private boolean isDeleted(byte[] grpprl)
211   {
212     SprmIterator iterator = new SprmIterator(grpprl);
213     while (iterator.hasNext())
214     {
215       SprmOperation op = iterator.next();
216       // 0 is the operation that signals a FDelRMark operation
217
if (op.getOperation() == 0 && op.getOperand() != 0)
218       {
219         return true;
220       }
221     }
222     return false;
223   }
224 }
Popular Tags