WordExtractor


1   /*  Copyright 2004 Ryan Ackley
2    *
3    *  Licensed under the Apache License, Version 2.0 (the "License");
4    *  you may not use this file except in compliance with the License.
5    *  You may obtain a copy of the License at
6    *
7    *      http://www.apache.org/licenses/LICENSE-2.0
8    *
9    *  Unless required by applicable law or agreed to in writing, software
10   *  distributed under the License is distributed on an "AS IS" BASIS,
11   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   *  See the License for the specific language governing permissions and
13   *  limitations under the License.
14   */
15  package org.textmining.text.extraction;
16  
17  import org.apache.poi.poifs.filesystem.*;
18  import org.apache.poi.util.LittleEndian;
19  import org.apache.poi.hwpf.model.*;
20  import org.textmining.text.extraction.sprm.*;
21  
22  import java.util.*;
23  import java.io.*;
24  
25  /**
26   * This class extracts the text from a Word 6.0/95/97/2000/XP word doc
27   *
28   * @author Ryan Ackley
29   */
30  public class WordExtractor
31  {
32  
33    /**
34     * Constructor
35     */
36    public WordExtractor()
37    {
38    }
39  
40    /**
41     * Gets the text from a Word document.
42     *
43     * @param in The InputStream representing the Word file.
44     */
45    public String   extractText(InputStream in) throws Exception  
46    {
47        WordTextBuffer finalTextBuf = null;
48        try{
49      ArrayList text = new ArrayList();
50      POIFSFileSystem fsys = new POIFSFileSystem(in);
51  
52      // load our POIFS document streams.
53      DocumentEntry headerProps =
54          (DocumentEntry)fsys.getRoot().getEntry("WordDocument");
55      DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
56      byte[] header = new byte[headerProps.getSize()];
57  
58  
59      din.read(header);
60      din.close();
61  
62      int info = LittleEndian.getShort(header, 0xa);
63      if ((info & 0x4) != 0)
64      {
65        throw new FastSavedException("Fast-saved files are unsupported at this time");
66      }
67      if ((info & 0x100) != 0)
68      {
69        throw new PasswordProtectedException("This document is password protected");
70      }
71  
72      // determine the version of Word this document came from.
73      int nFib = LittleEndian.getShort(header, 0x2);
74      switch (nFib)
75      {
76        case 101:
77        case 102:
78        case 103:
79        case 104:
80          // this is a Word 6.0 doc send it to the extractor for that version.
81          Word6Extractor oldExtractor = new Word6Extractor();
82          return oldExtractor.extractText(header);
83      }
84  
85      //Get the information we need from the header
86      boolean useTable1 = (info & 0x200) != 0;
87  
88      //get the location of the piece table
89      int complexOffset = LittleEndian.getInt(header, 0x1a2);
90  
91      // determine which table stream we must use.
92      String   tableName = null;
93      if (useTable1)
94      {
95        tableName = "1Table";
96      }
97      else
98      {
99        tableName = "0Table";
100     }
101 
102     DocumentEntry table = (DocumentEntry)fsys.getRoot().getEntry(tableName);
103     byte[] tableStream = new byte[table.getSize()];
104 
105     din = fsys.createDocumentInputStream(tableName);
106 
107     din.read(tableStream);
108     din.close();
109 
110     int chpOffset = LittleEndian.getInt(header, 0xfa);
111     int chpSize = LittleEndian.getInt(header, 0xfe);
112     int fcMin = LittleEndian.getInt(header, 0x18);
113     CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);
114 
115     // load our text pieces and our character runs
116     ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
117     TextPieceTable tpt = cft.getTextPieceTable();
118     List textPieces = tpt.getTextPieces();
119 
120     // make the POIFS objects available for garbage collection
121     din = null;
122     fsys = null;
123     table = null;
124     headerProps = null;
125 
126     List textRuns = cbt.getTextRuns();
127     Iterator runIt = textRuns.iterator();
128     Iterator textIt = textPieces.iterator();
129 
130     TextPiece currentPiece = (TextPiece)textIt.next();
131     int currentTextStart = currentPiece.getStart();
132     int currentTextEnd = currentPiece.getEnd();
133 
134     finalTextBuf = new WordTextBuffer();
135 
136     // iterate through all text runs extract the text only if they haven't been
137     // deleted
138     while (runIt.hasNext())
139     {
140       CHPX chpx = (CHPX)runIt.next();
141       boolean deleted = isDeleted(chpx.getGrpprl());
142       if (deleted)
143       {
144         continue;
145       }
146 
147       int runStart = chpx.getStart();
148       int runEnd = chpx.getEnd();
149 
150       while (runStart >= currentTextEnd)
151       {
152         currentPiece = (TextPiece) textIt.next ();
153         currentTextStart = currentPiece.getStart ();
154         currentTextEnd = currentPiece.getEnd ();
155       }
156 
157       if (runEnd < currentTextEnd)
158       {
159         String   str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
160         finalTextBuf.append(str);
161       }
162       else if (runEnd > currentTextEnd)
163       {
164         while (runEnd > currentTextEnd)
165         {
166           String   str = currentPiece.substring(runStart - currentTextStart,
167                                    currentTextEnd - currentTextStart);
168           finalTextBuf.append(str);
169           if (textIt.hasNext())
170           {
171             currentPiece = (TextPiece) textIt.next ();
172             currentTextStart = currentPiece.getStart ();
173             runStart = currentTextStart;
174             currentTextEnd = currentPiece.getEnd ();
175           }
176           else
177           {
178             return finalTextBuf.toString();
179           }
180         }
181         String   str = currentPiece.substring(0, runEnd - currentTextStart);
182         finalTextBuf.append(str);
183       }
184       else
185       {
186         String   str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
187         if (textIt.hasNext())
188         {
189           currentPiece = (TextPiece) textIt.next();
190           currentTextStart = currentPiece.getStart();
191           currentTextEnd = currentPiece.getEnd();
192         }
193         finalTextBuf.append(str);
194       }
195     }
196     return finalTextBuf.toString();
197     
198   
199       } catch (Throwable   e){
200       return finalTextBuf.toString();
201     }
202   }
203 
204   /**
205    * Used to determine if a run of text has been deleted.
206    *
207    * @param grpprl The list of sprms for a particular run of text.
208    * @return true if this run of text has been deleted.
209    */
210   private boolean isDeleted(byte[] grpprl)
211   {
212     SprmIterator iterator = new SprmIterator(grpprl);
213     while (iterator.hasNext())
214     {
215       SprmOperation op = iterator.next();
216       // 0 is the operation that signals a FDelRMark operation
217       if (op.getOperation() == 0 && op.getOperand() != 0)
218       {
219         return true;
220       }
221     }
222     return false;
223   }
224 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags