KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > textmining > text > extraction > Word6Extractor


1 /* Copyright 2004 Ryan Ackley
2  *
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */

15
16 package org.textmining.text.extraction;
17
18 import org.apache.poi.util.LittleEndian;
19 import org.apache.poi.hwpf.model.*;
20 import org.textmining.text.extraction.sprm.*;
21 import org.textmining.text.extraction.chp.*;
22
23 import java.util.*;
24 import java.io.*;
25
26 /**
27  * This class is used to extract text from Word 6 documents only. It should
28  * only be called from the org.textmining.text.extraction.WordExtractor because
29  * it will automatically determine the version.
30  *
31  * @author Ryan Ackley
32  */

33 class Word6Extractor
34 {
35
36   public Word6Extractor()
37   {
38   }
39
40   /**
41    * Extracts the text
42    *
43    * @param mainStream The POIFS document stream entitled "WordDocument".
44    *
45    * @return The text from the document
46    * @throws Exception If there are any unexpected exceptions.
47    */

48   public String JavaDoc extractText(byte[] mainStream) throws Exception JavaDoc
49   {
50     int fcMin = LittleEndian.getInt(mainStream, 0x18);
51     int fcMax = LittleEndian.getInt(mainStream, 0x1C);
52
53     int chpTableOffset = LittleEndian.getInt(mainStream, 0xb8);
54     int chpTableSize = LittleEndian.getInt(mainStream, 0xbc);
55
56     // get a list of character properties
57
Word6CHPBinTable chpTable = new Word6CHPBinTable(mainStream, chpTableOffset,
58       chpTableSize, fcMin);
59     List textRuns = chpTable.getTextRuns();
60
61     // iterate through the
62
WordTextBuffer finalTextBuf = new WordTextBuffer();
63     Iterator runsIt = textRuns.iterator();
64     while(runsIt.hasNext())
65     {
66       CHPX chpx = (CHPX)runsIt.next();
67       int runStart = chpx.getStart() + fcMin;
68       int runEnd = chpx.getEnd() + fcMin;
69
70       if (!isDeleted(chpx.getGrpprl()))
71       {
72         String JavaDoc s = new String JavaDoc(mainStream, runStart, Math.min(runEnd, fcMax) - runStart, "Cp1252");
73         finalTextBuf.append(s);
74         if (runEnd >= fcMax)
75         {
76           break;
77         }
78       }
79     }
80
81     return finalTextBuf.toString();
82   }
83
84   /**
85    * Used to determine if a run of text has been deleted.
86    * @param grpprl The list of sprms for this run of text.
87    * @return
88    */

89   private boolean isDeleted(byte[] grpprl)
90   {
91     int offset = 0;
92     boolean deleted = false;
93     while (offset < grpprl.length)
94     {
95       switch (LittleEndian.getUnsignedByte(grpprl, offset++))
96       {
97         case 65:
98           deleted = grpprl[offset++] != 0;
99           break;
100         case 66:
101           offset++;
102           break;
103         case 67:
104           offset++;
105           break;
106         case 68:
107           offset += grpprl[offset];
108           break;
109         case 69:
110           offset += 2;
111           break;
112         case 70:
113           offset += 4;
114           break;
115         case 71:
116           offset++;
117           break;
118         case 72:
119           offset += 2;
120           break;
121         case 73:
122           offset += 3;
123           break;
124         case 74:
125           offset += grpprl[offset];
126           break;
127         case 75:
128           offset++;
129           break;
130         case 80:
131           offset += 2;
132           break;
133         case 81:
134           offset += grpprl[offset];
135           break;
136         case 82:
137           offset += grpprl[offset];
138           break;
139         case 83:
140           break;
141         case 85:
142           offset++;
143           break;
144         case 86:
145           offset++;
146           break;
147         case 87:
148           offset++;
149           break;
150         case 88:
151           offset++;
152           break;
153         case 89:
154           offset++;
155           break;
156         case 90:
157           offset++;
158           break;
159         case 91:
160           offset++;
161           break;
162         case 92:
163           offset++;
164           break;
165         case 93:
166           offset += 2;
167           break;
168         case 94:
169           offset++;
170           break;
171         case 95:
172           offset += 3;
173           break;
174         case 96:
175           offset += 2;
176           break;
177         case 97:
178           offset += 2;
179           break;
180         case 98:
181           offset++;
182           break;
183         case 99:
184           offset++;
185           break;
186         case 100:
187           offset++;
188           break;
189         case 101:
190           offset++;
191           break;
192         case 102:
193           offset++;
194           break;
195         case 103:
196           offset += grpprl[offset];
197           break;
198         case 104:
199           offset++;
200           break;
201         case 105:
202           offset += grpprl[offset];
203           break;
204         case 106:
205           offset += grpprl[offset];
206           break;
207         case 107:
208           offset += 2;
209           break;
210         case 108:
211           offset += grpprl[offset];
212           break;
213         case 109:
214           offset += 2;
215           break;
216         case 110:
217           offset += 2;
218           break;
219         case 117:
220           offset++;
221           break;
222         case 118:
223           offset++;
224           break;
225
226       }
227     }
228     return deleted;
229   }
230 }
Popular Tags