KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > parse > msword > Word6Extractor


1 /* Copyright 2004 Ryan Ackley
2  *
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */

15
16 package net.nutch.parse.msword;
17
18 import net.nutch.parse.msword.chp.*;
19
20 import org.apache.poi.util.LittleEndian;
21 import org.apache.poi.hwpf.model.*;
22 import org.apache.poi.hwpf.sprm.*;
23
24 import java.util.*;
25 import java.io.*;
26
27 /**
28  * This class is used to extract text from Word 6 documents only. It should
29  * only be called from the org.textmining.text.extraction.WordExtractor because
30  * it will automatically determine the version.
31  *
32  * @author Ryan Ackley
33  */

34 class Word6Extractor
35 {
36
37   public Word6Extractor()
38   {
39   }
40
41   /**
42    * Extracts the text
43    *
44    * @param mainStream The POIFS document stream entitled "WordDocument".
45    *
46    * @return The text from the document
47    * @throws Exception If there are any unexpected exceptions.
48    */

49   public String JavaDoc extractText(byte[] mainStream) throws Exception JavaDoc
50   {
51     int fcMin = LittleEndian.getInt(mainStream, 0x18);
52     int fcMax = LittleEndian.getInt(mainStream, 0x1C);
53
54     int chpTableOffset = LittleEndian.getInt(mainStream, 0xb8);
55     int chpTableSize = LittleEndian.getInt(mainStream, 0xbc);
56
57     // get a list of character properties
58
Word6CHPBinTable chpTable = new Word6CHPBinTable(mainStream, chpTableOffset,
59       chpTableSize, fcMin);
60     List textRuns = chpTable.getTextRuns();
61
62     // iterate through the
63
WordTextBuffer finalTextBuf = new WordTextBuffer();
64     Iterator runsIt = textRuns.iterator();
65     while(runsIt.hasNext())
66     {
67       CHPX chpx = (CHPX)runsIt.next();
68       int runStart = chpx.getStart() + fcMin;
69       int runEnd = chpx.getEnd() + fcMin;
70
71       if (!isDeleted(chpx.getGrpprl()))
72       {
73         String JavaDoc s = new String JavaDoc(mainStream, runStart, Math.min(runEnd, fcMax) - runStart, "Cp1252");
74         finalTextBuf.append(s);
75         if (runEnd >= fcMax)
76         {
77           break;
78         }
79       }
80     }
81
82     return finalTextBuf.toString();
83   }
84
85   /**
86    * Used to determine if a run of text has been deleted.
87    * @param grpprl The list of sprms for this run of text.
88    * @return
89    */

90   private boolean isDeleted(byte[] grpprl)
91   {
92     int offset = 0;
93     boolean deleted = false;
94     while (offset < grpprl.length)
95     {
96       switch (LittleEndian.getUnsignedByte(grpprl, offset++))
97       {
98         case 65:
99           deleted = grpprl[offset++] != 0;
100           break;
101         case 66:
102           offset++;
103           break;
104         case 67:
105           offset++;
106           break;
107         case 68:
108           offset += grpprl[offset];
109           break;
110         case 69:
111           offset += 2;
112           break;
113         case 70:
114           offset += 4;
115           break;
116         case 71:
117           offset++;
118           break;
119         case 72:
120           offset += 2;
121           break;
122         case 73:
123           offset += 3;
124           break;
125         case 74:
126           offset += grpprl[offset];
127           break;
128         case 75:
129           offset++;
130           break;
131         case 80:
132           offset += 2;
133           break;
134         case 81:
135           offset += grpprl[offset];
136           break;
137         case 82:
138           offset += grpprl[offset];
139           break;
140         case 83:
141           break;
142         case 85:
143           offset++;
144           break;
145         case 86:
146           offset++;
147           break;
148         case 87:
149           offset++;
150           break;
151         case 88:
152           offset++;
153           break;
154         case 89:
155           offset++;
156           break;
157         case 90:
158           offset++;
159           break;
160         case 91:
161           offset++;
162           break;
163         case 92:
164           offset++;
165           break;
166         case 93:
167           offset += 2;
168           break;
169         case 94:
170           offset++;
171           break;
172         case 95:
173           offset += 3;
174           break;
175         case 96:
176           offset += 2;
177           break;
178         case 97:
179           offset += 2;
180           break;
181         case 98:
182           offset++;
183           break;
184         case 99:
185           offset++;
186           break;
187         case 100:
188           offset++;
189           break;
190         case 101:
191           offset++;
192           break;
193         case 102:
194           offset++;
195           break;
196         case 103:
197           offset += grpprl[offset];
198           break;
199         case 104:
200           offset++;
201           break;
202         case 105:
203           offset += grpprl[offset];
204           break;
205         case 106:
206           offset += grpprl[offset];
207           break;
208         case 107:
209           offset += 2;
210           break;
211         case 108:
212           offset += grpprl[offset];
213           break;
214         case 109:
215           offset += 2;
216           break;
217         case 110:
218           offset += 2;
219           break;
220         case 117:
221           offset++;
222           break;
223         case 118:
224           offset++;
225           break;
226
227       }
228     }
229     return deleted;
230   }
231 }
232
Popular Tags