1 15 16 package org.textmining.text.extraction; 17 18 import org.apache.poi.util.LittleEndian; 19 import org.apache.poi.hwpf.model.*; 20 import org.textmining.text.extraction.sprm.*; 21 import org.textmining.text.extraction.chp.*; 22 23 import java.util.*; 24 import java.io.*; 25 26 33 class Word6Extractor 34 { 35 36 public Word6Extractor() 37 { 38 } 39 40 48 public String extractText(byte[] mainStream) throws Exception 49 { 50 int fcMin = LittleEndian.getInt(mainStream, 0x18); 51 int fcMax = LittleEndian.getInt(mainStream, 0x1C); 52 53 int chpTableOffset = LittleEndian.getInt(mainStream, 0xb8); 54 int chpTableSize = LittleEndian.getInt(mainStream, 0xbc); 55 56 Word6CHPBinTable chpTable = new Word6CHPBinTable(mainStream, chpTableOffset, 58 chpTableSize, fcMin); 59 List textRuns = chpTable.getTextRuns(); 60 61 WordTextBuffer finalTextBuf = new WordTextBuffer(); 63 Iterator runsIt = textRuns.iterator(); 64 while(runsIt.hasNext()) 65 { 66 CHPX chpx = (CHPX)runsIt.next(); 67 int runStart = chpx.getStart() + fcMin; 68 int runEnd = chpx.getEnd() + fcMin; 69 70 if (!isDeleted(chpx.getGrpprl())) 71 { 72 String s = new String (mainStream, runStart, Math.min(runEnd, fcMax) - runStart, "Cp1252"); 73 finalTextBuf.append(s); 74 if (runEnd >= fcMax) 75 { 76 break; 77 } 78 } 79 } 80 81 return finalTextBuf.toString(); 82 } 83 84 89 private boolean isDeleted(byte[] grpprl) 90 { 91 int offset = 0; 92 boolean deleted = false; 93 while (offset < grpprl.length) 94 { 95 switch (LittleEndian.getUnsignedByte(grpprl, offset++)) 96 { 97 case 65: 98 deleted = grpprl[offset++] != 0; 99 break; 100 case 66: 101 offset++; 102 break; 103 case 67: 104 offset++; 105 break; 106 case 68: 107 offset += grpprl[offset]; 108 break; 109 case 69: 110 offset += 2; 111 break; 112 case 70: 113 offset += 4; 114 break; 115 case 71: 116 offset++; 117 break; 118 case 72: 119 offset += 2; 120 break; 121 case 73: 122 offset += 3; 123 break; 124 case 74: 125 offset += grpprl[offset]; 126 break; 127 case 75: 128 offset++; 129 break; 130 case 80: 131 offset += 2; 132 break; 133 case 81: 134 offset += grpprl[offset]; 135 break; 136 case 82: 137 offset += grpprl[offset]; 138 break; 139 case 83: 140 break; 141 case 85: 142 offset++; 143 break; 144 case 86: 145 offset++; 146 break; 147 case 87: 148 offset++; 149 break; 150 case 88: 151 offset++; 152 break; 153 case 89: 154 offset++; 155 break; 156 case 90: 157 offset++; 158 break; 159 case 91: 160 offset++; 161 break; 162 case 92: 163 offset++; 164 break; 165 case 93: 166 offset += 2; 167 break; 168 case 94: 169 offset++; 170 break; 171 case 95: 172 offset += 3; 173 break; 174 case 96: 175 offset += 2; 176 break; 177 case 97: 178 offset += 2; 179 break; 180 case 98: 181 offset++; 182 break; 183 case 99: 184 offset++; 185 break; 186 case 100: 187 offset++; 188 break; 189 case 101: 190 offset++; 191 break; 192 case 102: 193 offset++; 194 break; 195 case 103: 196 offset += grpprl[offset]; 197 break; 198 case 104: 199 offset++; 200 break; 201 case 105: 202 offset += grpprl[offset]; 203 break; 204 case 106: 205 offset += grpprl[offset]; 206 break; 207 case 107: 208 offset += 2; 209 break; 210 case 108: 211 offset += grpprl[offset]; 212 break; 213 case 109: 214 offset += 2; 215 break; 216 case 110: 217 offset += 2; 218 break; 219 case 117: 220 offset++; 221 break; 222 case 118: 223 offset++; 224 break; 225 226 } 227 } 228 return deleted; 229 } 230 } | Popular Tags |