package net.nutch.parse.msword;

import net.nutch.parse.msword.chp.*;

import org.apache.poi.util.LittleEndian;
import org.apache.poi.hwpf.model.*;
import org.apache.poi.hwpf.sprm.*;

import java.util.*;
import java.io.*;

/**
 * Extracts the text of a document from its main stream by walking the
 * character-property runs (CHPX) listed in a {@link Word6CHPBinTable} and
 * concatenating the bytes of every run that is not flagged as deleted.
 */
class Word6Extractor
{

  /** Offset in the main stream of the int read as {@code fcMin}. */
  private static final int FC_MIN_OFFSET = 0x18;

  /** Offset in the main stream of the int read as {@code fcMax}. */
  private static final int FC_MAX_OFFSET = 0x1C;

  /** Offset in the main stream of the int holding the CHP bin table offset. */
  private static final int CHP_TABLE_OFFSET = 0xb8;

  /** Offset in the main stream of the int holding the CHP bin table size. */
  private static final int CHP_TABLE_SIZE_OFFSET = 0xbc;

  /** Charset name used to decode the text bytes of each run. */
  private static final String TEXT_ENCODING = "Cp1252";

  public Word6Extractor()
  {
  }

  /**
   * Extracts the text from the given main stream.
   *
   * Runs are processed in the order returned by the CHP bin table; the scan
   * stops at the first run that reaches the end of the text (whether or not
   * that run is deleted). Text is never read past {@code fcMax} nor past the
   * end of {@code mainStream}.
   *
   * @param mainStream the bytes of the document's main stream.
   * @return the concatenated text of all runs that are not marked deleted.
   * @throws Exception if the stream cannot be parsed or decoded.
   */
  public String extractText(byte[] mainStream) throws Exception
  {
    int fcMin = LittleEndian.getInt(mainStream, FC_MIN_OFFSET);
    int fcMax = LittleEndian.getInt(mainStream, FC_MAX_OFFSET);

    int chpTableOffset = LittleEndian.getInt(mainStream, CHP_TABLE_OFFSET);
    int chpTableSize = LittleEndian.getInt(mainStream, CHP_TABLE_SIZE_OFFSET);

    Word6CHPBinTable chpTable = new Word6CHPBinTable(mainStream, chpTableOffset,
                                                     chpTableSize, fcMin);
    List textRuns = chpTable.getTextRuns();

    // Never decode beyond the buffer, even if the header claims a larger fcMax.
    int textLimit = Math.min(fcMax, mainStream.length);

    WordTextBuffer finalTextBuf = new WordTextBuffer();

    Iterator runsIt = textRuns.iterator();
    while (runsIt.hasNext())
    {
      CHPX chpx = (CHPX)runsIt.next();
      int runStart = chpx.getStart() + fcMin;
      int runEnd = chpx.getEnd() + fcMin;
      int length = Math.min(runEnd, textLimit) - runStart;

      // A non-positive length means the run lies at or beyond the end of the
      // text; decoding it would throw, so it is skipped.
      if (length > 0 && !isDeleted(chpx.getGrpprl()))
      {
        finalTextBuf.append(new String(mainStream, runStart, length, TEXT_ENCODING));
      }

      // This check must not depend on the deleted flag: a deleted run that
      // reaches the end of the text also terminates the scan.
      if (runEnd >= textLimit)
      {
        break;
      }
    }

    return finalTextBuf.toString();
  }

  /**
   * Scans a run's grpprl (a sequence of one-byte opcodes, each followed by
   * its operand) and reports whether the run is flagged as deleted.
   *
   * Opcode 65 carries a one-byte flag; a non-zero value marks the run as
   * deleted (presumably sprmCFStrikeRM in the Word 6 sprm table - confirm).
   * Every other known opcode is only skipped over. If several opcode-65
   * entries are present the last one wins. A grpprl that ends in the middle
   * of an entry is treated as ending there.
   *
   * @param grpprl the property modifiers of a character run; may be null.
   * @return true if the run is marked deleted, false otherwise.
   */
  private boolean isDeleted(byte[] grpprl)
  {
    if (grpprl == null)
    {
      return false;
    }

    int offset = 0;
    boolean deleted = false;
    while (offset < grpprl.length)
    {
      switch (LittleEndian.getUnsignedByte(grpprl, offset++))
      {
        // The deleted flag itself.
        case 65:
          if (offset >= grpprl.length)
          {
            return deleted; // flag byte missing: truncated grpprl
          }
          deleted = grpprl[offset++] != 0;
          break;

        // Opcodes followed by a one-byte operand.
        case 66: case 67: case 71: case 75:
        case 85: case 86: case 87: case 88:
        case 89: case 90: case 91: case 92:
        case 94:
        case 98: case 99: case 100: case 101: case 102:
        case 104:
        case 117: case 118:
          offset++;
          break;

        // Opcodes followed by a two-byte operand.
        case 69: case 72: case 80: case 93:
        case 96: case 97:
        case 107: case 109: case 110:
          offset += 2;
          break;

        // Opcodes followed by a three-byte operand.
        case 73: case 95:
          offset += 3;
          break;

        // Opcode followed by a four-byte operand.
        case 70:
          offset += 4;
          break;

        // Opcodes whose operand size is given by the byte that follows.
        case 68: case 74: case 81: case 82:
        case 103: case 105: case 106: case 108:
          if (offset >= grpprl.length)
          {
            return deleted; // length byte missing: truncated grpprl
          }
          // Read the length as unsigned: a signed read of a value >= 0x80
          // would move the offset backwards and could loop forever.
          // NOTE(review): the skip equals the length byte's value with no
          // extra +1 for the length byte itself; distance kept as it was -
          // confirm against the Word 6 sprm definitions.
          offset += grpprl[offset] & 0xFF;
          break;

        // Opcode 83 has no operand; unknown opcodes are skipped likewise.
        case 83:
        default:
          break;
      }
    }
    return deleted;
  }
}