package com.blandware.atleap.common.parsers.ppt;

import com.blandware.atleap.common.parsers.exception.PlainTextExtractorException;
import com.blandware.atleap.common.parsers.SpecificPlainTextExtractor;
import com.blandware.atleap.common.Constants;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.LittleEndian;

import java.io.IOException;
import java.io.InputStream;
import java.io.Writer;
import java.util.HashMap;
import java.util.Map;

/**
 * Extracts plain text from a binary PowerPoint document stored in an OLE2
 * (POIFS) container.
 * <p>
 * The extractor reads the "Current User" and "PowerPoint Document" streams,
 * rebuilds the table of persistent object offsets from the chain of user edit
 * records and then walks the record tree with a small state machine: the pair
 * (current state, record type) is looked up in a static transition table which
 * yields the state used for the children of a container record, or the action
 * to run on an atom record.
 * </p>
 * <p>
 * Every piece of text found is written to the output writer followed by
 * {@link Constants#EOL}.
 * </p>
 * <p>
 * An instance keeps the state of the extraction in progress in its fields, so
 * one instance must not be used for several extractions at the same time.
 * </p>
 */
public class PowerPointPlainTextExtractor
        implements SpecificPlainTextExtractor {

    /** Length in bytes of the header that precedes every record. */
    private static final int RECORD_HEADER_LEN = 8;
    /** Value expected at offset 12 of the "Current User" stream (0xE391C05F). */
    private static final int MAGIC = -476987297;

    /** Names of the POIFS entries that are read. */
    private static final String CURRENT_USER_ENTRY = "Current User";
    private static final String DOCUMENT_ENTRY = "PowerPoint Document";

    /** Smallest "Current User" stream that still holds the fields read from it. */
    private static final int CURRENT_USER_MIN_LEN = 20;

    /** Charset used for byte-per-character text atoms. */
    private static final String CHARSET_SINGLE_BYTE = "ISO-8859-1";
    /** Charset used for two-bytes-per-character text atoms and strings. */
    private static final String CHARSET_U16 = "UTF-16LE";

    /** Pseudo record type looked up when the end of a container is reached. */
    private static final int SYMBOL_CONTAINER_END = -1;

    /**
     * First 16 bits of the header of a slide list container that is decoded
     * with the master-slide flag set.
     * NOTE(review): 0x001F looks like record version 0xF with instance 1 --
     * confirm against the MS-PPT description of SlideListWithTextContainer.
     */
    private static final int MASTER_SLIDE_LIST_CODE = 31;

    // States of the record-tree walker
    private static final int STATE_NONE = -1;
    private static final int STATE_SLIDE = 3;
    private static final int STATE_SLIDE_PPDRAWING = 4;
    private static final int STATE_SLIDE_PPDRAWING_61442 = 5;
    private static final int STATE_SLIDE_PPDRAWING_61442_61443 = 6;
    private static final int STATE_SLIDE_PPDRAWING_61442_61443_61444 = 7;
    private static final int STATE_SLIDE_PPDRAWING_61442_61443_61444_61453 = 8;
    private static final int STATE_NOTES = 9;
    private static final int STATE_NOTES_PPDRAWING = 10;
    private static final int STATE_NOTES_PPDRAWING_61442 = 11;
    private static final int STATE_NOTES_PPDRAWING_61442_61443 = 12;
    private static final int STATE_NOTES_PPDRAWING_61442_61443_61444 = 13;
    private static final int STATE_NOTES_PPDRAWING_61442_61443_61444_61453 = 14;

    private static final int STATE_DOC_BEGIN = 15;
    private static final int STATE_HEADER_FOOTER = 16;
    private static final int STATE_SLIDE_LIST_WITH_TEXT = 17;

    private static final int STATE_MAIN_MASTER = 40;

    // Actions executed on atom records
    private static final int ACTION_HANDLE_U16 = 0;
    private static final int ACTION_HANDLE_ASCII = 1;
    private static final int ACTION_HANDLE_FORMATTED = 2;
    private static final int ACTION_HANDLE_HEADER_FOOTER = 3;
    private static final int ACTION_HANDLE_SLIDE_PERSIST_ATOM = 4;
    private static final int ACTION_HANDLE_PERSIST_TEXT_ASCII = 5;
    private static final int ACTION_HANDLE_PERSIST_TEXT_U16 = 6;
    private static final int ACTION_HANDLE_SLIDE_LIST_END = 7;

    // Record types
    private static final int RECORD_TYPE_DOCUMENT = 1000;
    private static final int RECORD_TYPE_SLIDE = 1006;
    private static final int RECORD_TYPE_NOTES = 1008;
    private static final int RECORD_TYPE_SLIDE_PERSIST_ATOM = 1011;
    private static final int RECORD_TYPE_MAIN_MASTER = 1016;
    private static final int RECORD_TYPE_PPDRAWING = 1036;
    private static final int RECORD_TYPE_TEXT_CHARS_ATOM = 4000;
    private static final int RECORD_TYPE_TEXT_BYTES_ATOM = 4008;
    private static final int RECORD_TYPE_CSTRING = 4026;
    private static final int RECORD_TYPE_HEADERS_FOOTERS = 4057;
    private static final int RECORD_TYPE_SLIDE_LIST_WITH_TEXT = 4080;

    // Record types nested inside a drawing record.
    // NOTE(review): names follow the Office drawing format (0xF002, 0xF003,
    // 0xF004, 0xF00D) -- confirm against MS-ODRAW.
    private static final int RECORD_TYPE_DRAWING_CONTAINER = 61442;
    private static final int RECORD_TYPE_SHAPE_GROUP_CONTAINER = 61443;
    private static final int RECORD_TYPE_SHAPE_CONTAINER = 61444;
    private static final int RECORD_TYPE_CLIENT_TEXTBOX = 61453;

    /**
     * Transition table: maps {@link TransitionFunctionArg} (state, record type)
     * to {@link TransitionFunctionValue} (next state, action). Filled once in
     * the static initializer and only read afterwards.
     */
    private static final Map TRANSITIONS = new HashMap();

    /** Destination of the extracted text; set for the duration of {@link #extract}. */
    private Writer writer;
    /** Persistent reference of the slide announced by the last slide persist atom. */
    private int curSlideRef;
    /** Text count read from the last slide persist atom, decremented per text atom. */
    private int textsLeft;
    /** Offsets of persistent objects indexed by reference; -1 marks an unknown one. */
    private long[] persistentDirectories;
    /** Whether the slide list being decoded is the one flagged as master list. */
    private boolean inMasterSlideList;
    /** Whether no slide persist atom has been seen yet in the current slide list. */
    private boolean firstSlide;

    static {
        // For transitions into containers the action value (0) is only
        // consulted if the record unexpectedly turns out to be an atom.
        addTransition(STATE_DOC_BEGIN, RECORD_TYPE_HEADERS_FOOTERS, STATE_HEADER_FOOTER, 0);
        addTransition(STATE_DOC_BEGIN, RECORD_TYPE_SLIDE_LIST_WITH_TEXT, STATE_SLIDE_LIST_WITH_TEXT, 0);

        addTransition(STATE_SLIDE_LIST_WITH_TEXT, RECORD_TYPE_SLIDE_PERSIST_ATOM,
                STATE_NONE, ACTION_HANDLE_SLIDE_PERSIST_ATOM);
        addTransition(STATE_SLIDE_LIST_WITH_TEXT, RECORD_TYPE_TEXT_BYTES_ATOM,
                STATE_NONE, ACTION_HANDLE_PERSIST_TEXT_ASCII);
        addTransition(STATE_SLIDE_LIST_WITH_TEXT, RECORD_TYPE_TEXT_CHARS_ATOM,
                STATE_NONE, ACTION_HANDLE_PERSIST_TEXT_U16);
        addTransition(STATE_SLIDE_LIST_WITH_TEXT, SYMBOL_CONTAINER_END,
                STATE_NONE, ACTION_HANDLE_SLIDE_LIST_END);

        addTransition(STATE_MAIN_MASTER, RECORD_TYPE_HEADERS_FOOTERS, STATE_HEADER_FOOTER, 0);

        addTransition(STATE_SLIDE, RECORD_TYPE_HEADERS_FOOTERS, STATE_HEADER_FOOTER, 0);
        addTransition(STATE_SLIDE, RECORD_TYPE_CSTRING, STATE_NONE, ACTION_HANDLE_U16);
        addTransition(STATE_SLIDE, RECORD_TYPE_PPDRAWING, STATE_SLIDE_PPDRAWING, 0);
        addTransition(STATE_SLIDE_PPDRAWING, RECORD_TYPE_DRAWING_CONTAINER,
                STATE_SLIDE_PPDRAWING_61442, 0);
        addTransition(STATE_SLIDE_PPDRAWING_61442, RECORD_TYPE_SHAPE_GROUP_CONTAINER,
                STATE_SLIDE_PPDRAWING_61442_61443, 0);
        // Shape groups may be nested, hence the transition onto the same state
        addTransition(STATE_SLIDE_PPDRAWING_61442_61443, RECORD_TYPE_SHAPE_GROUP_CONTAINER,
                STATE_SLIDE_PPDRAWING_61442_61443, 0);
        addTransition(STATE_SLIDE_PPDRAWING_61442_61443, RECORD_TYPE_SHAPE_CONTAINER,
                STATE_SLIDE_PPDRAWING_61442_61443_61444, 0);
        addTransition(STATE_SLIDE_PPDRAWING_61442_61443_61444, RECORD_TYPE_CLIENT_TEXTBOX,
                STATE_SLIDE_PPDRAWING_61442_61443_61444_61453, 0);
        addTransition(STATE_SLIDE_PPDRAWING_61442_61443_61444_61453, RECORD_TYPE_TEXT_CHARS_ATOM,
                STATE_NONE, ACTION_HANDLE_U16);
        addTransition(STATE_SLIDE_PPDRAWING_61442_61443_61444_61453, RECORD_TYPE_TEXT_BYTES_ATOM,
                STATE_NONE, ACTION_HANDLE_ASCII);

        addTransition(STATE_NOTES, RECORD_TYPE_CSTRING, STATE_NONE, ACTION_HANDLE_U16);
        addTransition(STATE_NOTES, RECORD_TYPE_PPDRAWING, STATE_NOTES_PPDRAWING, 0);
        addTransition(STATE_NOTES_PPDRAWING, RECORD_TYPE_DRAWING_CONTAINER,
                STATE_NOTES_PPDRAWING_61442, 0);
        addTransition(STATE_NOTES_PPDRAWING_61442, RECORD_TYPE_SHAPE_GROUP_CONTAINER,
                STATE_NOTES_PPDRAWING_61442_61443, 0);
        addTransition(STATE_NOTES_PPDRAWING_61442_61443, RECORD_TYPE_SHAPE_GROUP_CONTAINER,
                STATE_NOTES_PPDRAWING_61442_61443, 0);
        addTransition(STATE_NOTES_PPDRAWING_61442_61443, RECORD_TYPE_SHAPE_CONTAINER,
                STATE_NOTES_PPDRAWING_61442_61443_61444, 0);
        addTransition(STATE_NOTES_PPDRAWING_61442_61443_61444, RECORD_TYPE_CLIENT_TEXTBOX,
                STATE_NOTES_PPDRAWING_61442_61443_61444_61453, 0);
        addTransition(STATE_NOTES_PPDRAWING_61442_61443_61444_61453, RECORD_TYPE_TEXT_CHARS_ATOM,
                STATE_NONE, ACTION_HANDLE_U16);
        addTransition(STATE_NOTES_PPDRAWING_61442_61443_61444_61453, RECORD_TYPE_TEXT_BYTES_ATOM,
                STATE_NONE, ACTION_HANDLE_ASCII);

        addTransition(STATE_HEADER_FOOTER, RECORD_TYPE_CSTRING,
                STATE_NONE, ACTION_HANDLE_HEADER_FOOTER);
    }

    /**
     * Creates new instance of PowerPointPlainTextExtractor
     */
    public PowerPointPlainTextExtractor() {
        writer = null;
    }

    /**
     * Registers one entry of the transition table.
     *
     * @param state    state in which the record is met
     * @param type     type of the record (or {@link #SYMBOL_CONTAINER_END})
     * @param newState state used to decode the children of the record
     * @param action   action executed when the record is an atom
     */
    private static void addTransition(int state, int type, int newState, int action) {
        TRANSITIONS.put(new TransitionFunctionArg(state, type),
                new TransitionFunctionValue(newState, action));
    }

    /**
     * Looks up the transition registered for a state and a record type.
     *
     * @param state current state
     * @param type  record type (or {@link #SYMBOL_CONTAINER_END})
     * @return the registered transition or <code>null</code> if there is none
     */
    private static TransitionFunctionValue lookup(int state, int type) {
        return (TransitionFunctionValue) TRANSITIONS.get(new TransitionFunctionArg(state, type));
    }

    /**
     * Extracts the plain text of a PowerPoint document.
     * <p>
     * Nothing is written if the persistent reference of the document
     * container cannot be resolved.
     * </p>
     *
     * @param input    stream with the document; it is read but not closed here
     * @param output   writer that receives the extracted text
     * @param encoding ignored by this extractor
     * @throws PlainTextExtractorException if the input cannot be read, is not
     *         recognized as a supported PowerPoint document, or does not
     *         reference a document container
     */
    public void extract(InputStream input, Writer output, String encoding)
            throws PlainTextExtractorException {
        writer = output;

        try {
            POIFSFileSystem fs = new POIFSFileSystem(input);

            byte[] currentUserBytes = readEntry(fs, CURRENT_USER_ENTRY);
            // Too short to hold the magic number and the last edit offset
            if (currentUserBytes.length < CURRENT_USER_MIN_LEN) {
                throw new PlainTextExtractorException("That's not a supported PowerPoint Document");
            }
            int magic = LittleEndian.getInt(currentUserBytes, 12);
            if (magic != MAGIC) {
                throw new PlainTextExtractorException("That's not a supported PowerPoint Document");
            }
            int lastEdit = (int) LittleEndian.getUInt(currentUserBytes, 16);

            byte[] documentBytes = readEntry(fs, DOCUMENT_ENTRY);

            persistentDirectories = buildPersistentDirectories(documentBytes, lastEdit);

            // Persistent reference of the document container, taken from the
            // most recent user edit record
            int refToDoc = (int) LittleEndian.getUInt(documentBytes, lastEdit + RECORD_HEADER_LEN + 16);
            if (refToDoc < 0 || refToDoc >= persistentDirectories.length) {
                throw new PlainTextExtractorException("PPT parser: Document container expected");
            }
            if (persistentDirectories[refToDoc] != -1) {
                int offsetToDoc = (int) persistentDirectories[refToDoc];
                int docLen = (int) LittleEndian.getUInt(documentBytes, offsetToDoc + 4);
                int docType = LittleEndian.getUShort(documentBytes, offsetToDoc + 2);
                if (docType != RECORD_TYPE_DOCUMENT) {
                    throw new PlainTextExtractorException("PPT parser: Document container expected");
                }
                inMasterSlideList = false;
                firstSlide = true;
                decode(documentBytes, offsetToDoc + RECORD_HEADER_LEN,
                        docLen, STATE_DOC_BEGIN);
            }
        } catch (IOException e) {
            throw new PlainTextExtractorException(e);
        } finally {
            // Do not keep the caller's writer or the offset table alive
            // once the extraction is over
            writer = null;
            persistentDirectories = null;
        }
    }

    /**
     * Reads a whole POIFS entry into memory and closes its stream.
     *
     * @param fs   file system that contains the entry
     * @param name name of the entry
     * @return content of the entry
     * @throws IOException if the entry cannot be opened or read completely
     */
    private static byte[] readEntry(POIFSFileSystem fs, String name)
            throws IOException {
        InputStream stream = fs.createDocumentInputStream(name);
        try {
            byte[] bytes = new byte[stream.available()];
            int filled = 0;
            // A single read() is allowed to return fewer bytes than requested
            while (filled < bytes.length) {
                int count = stream.read(bytes, filled, bytes.length - filled);
                if (count < 0) {
                    throw new IOException("Unexpected end of stream '" + name + "'");
                }
                filled += count;
            }
            return bytes;
        } finally {
            stream.close();
        }
    }

    /**
     * Builds the table of persistent object offsets by following the chain of
     * user edit records from the most recent one back to the first one.
     * Entries found in more recent edits take precedence over older ones.
     *
     * @param documentBytes content of the "PowerPoint Document" stream
     * @param lastEdit      offset of the most recent user edit record
     * @return offsets indexed by persistent reference; -1 for unknown ones
     */
    private long[] buildPersistentDirectories(byte[] documentBytes,
                                              int lastEdit) {
        int maxPersistWritten = (int) LittleEndian.getUInt(documentBytes, lastEdit + RECORD_HEADER_LEN + 20);
        long[] result = new long[maxPersistWritten + 1];
        for (int i = 0; i <= maxPersistWritten; i++) {
            result[i] = -1;
        }

        int edit = lastEdit;
        while (true) {
            int curDirs = (int) LittleEndian.getUInt(documentBytes, edit + RECORD_HEADER_LEN + 12);
            addPersistentDirectories(result, documentBytes, curDirs);
            int previousEdit = (int) LittleEndian.getUInt(documentBytes, edit + RECORD_HEADER_LEN + 8);
            // 0 ends the chain. A previous edit is expected at a smaller
            // offset; stopping otherwise keeps a corrupt chain from looping
            // forever.
            if (previousEdit == 0 || previousEdit >= edit) {
                break;
            }
            edit = previousEdit;
        }

        return result;
    }

    /**
     * Copies the entries of one persist directory record into the table of
     * offsets. Slots that are already filled are left untouched and
     * references outside the table are ignored.
     *
     * @param dirs          table of offsets to complete
     * @param documentBytes content of the "PowerPoint Document" stream
     * @param curDirs       offset of the persist directory record
     */
    private void addPersistentDirectories(long[] dirs, byte[] documentBytes,
                                          int curDirs) {
        long size = LittleEndian.getUInt(documentBytes, curDirs + 4);
        int pointer = 0;
        while (pointer < size) {
            long firstField = LittleEndian.getUInt(documentBytes, curDirs + RECORD_HEADER_LEN + pointer);
            pointer += 4;
            // Upper 12 bits: number of offsets that follow
            int refNum = (int) (firstField >> 20);
            // Lower 20 bits: reference of the first of them
            int refStart = (int) (firstField & 0xFFFFF);
            for (int i = 0; (i < refNum) && (pointer < size); i++) {
                int ref = refStart + i;
                if (ref < dirs.length && dirs[ref] == -1) {
                    dirs[ref] = LittleEndian.getUInt(documentBytes, curDirs + RECORD_HEADER_LEN + pointer);
                }
                pointer += 4;
            }
        }
    }

    /**
     * Walks the records stored in a part of the buffer. Containers that have a
     * transition are decoded recursively in the new state, atoms that have a
     * transition trigger its action, everything else is skipped. When the end
     * of the range is reached, the action registered for
     * {@link #SYMBOL_CONTAINER_END} in this state (if any) is executed.
     *
     * @param buffer content of the "PowerPoint Document" stream
     * @param begin  offset of the first record
     * @param len    length in bytes of the range to decode
     * @param state  state in which the records are interpreted
     * @throws IOException if the extracted text cannot be written
     */
    private void decode(byte[] buffer, int begin, int len, int state)
            throws IOException {
        // Drawings are not decoded while the master slide list is processed
        if ((state == STATE_SLIDE_PPDRAWING || state == STATE_NOTES_PPDRAWING) && inMasterSlideList) {
            return;
        }

        int pointer = 0;
        while (pointer < len) {
            int code = LittleEndian.getUShort(buffer, begin + pointer);
            int type = LittleEndian.getUShort(buffer, begin + pointer + 2);
            long size = LittleEndian.getUInt(buffer, begin + pointer + 4);
            int bodyOffset = begin + pointer + RECORD_HEADER_LEN;

            TransitionFunctionValue value = lookup(state, type);
            if (value != null) {
                // The lowest four bits of the header are 0xF for containers
                boolean recordIsContainer = ((code & 0xF) == 0xF);
                if (!recordIsContainer) {
                    executeAction(value.action, buffer, bodyOffset, size);
                } else if (value.state == STATE_SLIDE_LIST_WITH_TEXT && code == MASTER_SLIDE_LIST_CODE) {
                    // Remember and restore the flag around the nested decoding
                    boolean previous = inMasterSlideList;
                    inMasterSlideList = true;
                    decode(buffer, bodyOffset, (int) size, value.state);
                    inMasterSlideList = previous;
                } else {
                    decode(buffer, bodyOffset, (int) size, value.state);
                }
            }
            pointer += (size + RECORD_HEADER_LEN);
        }

        TransitionFunctionValue endValue = lookup(state, SYMBOL_CONTAINER_END);
        if (endValue != null) {
            executeAction(endValue.action, buffer, 0, 0);
        }
    }

    /**
     * Dispatches an action code to the method that implements it. Unknown
     * codes are ignored.
     *
     * @param action one of the ACTION_* constants
     * @param buffer content of the "PowerPoint Document" stream
     * @param begin  offset of the record body
     * @param len    length of the record body
     * @throws IOException if the extracted text cannot be written
     */
    private void executeAction(int action, byte[] buffer, int begin, long len)
            throws IOException {
        int length = (int) len;
        switch (action) {
            case ACTION_HANDLE_ASCII:
                handleAscii(buffer, begin, length);
                break;
            case ACTION_HANDLE_U16:
                handleU16(buffer, begin, length);
                break;
            case ACTION_HANDLE_FORMATTED:
                handleFormatted(buffer, begin, length);
                break;
            case ACTION_HANDLE_HEADER_FOOTER:
                handleHeaderFooter(buffer, begin, length);
                break;
            case ACTION_HANDLE_SLIDE_PERSIST_ATOM:
                handleSlidePersistAtom(buffer, begin, length);
                break;
            case ACTION_HANDLE_PERSIST_TEXT_ASCII:
                handlePersistTextAscii(buffer, begin, length);
                break;
            case ACTION_HANDLE_PERSIST_TEXT_U16:
                handlePersistTextU16(buffer, begin, length);
                break;
            case ACTION_HANDLE_SLIDE_LIST_END:
                handleSlideListEnd(buffer, begin, length);
                break;
            default:
                break;
        }
    }

    /**
     * Called at the end of a slide list: decodes the slide announced by the
     * last slide persist atom (if there was one) and resets the list state.
     */
    private void handleSlideListEnd(byte[] buffer, int begin, int len)
            throws IOException {
        if (!firstSlide) {
            handleSlideBegin(buffer);
        }
        firstSlide = true;
    }

    /**
     * Writes a byte-per-character text atom of a slide list and counts it.
     */
    private void handlePersistTextAscii(byte[] buffer, int begin, int len)
            throws IOException {
        handleAscii(buffer, begin, len);
        if (textsLeft > 0) {
            textsLeft--;
        }
    }

    /**
     * Writes a two-bytes-per-character text atom of a slide list and counts it.
     */
    private void handlePersistTextU16(byte[] buffer, int begin, int len)
            throws IOException {
        handleU16(buffer, begin, len);
        if (textsLeft > 0) {
            textsLeft--;
        }
    }

    /**
     * Handles a slide persist atom: the slide announced by the previous atom
     * (if any) is decoded first, then the reference and the text count of the
     * new slide are remembered.
     */
    private void handleSlidePersistAtom(byte[] buffer, int begin, int len)
            throws IOException {
        if (!firstSlide) {
            handleSlideBegin(buffer);
        } else {
            firstSlide = false;
        }
        curSlideRef = (int) LittleEndian.getUInt(buffer, begin);
        textsLeft = (int) LittleEndian.getUInt(buffer, begin + 8);
    }

    /**
     * Decodes the slide, notes or main master container referenced by
     * {@link #curSlideRef}. References that cannot be resolved and containers
     * of other types are skipped.
     */
    private void handleSlideBegin(byte[] buffer) throws IOException {
        if (curSlideRef < 0 || curSlideRef >= persistentDirectories.length
                || persistentDirectories[curSlideRef] == -1) {
            // No offset is known for this reference, so there is nothing to decode
            return;
        }
        int curSlideOffset = (int) persistentDirectories[curSlideRef];
        int curSlideType = LittleEndian.getUShort(buffer, curSlideOffset + 2);
        int curSlideSize = (int) LittleEndian.getUInt(buffer, curSlideOffset + 4);
        int bodyOffset = curSlideOffset + RECORD_HEADER_LEN;
        switch (curSlideType) {
            case RECORD_TYPE_SLIDE:
                decode(buffer, bodyOffset, curSlideSize, STATE_SLIDE);
                break;
            case RECORD_TYPE_NOTES:
                decode(buffer, bodyOffset, curSlideSize, STATE_NOTES);
                break;
            case RECORD_TYPE_MAIN_MASTER:
                decode(buffer, bodyOffset, curSlideSize, STATE_MAIN_MASTER);
                break;
            default:
                break;
        }
    }

    /**
     * Writes a header/footer string, decoded as UTF-16LE, followed by EOL.
     */
    private void handleHeaderFooter(byte[] buffer, int begin, int len)
            throws IOException {
        writer.write(new String(buffer, begin, len, CHARSET_U16));
        writer.write(Constants.EOL);
    }

    /**
     * Writes a byte-per-character text followed by EOL. Each byte is taken as
     * the low byte of a character whose high byte is zero, which is exactly
     * the ISO-8859-1 mapping; the platform default charset is deliberately
     * not used so that the result does not depend on the machine.
     */
    private void handleAscii(byte[] buffer, int begin, int len)
            throws IOException {
        writer.write(new String(buffer, begin, len, CHARSET_SINGLE_BYTE));
        writer.write(Constants.EOL);
    }

    /**
     * Writes a text decoded as UTF-16LE, followed by EOL.
     */
    private void handleU16(byte[] buffer, int begin, int len)
            throws IOException {
        writer.write(new String(buffer, begin, len, CHARSET_U16));
        writer.write(Constants.EOL);
    }

    /**
     * Placeholder for formatted text; currently does nothing.
     */
    private void handleFormatted(byte[] buffer, int begin, int len) {
    }

    /**
     * Key of the transition table: a state and a record type.
     */
    private static class TransitionFunctionArg {
        final int state;
        final int type;

        TransitionFunctionArg(int newState, int newType) {
            state = newState;
            type = newType;
        }

        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (!(obj instanceof TransitionFunctionArg)) {
                return false;
            }
            TransitionFunctionArg arg = (TransitionFunctionArg) obj;
            return arg.state == this.state && arg.type == this.type;
        }

        public int hashCode() {
            return 29 * state + type;
        }
    }

    /**
     * Value of the transition table: the state used for the children of a
     * container and the action executed on an atom.
     */
    private static class TransitionFunctionValue {
        final int state;
        final int action;

        TransitionFunctionValue(int newState, int newAction) {
            state = newState;
            action = newAction;
        }
    }

    /**
     * This extractor does not report an encoding.
     *
     * @return always <code>null</code>
     */
    public String getUsedEncoding() {
        return null;
    }
}