package org.apache.poi.hdf.model;

import java.util.ArrayList;
import java.io.InputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.List;
import java.util.TreeSet;

import org.apache.poi.hdf.model.hdftypes.*;
import org.apache.poi.hdf.event.HDFLowLevelParsingListener;
import org.apache.poi.hdf.model.util.BTreeSet;
import org.apache.poi.hdf.model.util.ParsingState;

import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSDocument;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.util.LittleEndian;

/**
 * Reads a Word document out of a POIFS container and reports its low-level
 * structures (text pieces, style sheet, list tables, font table, document
 * properties, sections, paragraphs and character runs) to an
 * {@link HDFLowLevelParsingListener}.
 *
 * <p>All parsing happens inside the constructor: the "WordDocument" stream and
 * the table stream selected by the file information block are loaded into
 * memory and then walked, firing listener callbacks along the way.
 */
public class HDFObjectFactory
{
    /** Name of the POIFS entry that holds the main document stream. */
    private static final String MAIN_STREAM_NAME = "WordDocument";

    /** Size in bytes of one formatted disk page inside the main stream. */
    private static final int FKP_PAGE_SIZE = 512;

    /** File opened by {@link #main(String[])} when no path is supplied. */
    private static final String DEFAULT_TEST_FILE = "c:\\test.doc";

    /** OLE container the document is read from. */
    private POIFSFileSystem _filesystem;

    /** File information block built from the start of the main stream. */
    private FileInformationBlock _fib;

    /** Receiver of every structure discovered while parsing. */
    private HDFLowLevelParsingListener _listener;

    /** Position of the character-run walk, carried across paragraphs. */
    private ParsingState _charParsingState;

    /** Position of the paragraph walk, carried across sections. */
    private ParsingState _parParsingState;

    /** Complete contents of the "WordDocument" stream. */
    byte[] _mainDocument;

    /** Complete contents of the "0Table" or "1Table" stream. */
    byte[] _tableBuffer;

    /**
     * Ad-hoc driver: parses one document and prints any failure.
     *
     * @param args optional; {@code args[0]} is the path of the document to
     *        parse. Without it the historical default {@code c:\test.doc} is
     *        used.
     */
    public static void main(String args[])
    {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_TEST_FILE;
        try
        {
            FileInputStream in = new FileInputStream(path);
            try
            {
                new HDFObjectFactory(in);
            }
            finally
            {
                // The original never closed the file handle.
                in.close();
            }
        }
        catch (Throwable t)
        {
            t.printStackTrace();
        }
    }

    /**
     * Parses the document in {@code istream}, reporting to {@code l}.
     *
     * @param istream stream positioned at the start of a POIFS container
     * @param l listener to notify; when {@code null} a new
     *        {@code HDFObjectModel} is used instead
     * @throws IOException if the container cannot be read, a required stream
     *         is shorter than its directory entry claims, or the text piece
     *         table is corrupted
     */
    protected HDFObjectFactory(InputStream istream, HDFLowLevelParsingListener l) throws IOException
    {
        if (l == null)
        {
            _listener = new HDFObjectModel();
        }
        else
        {
            _listener = l;
        }

        _filesystem = new POIFSFileSystem(istream);
        _mainDocument = readStream(_filesystem, MAIN_STREAM_NAME);
        _fib = new FileInformationBlock(_mainDocument);

        initTableStream();
        initTextPieces();
        initFormattingProperties();
    }

    /**
     * Parses the document in {@code istream} using the default listener.
     *
     * @param istream stream positioned at the start of a POIFS container
     * @throws IOException see
     *         {@link #HDFObjectFactory(InputStream, HDFLowLevelParsingListener)}
     */
    public HDFObjectFactory(InputStream istream) throws IOException
    {
        this(istream, null);
    }

    /**
     * Reads only the file information block of a document.
     *
     * @param istream stream positioned at the start of a POIFS container
     * @return a list holding a single {@code FileInformationBlock}
     * @throws IOException if the container or its "WordDocument" stream cannot
     *         be read completely
     */
    public static List getTypes(InputStream istream) throws IOException
    {
        List results = new ArrayList(1);

        POIFSFileSystem filesystem = new POIFSFileSystem(istream);
        byte[] mainDocument = readStream(filesystem, MAIN_STREAM_NAME);

        results.add(new FileInformationBlock(mainDocument));
        return results;
    }

    /**
     * Loads a whole root-level document stream into a byte array.
     *
     * <p>A single {@code read(byte[])} call is not guaranteed to fill the
     * buffer, so this loops until every byte announced by the directory entry
     * has arrived.
     *
     * @param filesystem container to read from
     * @param name name of the root-level entry
     * @return the complete stream contents
     * @throws IOException if the entry is missing, reading fails, or the
     *         stream ends before the announced size is reached
     */
    private static byte[] readStream(POIFSFileSystem filesystem, String name) throws IOException
    {
        DocumentEntry entry = (DocumentEntry)filesystem.getRoot().getEntry(name);
        byte[] contents = new byte[entry.getSize()];

        InputStream in = filesystem.createDocumentInputStream(name);
        try
        {
            int filled = 0;
            while (filled < contents.length)
            {
                int count = in.read(contents, filled, contents.length - filled);
                if (count < 0)
                {
                    throw new IOException("Unexpected end of stream '" + name + "': read "
                        + filled + " of " + contents.length + " bytes");
                }
                filled += count;
            }
        }
        finally
        {
            in.close();
        }
        return contents;
    }

    /**
     * Copies one formatted disk page out of the main stream.
     *
     * @param pageNumber page number; the page starts at
     *        {@code pageNumber * 512}
     * @return a fresh 512-byte array holding the page
     */
    private byte[] readFkpPage(int pageNumber)
    {
        byte[] page = new byte[FKP_PAGE_SIZE];
        System.arraycopy(_mainDocument, pageNumber * FKP_PAGE_SIZE, page, 0, FKP_PAGE_SIZE);
        return page;
    }

    /**
     * Reads the page number stored in entry {@code index} of a bin table.
     *
     * @param plcfOffset offset of the bin table inside the table stream
     * @param plcf layout helper for that bin table
     * @param index entry to read
     * @return the 32-bit value stored in that entry
     */
    private int readPageNumber(int plcfOffset, PlexOfCps plcf, int index)
    {
        return LittleEndian.getInt(_tableBuffer, plcfOffset + plcf.getStructOffset(index));
    }

    /**
     * Copies {@code size} bytes starting at {@code offset} out of the table
     * stream.
     */
    private byte[] copyTableBytes(int offset, int size)
    {
        byte[] copy = new byte[size];
        System.arraycopy(_tableBuffer, offset, copy, 0, size);
        return copy;
    }

    /**
     * Loads the table stream named by the file information block: "1Table"
     * when {@code isFWhichTblStm()} is set, "0Table" otherwise.
     *
     * @throws IOException if the stream is missing or cannot be read fully
     */
    private void initTableStream() throws IOException
    {
        String tablename = _fib.isFWhichTblStm() ? "1Table" : "0Table";
        _tableBuffer = readStream(_filesystem, tablename);
    }

    /**
     * Walks the piece table in the table stream and reports every text piece
     * to the listener.
     *
     * @throws IOException if the byte following the skipped records is not
     *         the marker value 2
     */
    private void initTextPieces() throws IOException
    {
        int pos = _fib.getFcClx();

        // Records tagged with 1 precede the piece table; each carries a
        // 2-byte length that tells how far to skip.
        while (_tableBuffer[pos] == 1)
        {
            pos++;
            int skip = LittleEndian.getShort(_tableBuffer, pos);
            pos += 2 + skip;
        }
        if (_tableBuffer[pos] != 2)
        {
            throw new IOException("The text piece table is corrupted");
        }

        int pieceTableSize = LittleEndian.getInt(_tableBuffer, ++pos);
        pos += 4;

        // The table holds (pieces + 1) 4-byte positions followed by one
        // 8-byte descriptor per piece, hence 12 bytes per piece plus 4.
        int pieces = (pieceTableSize - 4) / 12;
        for (int x = 0; x < pieces; x++)
        {
            // The file position sits 2 bytes into the piece's descriptor.
            int filePos = LittleEndian.getInt(_tableBuffer, pos + ((pieces + 1) * 4) + (x * 8) + 2);

            // Bit 0x40000000 clear: piece is flagged as unicode. Bit set: the
            // flag is stripped and the stored position halved.
            boolean unicode = (filePos & 0x40000000) == 0;
            if (!unicode)
            {
                filePos &= ~(0x40000000);
                filePos /= 2;
            }

            int totLength = LittleEndian.getInt(_tableBuffer, pos + (x + 1) * 4) -
                            LittleEndian.getInt(_tableBuffer, pos + (x * 4));

            _listener.text(new TextPiece(filePos, totLength, unicode));
        }
    }

    /**
     * Reports the style sheet, list tables, font table, document properties
     * and finally the sections (with their paragraphs and character runs), in
     * that order.
     */
    private void initFormattingProperties()
    {
        createStyleSheet();
        createListTables();
        createFontTable();

        initDocumentProperties();
        initSectionProperties();
    }

    /**
     * Reports the character runs covering {@code [start, end)}, resuming from
     * the position saved in {@code _charParsingState}.
     *
     * <p>Run boundaries are clipped to the requested range. The saved state is
     * updated only when a run reaching {@code end} is found.
     *
     * @param charOffset offset of the character bin table in the table stream
     * @param charPlcf layout helper for the character bin table
     * @param start first file position of interest
     * @param end file position at which to stop
     */
    private void initCharacterProperties(int charOffset, PlexOfCps charPlcf, int start, int end)
    {
        int charPlcfLen = charPlcf.length();

        int currentPageIndex = _charParsingState.getCurrentPageIndex();
        FormattedDiskPage fkp = _charParsingState.getFkp();
        int currentChpxIndex = _charParsingState.getCurrentPropIndex();
        int currentArraySize = fkp.size();

        do
        {
            if (currentChpxIndex < currentArraySize)
            {
                int charStart = fkp.getStart(currentChpxIndex);
                int charEnd = fkp.getEnd(currentChpxIndex);
                byte[] chpx = fkp.getGrpprl(currentChpxIndex);
                _listener.characterRun(new ChpxNode(Math.max(charStart, start), Math.min(charEnd, end), chpx));

                if (charEnd < end)
                {
                    currentChpxIndex++;
                }
                else
                {
                    // This run reaches the end of the range: remember where
                    // to resume for the next paragraph.
                    _charParsingState.setState(currentPageIndex, fkp, currentChpxIndex);
                    break;
                }
            }
            else
            {
                // Current page exhausted. Stop before touching an entry past
                // the end of the bin table; the original loaded that page
                // first and only then noticed the index was out of range.
                if (currentPageIndex + 1 >= charPlcfLen)
                {
                    break;
                }
                int currentCharPage = readPageNumber(charOffset, charPlcf, ++currentPageIndex);
                fkp = new CHPFormattedDiskPage(readFkpPage(currentCharPage));
                currentChpxIndex = 0;
                currentArraySize = fkp.size();
            }
        }
        while (currentPageIndex < charPlcfLen);
    }

    /**
     * Reports the paragraphs covering {@code [start, end)} together with their
     * character runs, resuming from the position saved in
     * {@code _parParsingState}.
     *
     * <p>Paragraph boundaries are clipped to the requested range. The saved
     * state is updated only when a paragraph reaching {@code end} is found.
     *
     * @param parOffset offset of the paragraph bin table in the table stream
     * @param parPlcf layout helper for the paragraph bin table
     * @param charOffset offset of the character bin table in the table stream
     * @param charPlcf layout helper for the character bin table
     * @param start first file position of interest
     * @param end file position at which to stop
     */
    private void initParagraphProperties(int parOffset, PlexOfCps parPlcf, int charOffset, PlexOfCps charPlcf, int start, int end)
    {
        int parPlcfLen = parPlcf.length();

        int currentPageIndex = _parParsingState.getCurrentPageIndex();
        FormattedDiskPage fkp = _parParsingState.getFkp();
        int currentPapxIndex = _parParsingState.getCurrentPropIndex();
        int currentArraySize = fkp.size();

        do
        {
            if (currentPapxIndex < currentArraySize)
            {
                int parStart = fkp.getStart(currentPapxIndex);
                int parEnd = fkp.getEnd(currentPapxIndex);
                byte[] papx = fkp.getGrpprl(currentPapxIndex);
                _listener.paragraph(new PapxNode(Math.max(parStart, start), Math.min(parEnd, end), papx));
                initCharacterProperties(charOffset, charPlcf, Math.max(start, parStart), Math.min(parEnd, end));

                if (parEnd < end)
                {
                    currentPapxIndex++;
                }
                else
                {
                    // This paragraph reaches the end of the range: remember
                    // where to resume for the next section.
                    _parParsingState.setState(currentPageIndex, fkp, currentPapxIndex);
                    break;
                }
            }
            else
            {
                // Same guard as in initCharacterProperties: never read the
                // bin-table entry one past the last page.
                if (currentPageIndex + 1 >= parPlcfLen)
                {
                    break;
                }
                int currentParPage = readPageNumber(parOffset, parPlcf, ++currentPageIndex);
                fkp = new PAPFormattedDiskPage(readFkpPage(currentParPage));
                currentPapxIndex = 0;
                currentArraySize = fkp.size();
            }
        }
        while (currentPageIndex < parPlcfLen);
    }

    /**
     * Stateless walk over every paragraph page, reporting each paragraph and
     * the character runs up to its end.
     *
     * <p>NOTE(review): nothing in this class calls this overload; the
     * state-based overload is what {@link #initSectionProperties()} uses. Its
     * behavior is left as it was. The inner loop condition compares a page
     * number with the bin-table length, which looks unintended - confirm
     * before reviving this method.
     */
    private void initParagraphProperties()
    {
        int parOffset = _fib.getFcPlcfbtePapx();
        int parPlcSize = _fib.getLcbPlcfbtePapx();

        int charOffset = _fib.getFcPlcfbteChpx();
        int charPlcSize = _fib.getLcbPlcfbteChpx();

        PlexOfCps charPlcf = new PlexOfCps(charPlcSize, 4);
        PlexOfCps parPlcf = new PlexOfCps(parPlcSize, 4);

        int currentCharPage = readPageNumber(charOffset, charPlcf, 0);
        int charPlcfLen = charPlcf.length();
        int currentPageIndex = 0;
        CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(readFkpPage(currentCharPage));
        int currentChpxIndex = 0;
        int currentArraySize = cfkp.size();

        int arraySize = parPlcf.length();

        for (int x = 0; x < arraySize; x++)
        {
            int PN = readPageNumber(parOffset, parPlcf, x);
            PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(readFkpPage(PN));

            int crun = pfkp.size();
            for (int y = 0; y < crun; y++)
            {
                int fcStart = pfkp.getStart(y);
                int fcEnd = pfkp.getEnd(y);
                byte[] papx = pfkp.getGrpprl(y);

                _listener.paragraph(new PapxNode(fcStart, fcEnd, papx));

                do
                {
                    if (currentChpxIndex < currentArraySize)
                    {
                        int charStart = cfkp.getStart(currentChpxIndex);
                        int charEnd = cfkp.getEnd(currentChpxIndex);
                        byte[] chpx = cfkp.getGrpprl(currentChpxIndex);
                        _listener.characterRun(new ChpxNode(charStart, charEnd, chpx));

                        if (charEnd < fcEnd)
                        {
                            currentChpxIndex++;
                        }
                        else
                        {
                            break;
                        }
                    }
                    else
                    {
                        currentCharPage = readPageNumber(charOffset, charPlcf, ++currentPageIndex);
                        cfkp = new CHPFormattedDiskPage(readFkpPage(currentCharPage));
                        currentChpxIndex = 0;
                        currentArraySize = cfkp.size();
                    }
                }
                while (currentCharPage <= charPlcfLen + 1);
            }
        }
    }

    /**
     * Creates the character and paragraph parsing states from the first page
     * of each bin table.
     *
     * <p>NOTE(review): the first constructor argument is the page number read
     * from the bin table, while the walkers treat
     * {@code getCurrentPageIndex()} as an index into that table - confirm what
     * {@code ParsingState} does with it.
     *
     * @param parOffset offset of the paragraph bin table in the table stream
     * @param parPlcf layout helper for the paragraph bin table
     * @param charOffset offset of the character bin table in the table stream
     * @param charPlcf layout helper for the character bin table
     */
    private void initParsingStates(int parOffset, PlexOfCps parPlcf, int charOffset, PlexOfCps charPlcf)
    {
        int currentCharPage = readPageNumber(charOffset, charPlcf, 0);
        CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(readFkpPage(currentCharPage));
        _charParsingState = new ParsingState(currentCharPage, cfkp);

        int currentParPage = readPageNumber(parOffset, parPlcf, 0);
        PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(readFkpPage(currentParPage));
        _parParsingState = new ParsingState(currentParPage, pfkp);
    }

    /**
     * Builds the node for section {@code index}: looks up where its property
     * data lives in the main stream and copies it.
     *
     * @param plcfsed layout helper for the section table
     * @param plcfsedFC offset of the section table in the table stream
     * @param index zero-based section index; the node is numbered
     *        {@code index + 1}
     * @param sectionStart file position where the section starts
     * @param sectionEnd file position where the section ends
     * @return the node describing that section
     */
    private SepxNode readSection(PlexOfCps plcfsed, int plcfsedFC, int index, int sectionStart, int sectionEnd)
    {
        // The property data's offset sits 2 bytes into the section's
        // descriptor; the data itself is prefixed by a 2-byte length.
        int sepxStart = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getStructOffset(index) + 2);
        int sepxSize = LittleEndian.getShort(_mainDocument, sepxStart);

        byte[] sepx = new byte[sepxSize];
        System.arraycopy(_mainDocument, sepxStart + 2, sepx, 0, sepxSize);
        return new SepxNode(index + 1, sectionStart, sectionEnd, sepx);
    }

    /**
     * Reports every section, followed by its paragraphs and character runs.
     *
     * <p>Sections are reported through {@code bodySection} until one ends
     * beyond the main text ({@code fcMin + ccpText}); from that section on
     * they are reported through {@code hdrSection}. The section that straddles
     * the boundary is therefore reported twice, once per callback, each time
     * with the matching part of its range. {@code endSections} is fired last.
     */
    private void initSectionProperties()
    {
        int ccpText = _fib.getCcpText();
        int fcMin = _fib.getFcMin();

        int plcfsedFC = _fib.getFcPlcfsed();
        int plcfsedSize = _fib.getLcbPlcfsed();

        int parOffset = _fib.getFcPlcfbtePapx();
        int parPlcSize = _fib.getLcbPlcfbtePapx();

        int charOffset = _fib.getFcPlcfbteChpx();
        int charPlcSize = _fib.getLcbPlcfbteChpx();

        PlexOfCps charPlcf = new PlexOfCps(charPlcSize, 4);
        PlexOfCps parPlcf = new PlexOfCps(parPlcSize, 4);

        initParsingStates(parOffset, parPlcf, charOffset, charPlcf);

        PlexOfCps plcfsed = new PlexOfCps(plcfsedSize, 12);
        int arraySize = plcfsed.length();

        // File position just past the main body text.
        int end = fcMin + ccpText;
        int x = 0;

        while (x < arraySize)
        {
            int sectionStart = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x)) + fcMin;
            int sectionEnd = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x + 1)) + fcMin;

            _listener.bodySection(readSection(plcfsed, plcfsedFC, x, sectionStart, sectionEnd));
            initParagraphProperties(parOffset, parPlcf, charOffset, charPlcf, sectionStart, Math.min(end, sectionEnd));

            if (sectionEnd > end)
            {
                // x is deliberately not advanced: the loop below revisits
                // this section for the part lying beyond the body text.
                break;
            }
            x++;
        }

        for (; x < arraySize; x++)
        {
            int sectionStart = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x)) + fcMin;
            int sectionEnd = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x + 1)) + fcMin;

            _listener.hdrSection(readSection(plcfsed, plcfsedFC, x, sectionStart, sectionEnd));
            initParagraphProperties(parOffset, parPlcf, charOffset, charPlcf, Math.max(sectionStart, end), sectionEnd);
        }
        _listener.endSections();
    }

    /**
     * Copies the document properties out of the table stream and reports
     * them.
     */
    private void initDocumentProperties()
    {
        int pos = _fib.getFcDop();
        int size = _fib.getLcbDop();

        _listener.document(new DocumentProperties(copyTableBytes(pos, size)));
    }

    /**
     * Copies the style sheet out of the table stream and reports it.
     */
    private void createStyleSheet()
    {
        int stshIndex = _fib.getFcStshf();
        int stshSize = _fib.getLcbStshf();

        _listener.styleSheet(new StyleSheet(copyTableBytes(stshIndex, stshSize)));
    }

    /**
     * Copies the list tables out of the table stream and reports them, but
     * only when the file information block gives a positive offset and length
     * for the list data.
     */
    private void createListTables()
    {
        int lfoOffset = _fib.getFcPlfLfo();
        int lfoSize = _fib.getLcbPlfLfo();
        byte[] plflfo = copyTableBytes(lfoOffset, lfoSize);

        int lstOffset = _fib.getFcPlcfLst();
        int lstSize = _fib.getLcbPlcfLst();
        if (lstOffset > 0 && lstSize > 0)
        {
            // The declared length serves only as a presence check; the range
            // actually copied runs from the list data up to the override
            // table. NOTE(review): presumably this is meant to include data
            // stored after the declared length - confirm.
            lstSize = lfoOffset - lstOffset;
            byte[] plcflst = copyTableBytes(lstOffset, lstSize);
            _listener.lists(new ListTables(plcflst, plflfo));
        }
    }

    /**
     * Copies the font table out of the table stream and reports it.
     */
    private void createFontTable()
    {
        int fontTableIndex = _fib.getFcSttbfffn();
        int fontTableSize = _fib.getLcbSttbfffn();

        _listener.fonts(new FontTable(copyTableBytes(fontTableIndex, fontTableSize)));
    }
}