HDFObjectFactory


1   
2   /* ====================================================================
3      Copyright 2002-2004   Apache Software Foundation
4   
5      Licensed under the Apache License, Version 2.0 (the "License");
6      you may not use this file except in compliance with the License.
7      You may obtain a copy of the License at
8   
9          http://www.apache.org/licenses/LICENSE-2.0
10  
11     Unless required by applicable law or agreed to in writing, software
12     distributed under the License is distributed on an "AS IS" BASIS,
13     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14     See the License for the specific language governing permissions and
15     limitations under the License.
16  ==================================================================== */
17          
18  /*
19   * HDFObjectFactory.java
20   *
21   * Created on February 24, 2002, 2:17 PM
22   */
23  
24  package org.apache.poi.hdf.model;
25  
26  
27  //import java.io;
28  
29  import java.util.ArrayList  ;
30  import java.io.InputStream  ;
31  import java.io.FileInputStream  ;
32  import java.io.IOException  ;
33  import java.util.List  ;
34  import java.util.TreeSet  ;
35  
36  
37  import org.apache.poi.hdf.model.hdftypes.*;
38  import org.apache.poi.hdf.event.HDFLowLevelParsingListener;
39  import org.apache.poi.hdf.model.util.BTreeSet;
40  import org.apache.poi.hdf.model.util.ParsingState;
41  
42  import org.apache.poi.poifs.filesystem.POIFSFileSystem;
43  import org.apache.poi.poifs.filesystem.POIFSDocument;
44  import org.apache.poi.poifs.filesystem.DocumentEntry;
45  import org.apache.poi.util.LittleEndian;
46  
47  
48  
49  
50  /**
51   * The Object Factory takes in a stream and creates the low level objects
52   * that represent the data.
53   * @author  andy
54   */
55  public class HDFObjectFactory
56  {
57  
    /** The POIFS (OLE 2) file system built from the input stream. */
    private POIFSFileSystem _filesystem;
    /** The File Information Block (FIB) parsed from the main document stream. */
    private FileInformationBlock _fib;

    /** Listener that receives every low-level object created while parsing. */
    private HDFLowLevelParsingListener _listener;
    /** Parsing state (current page, FKP and property index) for character runs. */
    private ParsingState _charParsingState;
    /** Parsing state (current page, FKP and property index) for paragraphs. */
    private ParsingState _parParsingState;

    /** Contents of the "WordDocument" stream. */
    byte[] _mainDocument;
    /** Contents of the table stream ("0Table" or "1Table", as selected by the FIB). */
    byte[] _tableBuffer;
74  
75  
76      public static void main(String   args[])
77      {
78        try
79        {
80          HDFObjectFactory f = new HDFObjectFactory(new FileInputStream  ("c:\\test.doc"));
81          int k = 0;
82        }
83        catch(Throwable   t)
84        {
85          t.printStackTrace();
86        }
87      }
88      /** Creates a new instance of HDFObjectFactory
89       *
90       * @param istream The InputStream that is the Word document
91       *
92       */
93      protected HDFObjectFactory(InputStream   istream, HDFLowLevelParsingListener l) throws IOException  
94      {
95          if (l == null)
96          {
97              _listener = new HDFObjectModel();
98          }
99          else
100         {
101             _listener = l;
102         }
103 
104         //do Ole stuff
105         _filesystem = new POIFSFileSystem(istream);
106 
107         DocumentEntry headerProps =
108             (DocumentEntry)_filesystem.getRoot().getEntry("WordDocument");
109 
110         _mainDocument = new byte[headerProps.getSize()];
111         _filesystem.createDocumentInputStream("WordDocument").read(_mainDocument);
112 
113         _fib = new FileInformationBlock(_mainDocument);
114 
115         initTableStream();
116         initTextPieces();
117         initFormattingProperties();
118 
119 
120     }
121 
122 
123 
124 
125     /** Creates a new instance of HDFObjectFactory
126      *
127      * @param istream The InputStream that is the Word document
128      *
129      */
130     public HDFObjectFactory(InputStream   istream) throws IOException  
131     {
132         this(istream, null);
133     }
134 
135     public static List   getTypes(InputStream   istream) throws IOException  
136     {
137         List   results = new ArrayList  (1);
138 
139         //do Ole stuff
140         POIFSFileSystem filesystem = new POIFSFileSystem(istream);
141 
142         DocumentEntry headerProps =
143             (DocumentEntry)filesystem.getRoot().getEntry("WordDocument");
144 
145         byte[] mainDocument = new byte[headerProps.getSize()];
146         filesystem.createDocumentInputStream("WordDocument").read(mainDocument);
147 
148         FileInformationBlock fib = new FileInformationBlock(mainDocument);
149 
150 
151         results.add(fib);
152         return results;
153     }
154 
155 
156     /**
157      * Initializes the table stream
158      *
159      * @throws IOException
160      */
161     private void initTableStream() throws IOException  
162     {
163         String   tablename = null;
164         if(_fib.isFWhichTblStm())
165         {
166             tablename="1Table";
167         }
168         else
169         {
170           tablename="0Table";
171         }
172 
173         DocumentEntry tableEntry = (DocumentEntry)_filesystem.getRoot().getEntry(tablename);
174 
175         //load the table stream into a buffer
176         int size = tableEntry.getSize();
177         _tableBuffer = new byte[size];
178         _filesystem.createDocumentInputStream(tablename).read(_tableBuffer);
179     }
180     /**
181      * Initializes the text pieces. Text is divided into pieces because some
182      * "pieces" may only contain unicode characters.
183      *
184      * @throws IOException
185      */
186     private void initTextPieces() throws IOException  
187     {
188         int pos = _fib.getFcClx();
189 
190         //skips through the prms before we reach the piece table. These contain data
191         //for actual fast saved files
192         while (_tableBuffer[pos] == 1)
193         {
194             pos++;
195             int skip = LittleEndian.getShort(_tableBuffer, pos);
196             pos += 2 + skip;
197         }
198         if(_tableBuffer[pos] != 2)
199         {
200             throw new IOException  ("The text piece table is corrupted");
201         }
202         else
203         {
204             //parse out the text pieces
205             int pieceTableSize = LittleEndian.getInt(_tableBuffer, ++pos);
206             pos += 4;
207             int pieces = (pieceTableSize - 4) / 12;
208             for (int x = 0; x < pieces; x++)
209             {
210                 int filePos = LittleEndian.getInt(_tableBuffer, pos + ((pieces + 1) * 4) + (x * 8) + 2);
211                 boolean unicode = false;
212                 if ((filePos & 0x40000000) == 0)
213                 {
214                     unicode = true;
215                 }
216                 else
217                 {
218                     unicode = false;
219                     filePos &= ~(0x40000000);//gives me FC in doc stream
220                     filePos /= 2;
221                 }
222                 int totLength = LittleEndian.getInt(_tableBuffer, pos + (x + 1) * 4) -
223                                 LittleEndian.getInt(_tableBuffer, pos + (x * 4));
224 
225                 TextPiece piece = new TextPiece(filePos, totLength, unicode);
226                 _listener.text(piece);
227 
228             }
229 
230         }
231 
232     }
233     /**
234      * initializes all of the formatting properties for a Word Document
235      */
236     private void initFormattingProperties()
237     {
238         createStyleSheet();
239         createListTables();
240         createFontTable();
241 
242         initDocumentProperties();
243         initSectionProperties();
244         //initCharacterProperties();
245         //initParagraphProperties();
246     }
247     private void initCharacterProperties(int charOffset, PlexOfCps charPlcf, int start, int end)
248     {
249         //Initialize paragraph property stuff
250         //int currentCharPage = _charParsingState.getCurrentPage();
251         int charPlcfLen = charPlcf.length();
252         int currentPageIndex = _charParsingState.getCurrentPageIndex();
253         FormattedDiskPage fkp = _charParsingState.getFkp();
254         int currentChpxIndex = _charParsingState.getCurrentPropIndex();
255         int currentArraySize = fkp.size();
256 
257         //get the character runs for this paragraph
258         int charStart = 0;
259         int charEnd = 0;
260         //add the character runs
261         do
262         {
263           if (currentChpxIndex < currentArraySize)
264           {
265             charStart = fkp.getStart(currentChpxIndex);
266             charEnd = fkp.getEnd(currentChpxIndex);
267             byte[] chpx = fkp.getGrpprl(currentChpxIndex);
268             _listener.characterRun(new ChpxNode(Math.max(charStart, start),  Math.min(charEnd, end), chpx));
269 
270             if (charEnd < end)
271             {
272               currentChpxIndex++;
273             }
274             else
275             {
276               _charParsingState.setState(currentPageIndex, fkp, currentChpxIndex);
277               break;
278             }
279           }
280           else
281           {
282             int currentCharPage = LittleEndian.getInt(_tableBuffer, charOffset + charPlcf.getStructOffset(++currentPageIndex));
283             byte[] byteFkp = new byte[512];
284             System.arraycopy(_mainDocument, (currentCharPage * 512), byteFkp, 0, 512);
285             fkp = new CHPFormattedDiskPage(byteFkp);
286             currentChpxIndex = 0;
287             currentArraySize = fkp.size();
288           }
289         }
290         while(currentPageIndex < charPlcfLen);
291     }
292     private void initParagraphProperties(int parOffset, PlexOfCps parPlcf, int charOffset, PlexOfCps charPlcf, int start, int end)
293     {
294         //Initialize paragraph property stuff
295         //int currentParPage = _parParsingState.getCurrentPage();
296         int parPlcfLen = parPlcf.length();
297         int currentPageIndex = _parParsingState.getCurrentPageIndex();
298         FormattedDiskPage fkp = _parParsingState.getFkp();
299         int currentPapxIndex = _parParsingState.getCurrentPropIndex();
300         int currentArraySize = fkp.size();
301 
302         do
303         {
304           if (currentPapxIndex < currentArraySize)
305           {
306             int parStart = fkp.getStart(currentPapxIndex);
307             int parEnd = fkp.getEnd(currentPapxIndex);
308             byte[] papx = fkp.getGrpprl(currentPapxIndex);
309             _listener.paragraph(new PapxNode(Math.max(parStart, start), Math.min(parEnd, end), papx));
310             initCharacterProperties(charOffset, charPlcf, Math.max(start, parStart), Math.min(parEnd, end));
311             if (parEnd < end)
312             {
313               currentPapxIndex++;
314             }
315             else
316             {
317               //save the state
318               _parParsingState.setState(currentPageIndex, fkp, currentPapxIndex);
319               break;
320             }
321           }
322           else
323           {
324             int currentParPage = LittleEndian.getInt(_tableBuffer, parOffset + parPlcf.getStructOffset(++currentPageIndex));
325             byte byteFkp[] = new byte[512];
326             System.arraycopy(_mainDocument, (currentParPage * 512), byteFkp, 0, 512);
327             fkp = new PAPFormattedDiskPage(byteFkp);
328             currentPapxIndex = 0;
329             currentArraySize = fkp.size();
330           }
331         }
332         while(currentPageIndex < parPlcfLen);
333     }
334     /**
335      * initializes the CharacterProperties BTree
336      */
337     /*private void initCharacterProperties()
338     {
339         int charOffset = _fib.getFcPlcfbteChpx();
340         int charPlcSize = _fib.getLcbPlcfbteChpx();
341 
342         //int arraySize = (charPlcSize - 4)/8;
343 
344         //first we must go through the bin table and find the fkps
345         for(int x = 0; x < arraySize; x++)
346         {
347 
348             //get page number(has nothing to do with document page)
349             //containing the chpx for the paragraph
350             int PN = LittleEndian.getInt(_tableBuffer, charOffset + (4 * (arraySize + 1) + (4 * x)));
351 
352             byte[] fkp = new byte[512];
353             System.arraycopy(_mainDocument, (PN * 512), fkp, 0, 512);
354             //take each fkp and get the chpxs
355             int crun = LittleEndian.getUnsignedByte(fkp, 511);
356             for(int y = 0; y < crun; y++)
357             {
358                 //get the beginning fc of each paragraph text run
359                 int fcStart = LittleEndian.getInt(fkp, y * 4);
360                 int fcEnd = LittleEndian.getInt(fkp, (y+1) * 4);
361                 //get the offset in fkp of the papx for this paragraph
362                 int chpxOffset = 2 * LittleEndian.getUnsignedByte(fkp, ((crun + 1) * 4) + y);
363 
364                 //optimization if offset == 0 use "Normal" style
365                 if(chpxOffset == 0)
366 
367                 {
368                     _characterRuns.add(new ChpxNode(fcStart, fcEnd, new byte[0]));
369                     continue;
370                 }
371 
372                 int size = LittleEndian.getUnsignedByte(fkp, chpxOffset);
373 
374                 byte[] chpx = new byte[size];
375                 System.arraycopy(fkp, ++chpxOffset, chpx, 0, size);
376                 //_papTable.put(new Integer(fcStart), papx);
377                 _characterRuns.add(new ChpxNode(fcStart, fcEnd, chpx));
378             }
379 
380         }
381     }*/
382     /**
383      * intializes the Paragraph Properties BTree
384      */
385     private void initParagraphProperties()
386     {
387         //paragraphs
388         int parOffset = _fib.getFcPlcfbtePapx();
389         int parPlcSize = _fib.getLcbPlcfbtePapx();
390 
391         //characters
392         int charOffset = _fib.getFcPlcfbteChpx();
393         int charPlcSize = _fib.getLcbPlcfbteChpx();
394 
395         PlexOfCps charPlcf = new PlexOfCps(charPlcSize, 4);
396         PlexOfCps parPlcf = new PlexOfCps(parPlcSize, 4);
397 
398         //Initialize character property stuff
399         int currentCharPage = LittleEndian.getInt(_tableBuffer, charOffset + charPlcf.getStructOffset(0));
400         int charPlcfLen = charPlcf.length();
401         int currentPageIndex = 0;
402         byte[] fkp = new byte[512];
403         System.arraycopy(_mainDocument, (currentCharPage * 512), fkp, 0, 512);
404         CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(fkp);
405         int currentChpxIndex = 0;
406         int currentArraySize = cfkp.size();
407 
408 
409         int arraySize = parPlcf.length();
410 
411         //first we must go through the bin table and find the fkps
412         for(int x = 0; x < arraySize; x++)
413         {
414             int PN = LittleEndian.getInt(_tableBuffer, parOffset + parPlcf.getStructOffset(x));
415 
416             fkp = new byte[512];
417             System.arraycopy(_mainDocument, (PN * 512), fkp, 0, 512);
418 
419             PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(fkp);
420             //take each fkp and get the paps
421             int crun = pfkp.size();
422             for(int y = 0; y < crun; y++)
423             {
424                 //get the beginning fc of each paragraph text run
425                 int fcStart = pfkp.getStart(y);
426                 int fcEnd = pfkp.getEnd(y);
427 
428                 //get the papx for this paragraph
429                 byte[] papx = pfkp.getGrpprl(y);
430 
431                 _listener.paragraph(new PapxNode(fcStart, fcEnd, papx));
432 
433                 //get the character runs for this paragraph
434                 int charStart = 0;
435                 int charEnd = 0;
436                 //add the character runs
437                 do
438                 {
439                   if (currentChpxIndex < currentArraySize)
440                   {
441                     charStart = cfkp.getStart(currentChpxIndex);
442                     charEnd = cfkp.getEnd(currentChpxIndex);
443                     byte[] chpx = cfkp.getGrpprl(currentChpxIndex);
444                     _listener.characterRun(new ChpxNode(charStart, charEnd, chpx));
445                     if (charEnd < fcEnd)
446                     {
447                       currentChpxIndex++;
448                     }
449                     else
450                     {
451                       break;
452                     }
453                   }
454                   else
455                   {
456                     currentCharPage = LittleEndian.getInt(_tableBuffer, charOffset + charPlcf.getStructOffset(++currentPageIndex));
457                     fkp = new byte[512];
458                     System.arraycopy(_mainDocument, (currentCharPage * 512), fkp, 0, 512);
459                     cfkp = new CHPFormattedDiskPage(fkp);
460                     currentChpxIndex = 0;
461                     currentArraySize = cfkp.size();
462                   }
463                 }
464                 while(currentCharPage <= charPlcfLen + 1);
465 
466             }
467 
468         }
469 
470     }
471     private void initParsingStates(int parOffset, PlexOfCps parPlcf, int charOffset, PlexOfCps charPlcf)
472     {
473         int currentCharPage = LittleEndian.getInt(_tableBuffer, charOffset + charPlcf.getStructOffset(0));
474         byte[] fkp = new byte[512];
475         System.arraycopy(_mainDocument, (currentCharPage * 512), fkp, 0, 512);
476         CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(fkp);
477         _charParsingState = new ParsingState(currentCharPage, cfkp);
478 
479         int currentParPage = LittleEndian.getInt(_tableBuffer, parOffset + parPlcf.getStructOffset(0));
480         fkp = new byte[512];
481         System.arraycopy(_mainDocument, (currentParPage * 512), fkp, 0, 512);
482         PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(fkp);
483         _parParsingState = new ParsingState(currentParPage, pfkp);
484     }
485     /**
486      * initializes the SectionProperties BTree
487      */
488     private void initSectionProperties()
489     {
490 
491       int ccpText = _fib.getCcpText();
492       int ccpFtn = _fib.getCcpFtn();
493 
494       //sections
495       int fcMin = _fib.getFcMin();
496       int plcfsedFC = _fib.getFcPlcfsed();
497       int plcfsedSize = _fib.getLcbPlcfsed();
498 
499       //paragraphs
500       int parOffset = _fib.getFcPlcfbtePapx();
501       int parPlcSize = _fib.getLcbPlcfbtePapx();
502 
503       //characters
504       int charOffset = _fib.getFcPlcfbteChpx();
505       int charPlcSize = _fib.getLcbPlcfbteChpx();
506 
507       PlexOfCps charPlcf = new PlexOfCps(charPlcSize, 4);
508       PlexOfCps parPlcf = new PlexOfCps(parPlcSize, 4);
509 
510       initParsingStates(parOffset, parPlcf, charOffset, charPlcf);
511 
512       //byte[] plcfsed = new byte[plcfsedSize];
513       //System.arraycopy(_tableBuffer, plcfsedFC, plcfsed, 0, plcfsedSize);
514 
515       PlexOfCps plcfsed = new PlexOfCps(plcfsedSize, 12);
516       int arraySize = plcfsed.length();
517 
518       int start = fcMin;
519       int end = fcMin + ccpText;
520       int x = 0;
521       int sectionEnd = 0;
522 
523       //do the main body sections
524       while (x < arraySize)
525       {
526           int sectionStart = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x)) + fcMin;
527           sectionEnd = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x + 1)) + fcMin;
528           int sepxStart = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getStructOffset(x) + 2);
529           int sepxSize = LittleEndian.getShort(_mainDocument, sepxStart);
530 
531           byte[] sepx = new byte[sepxSize];
532           System.arraycopy(_mainDocument, sepxStart + 2, sepx, 0, sepxSize);
533           SepxNode node = new SepxNode(x + 1, sectionStart, sectionEnd, sepx);
534           _listener.bodySection(node);
535           initParagraphProperties(parOffset, parPlcf, charOffset, charPlcf, sectionStart, Math.min(end, sectionEnd));
536 
537           if (sectionEnd > end)
538           {
539             break;
540           }
541           else
542           {
543             x++;
544           }
545       }
546       //do the header sections
547       for (; x < arraySize; x++)// && sectionEnd <= end; x++)
548       {
549           int sectionStart = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x)) + fcMin;
550           sectionEnd = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x + 1)) + fcMin;
551           int sepxStart = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getStructOffset(x) + 2);
552           int sepxSize = LittleEndian.getShort(_mainDocument, sepxStart);
553 
554           byte[] sepx = new byte[sepxSize];
555           System.arraycopy(_mainDocument, sepxStart + 2, sepx, 0, sepxSize);
556           SepxNode node = new SepxNode(x + 1, sectionStart, sectionEnd, sepx);
557           _listener.hdrSection(node);
558           initParagraphProperties(parOffset, parPlcf, charOffset, charPlcf, Math.max(sectionStart, end), sectionEnd);
559 
560       }
561       _listener.endSections();
562     }
563     /**
564      * Initializes the DocumentProperties object unique to this document.
565      */
566     private void initDocumentProperties()
567     {
568         int pos = _fib.getFcDop();
569         int size = _fib.getLcbDop();
570         byte[] dopArray = new byte[size];
571 
572         System.arraycopy(_tableBuffer, pos, dopArray, 0, size);
573         _listener.document(new DocumentProperties(dopArray));
574     }
575     /**
576      * Uncompresses the StyleSheet from file into memory.
577      */
578     private void createStyleSheet()
579     {
580       int stshIndex = _fib.getFcStshf();
581       int stshSize = _fib.getLcbStshf();
582       byte[] stsh = new byte[stshSize];
583       System.arraycopy(_tableBuffer, stshIndex, stsh, 0, stshSize);
584 
585       _listener.styleSheet(new StyleSheet(stsh));
586     }
587     /**
588      * Initializes the list tables for this document
589      */
590     private void createListTables()
591     {
592         int lfoOffset = _fib.getFcPlfLfo();
593         int lfoSize = _fib.getLcbPlfLfo();
594         byte[] plflfo = new byte[lfoSize];
595 
596         System.arraycopy(_tableBuffer, lfoOffset, plflfo, 0, lfoSize);
597 
598         int lstOffset = _fib.getFcPlcfLst();
599         int lstSize = _fib.getLcbPlcfLst();
600         if (lstOffset > 0 && lstSize > 0)
601         {
602           //  The lstSize returned by _fib.getLcbPlcfLst() doesn't appear
603           //  to take into account any LVLs.  Therefore, we recalculate
604           //  lstSize based on where the LFO section begins (because the
605           //  LFO section immediately follows the LST section).
606           lstSize = lfoOffset - lstOffset;
607           byte[] plcflst = new byte[lstSize];
608           System.arraycopy(_tableBuffer, lstOffset, plcflst, 0, lstSize);
609           _listener.lists(new ListTables(plcflst, plflfo));
610         }
611     }
612     /**
613      * Initializes this document's FontTable;
614      */
615     private void createFontTable()
616     {
617         int fontTableIndex = _fib.getFcSttbfffn();
618         int fontTableSize = _fib.getLcbSttbfffn();
619         byte[] fontTable = new byte[fontTableSize];
620         System.arraycopy(_tableBuffer, fontTableIndex, fontTable, 0, fontTableSize);
621         _listener.fonts(new FontTable(fontTable));
622     }
623 
624 }
625
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags