KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > poi > hdf > model > HDFObjectFactory


1
2 /* ====================================================================
3    Copyright 2002-2004 Apache Software Foundation
4
5    Licensed under the Apache License, Version 2.0 (the "License");
6    you may not use this file except in compliance with the License.
7    You may obtain a copy of the License at
8
9        http://www.apache.org/licenses/LICENSE-2.0
10
11    Unless required by applicable law or agreed to in writing, software
12    distributed under the License is distributed on an "AS IS" BASIS,
13    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14    See the License for the specific language governing permissions and
15    limitations under the License.
16 ==================================================================== */

17         
18 /*
19  * HDFObjectFactory.java
20  *
21  * Created on February 24, 2002, 2:17 PM
22  */

23
24 package org.apache.poi.hdf.model;
25
26
27 //import java.io;
28

29 import java.util.ArrayList JavaDoc;
30 import java.io.InputStream JavaDoc;
31 import java.io.FileInputStream JavaDoc;
32 import java.io.IOException JavaDoc;
33 import java.util.List JavaDoc;
34 import java.util.TreeSet JavaDoc;
35
36
37 import org.apache.poi.hdf.model.hdftypes.*;
38 import org.apache.poi.hdf.event.HDFLowLevelParsingListener;
39 import org.apache.poi.hdf.model.util.BTreeSet;
40 import org.apache.poi.hdf.model.util.ParsingState;
41
42 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
43 import org.apache.poi.poifs.filesystem.POIFSDocument;
44 import org.apache.poi.poifs.filesystem.DocumentEntry;
45 import org.apache.poi.util.LittleEndian;
46
47
48
49
50 /**
51  * The Object Factory takes in a stream and creates the low level objects
52  * that represent the data.
53  * @author andy
54  */

55 public class HDFObjectFactory
56 {
57
58     /** OLE2 file system built from the input stream; source of the "WordDocument" and table entries. */
59     private POIFSFileSystem _filesystem;
60     /** The FIB (File Information Block) parsed from the start of {@link #_mainDocument}. */
61     private FileInformationBlock _fib;
62
63     /** Receives every parsed low level object; an HDFObjectModel when the caller supplies none. */
64     private HDFLowLevelParsingListener _listener;
65     /** Resume point (page index, formatted disk page, property index) for character runs. */
66     private ParsingState _charParsingState;
67     /** Resume point (page index, formatted disk page, property index) for paragraphs. */
68     private ParsingState _parParsingState;
69
70     /** Raw bytes of the "WordDocument" entry. */
71     byte[] _mainDocument;
72     /** Raw bytes of the table entry ("0Table" or "1Table", as selected by the FIB). */
73     byte[] _tableBuffer;
74
75
76     public static void main(String JavaDoc args[])
77     {
78       try
79       {
80         HDFObjectFactory f = new HDFObjectFactory(new FileInputStream JavaDoc("c:\\test.doc"));
81         int k = 0;
82       }
83       catch(Throwable JavaDoc t)
84       {
85         t.printStackTrace();
86       }
87     }
88     /** Creates a new instance of HDFObjectFactory
89      *
90      * @param istream The InputStream that is the Word document
91      *
92      */

93     protected HDFObjectFactory(InputStream JavaDoc istream, HDFLowLevelParsingListener l) throws IOException JavaDoc
94     {
95         if (l == null)
96         {
97             _listener = new HDFObjectModel();
98         }
99         else
100         {
101             _listener = l;
102         }
103
104         //do Ole stuff
105
_filesystem = new POIFSFileSystem(istream);
106
107         DocumentEntry headerProps =
108             (DocumentEntry)_filesystem.getRoot().getEntry("WordDocument");
109
110         _mainDocument = new byte[headerProps.getSize()];
111         _filesystem.createDocumentInputStream("WordDocument").read(_mainDocument);
112
113         _fib = new FileInformationBlock(_mainDocument);
114
115         initTableStream();
116         initTextPieces();
117         initFormattingProperties();
118
119
120     }
121
122
123
124
/**
 * Convenience constructor: parses the Word document using whatever
 * listener the two-argument constructor selects when it is handed
 * <code>null</code>.
 *
 * @param istream The InputStream that is the Word document
 * @throws IOException propagated from the two-argument constructor
 */
public HDFObjectFactory(InputStream istream) throws IOException
{
    this(istream, (HDFLowLevelParsingListener) null);
}
134
135     public static List JavaDoc getTypes(InputStream JavaDoc istream) throws IOException JavaDoc
136     {
137         List JavaDoc results = new ArrayList JavaDoc(1);
138
139         //do Ole stuff
140
POIFSFileSystem filesystem = new POIFSFileSystem(istream);
141
142         DocumentEntry headerProps =
143             (DocumentEntry)filesystem.getRoot().getEntry("WordDocument");
144
145         byte[] mainDocument = new byte[headerProps.getSize()];
146         filesystem.createDocumentInputStream("WordDocument").read(mainDocument);
147
148         FileInformationBlock fib = new FileInformationBlock(mainDocument);
149
150
151         results.add(fib);
152         return results;
153     }
154
155
156     /**
157      * Initializes the table stream
158      *
159      * @throws IOException
160      */

161     private void initTableStream() throws IOException JavaDoc
162     {
163         String JavaDoc tablename = null;
164         if(_fib.isFWhichTblStm())
165         {
166             tablename="1Table";
167         }
168         else
169         {
170           tablename="0Table";
171         }
172
173         DocumentEntry tableEntry = (DocumentEntry)_filesystem.getRoot().getEntry(tablename);
174
175         //load the table stream into a buffer
176
int size = tableEntry.getSize();
177         _tableBuffer = new byte[size];
178         _filesystem.createDocumentInputStream(tablename).read(_tableBuffer);
179     }
180     /**
181      * Initializes the text pieces. Text is divided into pieces because some
182      * "pieces" may only contain unicode characters.
183      *
184      * @throws IOException
185      */

186     private void initTextPieces() throws IOException JavaDoc
187     {
188         int pos = _fib.getFcClx();
189
190         //skips through the prms before we reach the piece table. These contain data
191
//for actual fast saved files
192
while (_tableBuffer[pos] == 1)
193         {
194             pos++;
195             int skip = LittleEndian.getShort(_tableBuffer, pos);
196             pos += 2 + skip;
197         }
198         if(_tableBuffer[pos] != 2)
199         {
200             throw new IOException JavaDoc("The text piece table is corrupted");
201         }
202         else
203         {
204             //parse out the text pieces
205
int pieceTableSize = LittleEndian.getInt(_tableBuffer, ++pos);
206             pos += 4;
207             int pieces = (pieceTableSize - 4) / 12;
208             for (int x = 0; x < pieces; x++)
209             {
210                 int filePos = LittleEndian.getInt(_tableBuffer, pos + ((pieces + 1) * 4) + (x * 8) + 2);
211                 boolean unicode = false;
212                 if ((filePos & 0x40000000) == 0)
213                 {
214                     unicode = true;
215                 }
216                 else
217                 {
218                     unicode = false;
219                     filePos &= ~(0x40000000);//gives me FC in doc stream
220
filePos /= 2;
221                 }
222                 int totLength = LittleEndian.getInt(_tableBuffer, pos + (x + 1) * 4) -
223                                 LittleEndian.getInt(_tableBuffer, pos + (x * 4));
224
225                 TextPiece piece = new TextPiece(filePos, totLength, unicode);
226                 _listener.text(piece);
227
228             }
229
230         }
231
232     }
233     /**
234      * initializes all of the formatting properties for a Word Document
235      */

236     private void initFormattingProperties()
237     {
238         createStyleSheet();
239         createListTables();
240         createFontTable();
241
242         initDocumentProperties();
243         initSectionProperties();
244         //initCharacterProperties();
245
//initParagraphProperties();
246
}
/**
 * Reports the character runs that cover the range [start, end) to the
 * listener, resuming from the position stored in _charParsingState.
 * Each run is emitted as a ChpxNode whose bounds are clipped to the
 * requested range.
 *
 * @param charOffset offset of the character bin table inside _tableBuffer
 * @param charPlcf plex describing the character bin table
 * @param start lower bound used to clip the reported runs
 * @param end upper bound used to clip the reported runs; the walk stops at
 *        the first run whose end is not below this value
 */
247     private void initCharacterProperties(int charOffset, PlexOfCps charPlcf, int start, int end)
248     {
249         //Restore the character parsing position saved by the previous call
250
//int currentCharPage = _charParsingState.getCurrentPage();
251
int charPlcfLen = charPlcf.length();
252         int currentPageIndex = _charParsingState.getCurrentPageIndex();
253         FormattedDiskPage fkp = _charParsingState.getFkp();
254         int currentChpxIndex = _charParsingState.getCurrentPropIndex();
255         int currentArraySize = fkp.size();
256
257         //get the character runs for the requested range
258
int charStart = 0;
259         int charEnd = 0;
260         //add the character runs
261
do
262         {
263           if (currentChpxIndex < currentArraySize)
264           {
265             charStart = fkp.getStart(currentChpxIndex);
266             charEnd = fkp.getEnd(currentChpxIndex);
267             byte[] chpx = fkp.getGrpprl(currentChpxIndex);
268             _listener.characterRun(new ChpxNode(Math.max(charStart, start), Math.min(charEnd, end), chpx));
269
270             if (charEnd < end)
271             {
272               currentChpxIndex++;
273             }
274             else
275             {
              // The run reaches the end of the range: remember where we are
              // WITHOUT advancing, presumably so that a run spanning the
              // boundary is emitted again (clipped) by the next call.
276               _charParsingState.setState(currentPageIndex, fkp, currentChpxIndex);
277               break;
278             }
279           }
280           else
281           {
            // Current page exhausted: look up the next page number in the
            // bin table and load that 512-byte page from the main stream.
            // NOTE(review): the index is incremented and used before the
            // loop condition is re-checked, so this may read the entry at
            // index charPlcfLen - confirm that is in range.
282             int currentCharPage = LittleEndian.getInt(_tableBuffer, charOffset + charPlcf.getStructOffset(++currentPageIndex));
283             byte[] byteFkp = new byte[512];
284             System.arraycopy(_mainDocument, (currentCharPage * 512), byteFkp, 0, 512);
285             fkp = new CHPFormattedDiskPage(byteFkp);
286             currentChpxIndex = 0;
287             currentArraySize = fkp.size();
288           }
289         }
        // Leaving through this condition does not call setState(), so the
        // saved parsing state is left as it was before this call.
290         while(currentPageIndex < charPlcfLen);
291     }
/**
 * Reports the paragraphs that cover the range [start, end) to the
 * listener, resuming from the position stored in _parParsingState, and
 * walks the character runs of every reported paragraph.
 * Each paragraph is emitted as a PapxNode whose bounds are clipped to the
 * requested range.
 *
 * @param parOffset offset of the paragraph bin table inside _tableBuffer
 * @param parPlcf plex describing the paragraph bin table
 * @param charOffset offset of the character bin table inside _tableBuffer
 * @param charPlcf plex describing the character bin table
 * @param start lower bound used to clip the reported paragraphs
 * @param end upper bound used to clip the reported paragraphs; the walk
 *        stops at the first paragraph whose end is not below this value
 */
292     private void initParagraphProperties(int parOffset, PlexOfCps parPlcf, int charOffset, PlexOfCps charPlcf, int start, int end)
293     {
294         //Restore the paragraph parsing position saved by the previous call
295
//int currentParPage = _parParsingState.getCurrentPage();
296
int parPlcfLen = parPlcf.length();
297         int currentPageIndex = _parParsingState.getCurrentPageIndex();
298         FormattedDiskPage fkp = _parParsingState.getFkp();
299         int currentPapxIndex = _parParsingState.getCurrentPropIndex();
300         int currentArraySize = fkp.size();
301
302         do
303         {
304           if (currentPapxIndex < currentArraySize)
305           {
306             int parStart = fkp.getStart(currentPapxIndex);
307             int parEnd = fkp.getEnd(currentPapxIndex);
308             byte[] papx = fkp.getGrpprl(currentPapxIndex);
309             _listener.paragraph(new PapxNode(Math.max(parStart, start), Math.min(parEnd, end), papx));
            // character runs are reported for the same clipped range as the paragraph
310             initCharacterProperties(charOffset, charPlcf, Math.max(start, parStart), Math.min(parEnd, end));
311             if (parEnd < end)
312             {
313               currentPapxIndex++;
314             }
315             else
316             {
317               //save the state without advancing past the current paragraph
318
_parParsingState.setState(currentPageIndex, fkp, currentPapxIndex);
319               break;
320             }
321           }
322           else
323           {
            // Current page exhausted: look up the next page number in the
            // bin table and load that 512-byte page from the main stream.
            // NOTE(review): the index is incremented and used before the
            // loop condition is re-checked, so this may read the entry at
            // index parPlcfLen - confirm that is in range.
324             int currentParPage = LittleEndian.getInt(_tableBuffer, parOffset + parPlcf.getStructOffset(++currentPageIndex));
325             byte byteFkp[] = new byte[512];
326             System.arraycopy(_mainDocument, (currentParPage * 512), byteFkp, 0, 512);
327             fkp = new PAPFormattedDiskPage(byteFkp);
328             currentPapxIndex = 0;
329             currentArraySize = fkp.size();
330           }
331         }
        // Leaving through this condition does not call setState(), so the
        // saved parsing state is left as it was before this call.
332         while(currentPageIndex < parPlcfLen);
333     }
334     /**
335      * initializes the CharacterProperties BTree
336      */

337     /*private void initCharacterProperties()
338     {
339         int charOffset = _fib.getFcPlcfbteChpx();
340         int charPlcSize = _fib.getLcbPlcfbteChpx();
341
342         //int arraySize = (charPlcSize - 4)/8;
343
344         //first we must go through the bin table and find the fkps
345         for(int x = 0; x < arraySize; x++)
346         {
347
348             //get page number(has nothing to do with document page)
349             //containing the chpx for the paragraph
350             int PN = LittleEndian.getInt(_tableBuffer, charOffset + (4 * (arraySize + 1) + (4 * x)));
351
352             byte[] fkp = new byte[512];
353             System.arraycopy(_mainDocument, (PN * 512), fkp, 0, 512);
354             //take each fkp and get the chpxs
355             int crun = LittleEndian.getUnsignedByte(fkp, 511);
356             for(int y = 0; y < crun; y++)
357             {
358                 //get the beginning fc of each paragraph text run
359                 int fcStart = LittleEndian.getInt(fkp, y * 4);
360                 int fcEnd = LittleEndian.getInt(fkp, (y+1) * 4);
361                 //get the offset in fkp of the papx for this paragraph
362                 int chpxOffset = 2 * LittleEndian.getUnsignedByte(fkp, ((crun + 1) * 4) + y);
363
364                 //optimization if offset == 0 use "Normal" style
365                 if(chpxOffset == 0)
366
367                 {
368                     _characterRuns.add(new ChpxNode(fcStart, fcEnd, new byte[0]));
369                     continue;
370                 }
371
372                 int size = LittleEndian.getUnsignedByte(fkp, chpxOffset);
373
374                 byte[] chpx = new byte[size];
375                 System.arraycopy(fkp, ++chpxOffset, chpx, 0, size);
376                 //_papTable.put(new Integer(fcStart), papx);
377                 _characterRuns.add(new ChpxNode(fcStart, fcEnd, chpx));
378             }
379
380         }
381     }*/

382     /**
383      * Reports every paragraph of the document, and the character runs
     * belonging to each paragraph, to the listener in a single pass over
     * the paragraph bin table. Unlike the six-argument overload it does
     * not use the saved parsing states and does not clip to a range.
     *
     * This method has no caller in this class.
384      */

385     private void initParagraphProperties()
386     {
387         //paragraphs
388
int parOffset = _fib.getFcPlcfbtePapx();
389         int parPlcSize = _fib.getLcbPlcfbtePapx();
390
391         //characters
392
int charOffset = _fib.getFcPlcfbteChpx();
393         int charPlcSize = _fib.getLcbPlcfbteChpx();
394
395         PlexOfCps charPlcf = new PlexOfCps(charPlcSize, 4);
396         PlexOfCps parPlcf = new PlexOfCps(parPlcSize, 4);
397
398         //Load the first character page; the character cursor below is
        //shared by all paragraphs and only ever moves forward
399
int currentCharPage = LittleEndian.getInt(_tableBuffer, charOffset + charPlcf.getStructOffset(0));
400         int charPlcfLen = charPlcf.length();
401         int currentPageIndex = 0;
402         byte[] fkp = new byte[512];
403         System.arraycopy(_mainDocument, (currentCharPage * 512), fkp, 0, 512);
404         CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(fkp);
405         int currentChpxIndex = 0;
406         int currentArraySize = cfkp.size();
407
408
409         int arraySize = parPlcf.length();
410
411         //first we must go through the bin table and find the fkps
412
for(int x = 0; x < arraySize; x++)
413         {
            // page number of the x-th paragraph page (512-byte units in the main stream)
414             int PN = LittleEndian.getInt(_tableBuffer, parOffset + parPlcf.getStructOffset(x));
415
416             fkp = new byte[512];
417             System.arraycopy(_mainDocument, (PN * 512), fkp, 0, 512);
418
419             PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(fkp);
420             //take each fkp and get the paps
421
int crun = pfkp.size();
422             for(int y = 0; y < crun; y++)
423             {
424                 //get the beginning fc of each paragraph text run
425
int fcStart = pfkp.getStart(y);
426                 int fcEnd = pfkp.getEnd(y);
427
428                 //get the papx for this paragraph
429
byte[] papx = pfkp.getGrpprl(y);
430
431                 _listener.paragraph(new PapxNode(fcStart, fcEnd, papx));
432
433                 //get the character runs for this paragraph
434
int charStart = 0;
435                 int charEnd = 0;
436                 //add the character runs
437
do
438                 {
439                   if (currentChpxIndex < currentArraySize)
440                   {
441                     charStart = cfkp.getStart(currentChpxIndex);
442                     charEnd = cfkp.getEnd(currentChpxIndex);
443                     byte[] chpx = cfkp.getGrpprl(currentChpxIndex);
444                     _listener.characterRun(new ChpxNode(charStart, charEnd, chpx));
445                     if (charEnd < fcEnd)
446                     {
447                       currentChpxIndex++;
448                     }
449                     else
450                     {
                      // run reaches the paragraph end: stay on it for the next paragraph
451                       break;
452                     }
453                   }
454                   else
455                   {
                    // character page exhausted: load the next one from the bin table
456                     currentCharPage = LittleEndian.getInt(_tableBuffer, charOffset + charPlcf.getStructOffset(++currentPageIndex));
457                     fkp = new byte[512];
458                     System.arraycopy(_mainDocument, (currentCharPage * 512), fkp, 0, 512);
459                     cfkp = new CHPFormattedDiskPage(fkp);
460                     currentChpxIndex = 0;
461                     currentArraySize = cfkp.size();
462                   }
463                 }
                // NOTE(review): this compares a page NUMBER (currentCharPage)
                // with the number of bin table entries; the six-argument
                // overload compares the page INDEX instead - looks like
                // currentPageIndex was intended here, confirm before reviving.
464                 while(currentCharPage <= charPlcfLen + 1);
465
466             }
467
468         }
469
470     }
471     private void initParsingStates(int parOffset, PlexOfCps parPlcf, int charOffset, PlexOfCps charPlcf)
472     {
473         int currentCharPage = LittleEndian.getInt(_tableBuffer, charOffset + charPlcf.getStructOffset(0));
474         byte[] fkp = new byte[512];
475         System.arraycopy(_mainDocument, (currentCharPage * 512), fkp, 0, 512);
476         CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(fkp);
477         _charParsingState = new ParsingState(currentCharPage, cfkp);
478
479         int currentParPage = LittleEndian.getInt(_tableBuffer, parOffset + parPlcf.getStructOffset(0));
480         fkp = new byte[512];
481         System.arraycopy(_mainDocument, (currentParPage * 512), fkp, 0, 512);
482         PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(fkp);
483         _parParsingState = new ParsingState(currentParPage, pfkp);
484     }
485     /**
486      * Walks the section table (plcfsed) in the table stream and reports
     * every section to the listener, together with the paragraphs and
     * character runs it contains. Sections are reported as body sections
     * until one ends beyond the main text (fcMin + ccpText); that section
     * and all following ones are reported as header sections. Finally
     * the listener is told that the sections are complete.
487      */

488     private void initSectionProperties()
489     {
490
491       int ccpText = _fib.getCcpText();
      // ccpFtn is read but not used in this method
492       int ccpFtn = _fib.getCcpFtn();
493
494       //sections
495
int fcMin = _fib.getFcMin();
496       int plcfsedFC = _fib.getFcPlcfsed();
497       int plcfsedSize = _fib.getLcbPlcfsed();
498
499       //paragraphs
500
int parOffset = _fib.getFcPlcfbtePapx();
501       int parPlcSize = _fib.getLcbPlcfbtePapx();
502
503       //characters
504
int charOffset = _fib.getFcPlcfbteChpx();
505       int charPlcSize = _fib.getLcbPlcfbteChpx();
506
507       PlexOfCps charPlcf = new PlexOfCps(charPlcSize, 4);
508       PlexOfCps parPlcf = new PlexOfCps(parPlcSize, 4);
509
      // position both parsing states on their first page before any section is walked
510       initParsingStates(parOffset, parPlcf, charOffset, charPlcf);
511
512       //byte[] plcfsed = new byte[plcfsedSize];
513
//System.arraycopy(_tableBuffer, plcfsedFC, plcfsed, 0, plcfsedSize);
514

      // section table entries carry a 12-byte structure each
515       PlexOfCps plcfsed = new PlexOfCps(plcfsedSize, 12);
516       int arraySize = plcfsed.length();
517
      // start is assigned but not used below
518       int start = fcMin;
      // end of the main body text
519       int end = fcMin + ccpText;
520       int x = 0;
521       int sectionEnd = 0;
522
523       //do the main body sections
524
while (x < arraySize)
525       {
          // section bounds are stored relative to fcMin
526           int sectionStart = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x)) + fcMin;
527           sectionEnd = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x + 1)) + fcMin;
          // the entry holds (2 bytes in) the main-stream position of the sepx,
          // which starts with a 2-byte length
528           int sepxStart = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getStructOffset(x) + 2);
529           int sepxSize = LittleEndian.getShort(_mainDocument, sepxStart);
530
531           byte[] sepx = new byte[sepxSize];
532           System.arraycopy(_mainDocument, sepxStart + 2, sepx, 0, sepxSize);
533           SepxNode node = new SepxNode(x + 1, sectionStart, sectionEnd, sepx);
534           _listener.bodySection(node);
535           initParagraphProperties(parOffset, parPlcf, charOffset, charPlcf, sectionStart, Math.min(end, sectionEnd));
536
537           if (sectionEnd > end)
538           {
            // x is NOT advanced here, so the loop below visits this same
            // section again and reports its part beyond the body text as
            // a header section
539             break;
540           }
541           else
542           {
543             x++;
544           }
545       }
546       //do the header sections
547
for (; x < arraySize; x++)// && sectionEnd <= end; x++)
548
{
549           int sectionStart = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x)) + fcMin;
550           sectionEnd = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x + 1)) + fcMin;
551           int sepxStart = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getStructOffset(x) + 2);
552           int sepxSize = LittleEndian.getShort(_mainDocument, sepxStart);
553
554           byte[] sepx = new byte[sepxSize];
555           System.arraycopy(_mainDocument, sepxStart + 2, sepx, 0, sepxSize);
556           SepxNode node = new SepxNode(x + 1, sectionStart, sectionEnd, sepx);
557           _listener.hdrSection(node);
          // paragraphs are walked from the end of the body text onwards
558           initParagraphProperties(parOffset, parPlcf, charOffset, charPlcf, Math.max(sectionStart, end), sectionEnd);
559
560       }
561       _listener.endSections();
562     }
563     /**
564      * Initializes the DocumentProperties object unique to this document.
565      */

566     private void initDocumentProperties()
567     {
568         int pos = _fib.getFcDop();
569         int size = _fib.getLcbDop();
570         byte[] dopArray = new byte[size];
571
572         System.arraycopy(_tableBuffer, pos, dopArray, 0, size);
573         _listener.document(new DocumentProperties(dopArray));
574     }
575     /**
576      * Uncompresses the StyleSheet from file into memory.
577      */

578     private void createStyleSheet()
579     {
580       int stshIndex = _fib.getFcStshf();
581       int stshSize = _fib.getLcbStshf();
582       byte[] stsh = new byte[stshSize];
583       System.arraycopy(_tableBuffer, stshIndex, stsh, 0, stshSize);
584
585       _listener.styleSheet(new StyleSheet(stsh));
586     }
587     /**
588      * Initializes the list tables for this document
589      */

590     private void createListTables()
591     {
592         int lfoOffset = _fib.getFcPlfLfo();
593         int lfoSize = _fib.getLcbPlfLfo();
594         byte[] plflfo = new byte[lfoSize];
595
596         System.arraycopy(_tableBuffer, lfoOffset, plflfo, 0, lfoSize);
597
598         int lstOffset = _fib.getFcPlcfLst();
599         int lstSize = _fib.getLcbPlcfLst();
600         if (lstOffset > 0 && lstSize > 0)
601         {
602           // The lstSize returned by _fib.getLcbPlcfLst() doesn't appear
603
// to take into account any LVLs. Therefore, we recalculate
604
// lstSize based on where the LFO section begins (because the
605
// LFO section immediately follows the LST section).
606
lstSize = lfoOffset - lstOffset;
607           byte[] plcflst = new byte[lstSize];
608           System.arraycopy(_tableBuffer, lstOffset, plcflst, 0, lstSize);
609           _listener.lists(new ListTables(plcflst, plflfo));
610         }
611     }
612     /**
613      * Initializes this document's FontTable;
614      */

615     private void createFontTable()
616     {
617         int fontTableIndex = _fib.getFcSttbfffn();
618         int fontTableSize = _fib.getLcbSttbfffn();
619         byte[] fontTable = new byte[fontTableSize];
620         System.arraycopy(_tableBuffer, fontTableIndex, fontTable, 0, fontTableSize);
621         _listener.fonts(new FontTable(fontTable));
622     }
623
624 }
625
Popular Tags