KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > util > ms > PieceTable


1 /* PieceTable
2 *
3 * Created on September 12, 2006
4 *
5 * Copyright (C) 2006 Internet Archive.
6 *
7 * This file is part of the Heritrix web crawler (crawler.archive.org).
8 *
9 * Heritrix is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU Lesser Public License as published by
11 * the Free Software Foundation; either version 2.1 of the License, or
12 * any later version.
13 *
14 * Heritrix is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU Lesser Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser Public License
20 * along with Heritrix; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */

23 package org.archive.util.ms;
24
25 import java.io.IOException JavaDoc;
26 import java.util.logging.Level JavaDoc;
27 import java.util.logging.Logger JavaDoc;
28
29 import org.archive.io.BufferedSeekInputStream;
30 import org.archive.io.Endian;
31 import org.archive.io.OriginSeekInputStream;
32 import org.archive.io.SafeSeekInputStream;
33 import org.archive.io.SeekInputStream;
34
35
36 /**
37  * The piece table of a .doc file.
38  *
39  * <p>The piece table maps logical character positions of a document's text
40  * stream to actual file stream positions. The piece table is stored as two
41  * parallel arrays. The first array contains 32-bit integers representing
42  * the logical character positions. The second array contains 64-bit data
43  * structures that are mostly mysterious to me, except that they contain a
44  * 32-bit subfile offset. The second array is stored immediately after the
45  * first array. I call the first array the <i>charPos</i> array and the
46  * second array the <i>filePos</i> array.
47  *
48  * <p>The arrays are preceded by a special tag byte (2), followed by the
49  * combined size of both arrays in bytes. The number of piece table entries
50  * must be deduced from this byte size.
51  *
52  * <p>Because of this bizarre structure, caching piece table entries is
53  * something of a challenge. A single piece table entry is actually located
54  * in two different file locations. If there are many piece table entries,
55  * then the charPos and filePos information may be separated by many bytes,
56  * potentially crossing block boundaries. The approach I took was to use
57  * two different buffered streams. Up to n charPos offsets and n filePos
58  * structures can be buffered in the two streams, preventing any file seeking
59  * from occurring when looking up piece information. (File seeking must
60  * still occur to jump from one piece to the next.)
61  *
62  * <p>Note that the vast majority of .doc files in the world will have exactly
63  * 1 piece table entry, representing the complete text of the document. Only
64  * those documents that were "fast-saved" should have multiple pieces.
65  *
66  * <p>Finally, the text contained in a .doc file can either contain 16-bit
67  * unicode characters (charset UTF-16LE) or 8-bit CP1252 characters. One
68  * .doc file can contain both kinds of pieces. Whether or not a piece is
69  * Cp1252 is stored as a flag in the filePos value, bizarrely enough. If
70  * the flag is set, then the actual file position is the filePos with the
71  * flag cleared, then divided by 2.
72  *
73  * @author pjack
74  */

75 class PieceTable {
76
77     final static Logger JavaDoc LOGGER
78      = Logger.getLogger(PieceTable.class.getName());
79
80     /** The bit that indicates if a piece uses Cp1252 or unicode. */
81     final static int CP1252_INDICATOR = 1 << 30;
82     
83     /** The mask to use to clear the Cp1252 flag bit. */
84     final static int CP1252_MASK = ~(3 << 30);
85
86     /** The total number of pieces in the table. */
87     private int count;
88     
89     /** The total number of characters in the text stream. */
90     private int maxCharPos;
91
92     /** The index of the current piece. */
93     private int current;
94     
95     /** The most recently returned piece from this table. */
96     private Piece currentPiece;
97
98
99     /** The buffered stream that provides character position information. */
100     private SeekInputStream charPos;
101     
102     /** The buffered stream that provides file pointer information. */
103     private SeekInputStream filePos;
104
105
106     /**
107      * Constructor.
108      *
109      * @param tableStream the stream containing the piece table
110      * @param offset the starting offset of the piece table
111      * @param maxCharPos the total number of characters in the document
112      * @param cachedRecords the number of piece table entries to cache
113      * @throws IOException if an IO error occurs
114      */

115     public PieceTable(SeekInputStream tableStream, int offset,
116             int maxCharPos, int cachedRecords) throws IOException JavaDoc {
117         tableStream.position(offset);
118         skipProperties(tableStream);
119         int sizeInBytes = Endian.littleInt(tableStream);
120         this.count = (sizeInBytes - 4) / 12;
121         cachedRecords = Math.min(cachedRecords, count);
122         long tp = tableStream.position() + 4;
123         long charPosStart = tp;
124         long filePosStart = tp + count * 4 + 2;
125         
126         this.filePos = wrap(tableStream, filePosStart, cachedRecords * 8);
127         this.charPos = wrap(tableStream, charPosStart, cachedRecords * 4);
128         this.maxCharPos = maxCharPos;
129         
130         if (LOGGER.isLoggable(Level.FINEST)) {
131             LOGGER.finest("Size in bytes: " + sizeInBytes);
132             LOGGER.finest("Piece table count: " + count);
133             for (Piece piece = next(); piece != null; piece = next()) {
134                 LOGGER.finest("#" + current + ": " + piece.toString());
135             }
136             current = 0;
137         }
138     }
139     
140     
141     /**
142      * Wraps the raw table stream. This is used to create the charPos and
143      * filePos streams. The streams that this method returns are "safe",
144      * meaning that the charPos and filePos position() fields never clobber
145      * each other. They are buffered, meaning that up to <i>n</i> elements
146      * can be read before the disk is accessed again. And they are "origined",
147      * meaning result.position(0) actually positions the stream at the
148      * beginning of the piece table array, not the beginning of the file.
149      *
150      * @param input the stream to wrap
151      * @param pos the origin for the returned stream
152      * @param cache the number of bytes for the returned stream to buffer
153      * @return the wrapped stream
154      * @throws IOException if an IO error occurs
155      */

156     private SeekInputStream wrap(SeekInputStream input, long pos, int cache)
157     throws IOException JavaDoc {
158         input.position(pos);
159         SeekInputStream r = new SafeSeekInputStream(input);
160         r = new OriginSeekInputStream(r, pos);
161         r = new BufferedSeekInputStream(r, cache);
162         return r;
163     }
164     
165     
166     /**
167      * Skips over any property information that may precede a piece table.
168      * These property structures contain stylesheet information that applies
169      * to the piece table. Since we're only interested in the text itself,
170      * we just ignore this property stuff. (I suppose a third buffered
171      * stream could be used to add style information to {@link Piece}, but
172      * we don't need it.)
173      *
174      * @param input the input stream containing the piece table
175      * @throws IOException if an IO error occurs
176      */

177     private static void skipProperties(SeekInputStream input) throws IOException JavaDoc {
178         int tag = input.read();
179         while (tag == 1) {
180             int size = Endian.littleChar(input);
181             while (size > 0) {
182                 size -= input.skip(size);
183             }
184             tag = input.read();
185         }
186         if (tag != 2) {
187             throw new IllegalStateException JavaDoc();
188         }
189     }
190
191
192     /**
193      * Returns the maximum character position. Put another way, returns the
194      * total number of characters in the document.
195      *
196      * @return the maximum character position
197      */

198     public int getMaxCharPos() {
199         return maxCharPos;
200     }
201
202
203     /**
204      * Returns the next piece in the piece table.
205      *
206      * @return the next piece in the piece table, or null if there is no
207      * next piece
208      * @throws IOException if an IO error occurs
209      */

210     public Piece next() throws IOException JavaDoc {
211         if (current >= count) {
212             currentPiece = null;
213             return null;
214         }
215                 
216         int cp;
217         if (current == count - 1) {
218             cp = maxCharPos;
219         } else {
220             charPos.position(current * 4);
221             cp = Endian.littleInt(charPos);
222         }
223         filePos.position(current * 8);
224         int encoded = Endian.littleInt(filePos);
225
226         if (LOGGER.isLoggable(Level.FINEST)) {
227             StringBuffer JavaDoc sb = new StringBuffer JavaDoc(Integer.toBinaryString(encoded));
228             while (sb.length() < 32) {
229                 sb.insert(0, '0');
230             }
231             LOGGER.finest("Encoded offset: " + sb.toString());
232         }
233         
234         current++;
235
236         int start;
237         if (currentPiece == null) {
238             start = 0;
239         } else {
240             start = currentPiece.getCharPosLimit();
241         }
242         if ((encoded & CP1252_INDICATOR) == 0) {
243             Piece piece = new Piece(encoded, start, cp, true);
244             currentPiece = piece;
245             return piece;
246         } else {
247             int filePos = (encoded & CP1252_MASK) / 2;
248             Piece piece = new Piece(filePos, start, cp, false);
249             currentPiece = piece;
250             return piece;
251         }
252     }
253
254     
255     /**
256      * Returns the piece containing the given character position.
257      *
258      * @param charPos the character position whose piece to return
259      * @return that piece, or null if no such piece exists (if charPos
260      * is greater than getMaxCharPos())
261      * @throws IOException if an IO error occurs
262      */

263     public Piece pieceFor(int charPos) throws IOException JavaDoc {
264         if (currentPiece.contains(charPos)) {
265             return currentPiece;
266         }
267      
268         // FIXME: Use binary search to find piece index
269

270         current = 0;
271         currentPiece = null;
272         next();
273         
274         while (currentPiece != null) {
275             if (currentPiece.contains(charPos)) {
276                 return currentPiece;
277             }
278             next();
279         }
280         
281         return null;
282     }
283
284 }
285
Popular Tags