KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > util > ms > DefaultBlockFileSystem


1 /* DefaultBlockFileSystem
2 *
3 * Created on September 12, 2006
4 *
5 * Copyright (C) 2006 Internet Archive.
6 *
7 * This file is part of the Heritrix web crawler (crawler.archive.org).
8 *
9 * Heritrix is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU Lesser Public License as published by
11 * the Free Software Foundation; either version 2.1 of the License, or
12 * any later version.
13 *
14 * Heritrix is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU Lesser Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser Public License
20 * along with Heritrix; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */

23 package org.archive.util.ms;
24
25 import java.io.IOException JavaDoc;
26 import java.nio.ByteBuffer JavaDoc;
27 import java.nio.ByteOrder JavaDoc;
28 import java.util.Map JavaDoc;
29
30 import org.archive.io.SeekInputStream;
31 import org.archive.util.IoUtils;
32 import org.archive.util.LRU;
33
34
35 /**
36  * Default implementation of the Block File System.
37  *
38  * <p>The overall structure of a BlockFileSystem file (such as a .doc file) is
39  * as follows. The file is divided into blocks, which are of uniform length
40  * (512 bytes). The first block (at file pointer 0) is called the header
41  * block. It's used to look up other blocks in the file.
42  *
43  * <p>Subfiles contained within the .doc file are organized using a Block
44  * Allocation Table, or BAT. The BAT is basically a linked list; given a
45  * block number, the BAT will tell you the next block number. Note that
46  * the header block has no number; block #0 is the first block after the
47  * header. Thus, to convert a block number to a file pointer:
48  * <code>int filePointer = (blockNumber + 1) * BLOCK_SIZE</code>.
49  *
50  * <p>The BAT itself is discontinuous, however. To find the blocks that
51  * comprise the BAT, you have to look in the header block. The header block
52  * contains an array of 109 pointers to the blocks that comprise the BAT.
53  * If more than 109 BAT blocks are required (in other words, if the .doc
54  * file is larger than ~6 megabytes), then something called the
55  * XBAT comes into play.
56  *
57  * <p>XBAT blocks contain pointers to the 110th BAT block and beyond.
58  * The first XBAT block is stored at a file pointer listed in the header.
59  * The other XBAT blocks are always stored in order after the first; the
60  * XBAT table is continuous. One is inclined to wonder why the BAT itself
61  * is not so stored, but oh well.
62  *
63  * <p>The BAT only tells you the next block for a given block. To find the
64  * first block for a subfile, you have to look up that subfile's directory
65  * entry. Each directory entry is a 128 byte structure in the file, so four
66  * of them fit in a block. The number of the first block of the entry list
67  * is stored in the header. To find subsequent entry blocks, the BAT must
68  * be used.
69  *
70  * <p>I'm telling you all this so that you understand the caching that this
71  * class provides.
72  *
73  * <p>First, directory entries are not cached. It's assumed that they will
74  * be looked up at the beginning of a lengthy operation, and then forgotten
75  * about. This is certainly the case for {@link Doc#getText(BlockFileSystem)}.
76  * If you need to remember directory entries, you can manually store the Entry
77  * objects in a map or something, as they don't grow stale.
78  *
79  * <p>This class keeps all 512 bytes of the header block in memory at all
80  * times. This prevents a potentially expensive file pointer repositioning
81  * every time you're trying to figure out what comes next.
82  *
83  * <p>BAT and XBAT blocks are stored in a least-recently used cache. The
84  * <i>n</i> most recent BAT and XBAT blocks are remembered, where <i>n</i>
85  * is set at construction time. The minimum value of <i>n</i> is 1. For small
86  * files, this can prevent file pointer repositioning for BAT look ups.
87  *
88  * <p>The BAT/XBAT cache only takes up memory as needed. If the specified
89  * cache size is 100 blocks, but the file only has 4 BAT blocks, then only
90  * 2048 bytes will be used by the cache.
91  *
92  * <p>Note this class only caches BAT and XBAT blocks. It does not cache the
93  * blocks that actually make up a subfile's contents. It is assumed that those
94  * blocks will only be accessed once per operation (again, this is what
95  * {Doc.getText(BlockFileSystem)} typically requires.)
96  *
97  * @author pjack
98  * @see http://jakarta.apache.org/poi/poifs/fileformat.html
99  */

100 public class DefaultBlockFileSystem implements BlockFileSystem {
101
102
103     /**
104      * Pointers per BAT block.
105      */

106     final private static int POINTERS_PER_BAT = 128;
107
108
109     /**
110      * Size of a BAT pointer in bytes. (In other words, 4).
111      */

112     final private static int BAT_POINTER_SIZE = BLOCK_SIZE / POINTERS_PER_BAT;
113
114     
115     /**
116      * The number of BAT pointers in the header block. After this many
117      * BAT blocks, the XBAT blocks must be consulted.
118      */

119     final private static int HEADER_BAT_LIMIT = 109;
120     
121     
122     /**
123      * The size of an entry record in bytes.
124      */

125     final private static int ENTRY_SIZE = 128;
126     
127     
128     /**
129      * The number of entries that can fit in a block.
130      */

131     final private static int ENTRIES_PER_BLOCK = BLOCK_SIZE / ENTRY_SIZE;
132
133
134     /**
135      * The .doc file as a stream.
136      */

137     private SeekInputStream input;
138     
139     
140     /**
141      * The header block.
142      */

143     private HeaderBlock header;
144
145
146     /**
147      * Cache of BAT and XBAT blocks.
148      */

149     private Map JavaDoc<Integer JavaDoc,ByteBuffer JavaDoc> cache;
150     
151     
152     /**
153      * Constructor.
154      *
155      * @param input the file to read from
156      * @param batCacheSize number of BAT and XBAT blocks to cache
157      * @throws IOException if an IO error occurs
158      */

159     public DefaultBlockFileSystem(SeekInputStream input, int batCacheSize)
160     throws IOException JavaDoc {
161         this.input = input;
162         byte[] temp = new byte[BLOCK_SIZE];
163         IoUtils.readFully(input, temp);
164         this.header = new HeaderBlock(ByteBuffer.wrap(temp));
165         this.cache = new LRU<Integer JavaDoc,ByteBuffer JavaDoc>(batCacheSize);
166     }
167
168
169     public Entry getRoot() throws IOException JavaDoc {
170         // Position to the first block of the entry list.
171
int block = header.getEntriesStart();
172         input.position((block + 1) * BLOCK_SIZE);
173         
174         // The root entry is always entry #0.
175
return new DefaultEntry(this, input, 0);
176     }
177
178
179     /**
180      * Returns the entry with the given number.
181      *
182      * @param entryNumber the number of the entry to return
183      * @return that entry, or null if no such entry exists
184      * @throws IOException if an IO error occurs
185      */

186     Entry getEntry(int entryNumber) throws IOException JavaDoc {
187         // Entry numbers < 0 typically indicate an end-of-stream.
188
if (entryNumber < 0) {
189             return null;
190         }
191         
192         // It's impossible to check against the upper bound, because the
193
// upper bound is not recorded anywhere.
194

195         // Advance to the block containing the desired entry.
196
int blockCount = entryNumber / ENTRIES_PER_BLOCK;
197         int remainder = entryNumber % ENTRIES_PER_BLOCK;
198         int block = header.getEntriesStart();
199         for (int i = 0; i < blockCount; i++) {
200             block = getNextBlock(block);
201         }
202         
203         if (block < 0) {
204             // Given entry number exceeded the number of available entries.
205
return null;
206         }
207
208         int filePos = (block + 1) * BLOCK_SIZE + remainder * ENTRY_SIZE;
209         input.position(filePos);
210         
211         return new DefaultEntry(this, input, entryNumber);
212     }
213
214
215     public int getNextBlock(int block) throws IOException JavaDoc {
216         if (block < 0) {
217             return block;
218         }
219         
220         // Index into the header array of BAT blocks.
221
int headerIndex = block / POINTERS_PER_BAT;
222         
223         // Index within that BAT block of the block we're interested in.
224
int batBlockIndex = block % POINTERS_PER_BAT;
225
226         int batBlockNumber = batLookup(headerIndex);
227         ByteBuffer JavaDoc batBlock = getBATBlock(batBlockNumber);
228         return batBlock.getInt(batBlockIndex * BAT_POINTER_SIZE);
229     }
230
231     
232     /**
233      * Looks up the block number of a BAT block.
234      *
235      * @param headerIndex
236      * @return
237      * @throws IOException
238      */

239     private int batLookup(int headerIndex) throws IOException JavaDoc {
240         if (headerIndex < HEADER_BAT_LIMIT + 1) {
241             return header.getBATBlockNumber(headerIndex);
242         }
243         
244         // Find the XBAT block of interest
245
headerIndex -= HEADER_BAT_LIMIT;
246         int xbatBlockNumber = headerIndex / POINTERS_PER_BAT;
247         xbatBlockNumber += header.getExtendedBATStart();
248         ByteBuffer JavaDoc xbat = getBATBlock(xbatBlockNumber);
249
250         // Find the bat Block number inside the XBAT block
251
int xbatBlockIndex = headerIndex % POINTERS_PER_BAT;
252         return xbat.getInt(xbatBlockIndex * BAT_POINTER_SIZE);
253     }
254
255     
256     /**
257      * Returns the BAT block with the given block number.
258      * If the BAT block were previously cached, then the cached version
259      * is returned. Otherwise, the file pointer is repoisitioned to
260      * the start of the given block, and the 512 bytes are read and
261      * stored in the cache.
262      *
263      * @param block the block number of the BAT block to return
264      * @return the BAT block
265      * @throws IOException
266      */

267     private ByteBuffer JavaDoc getBATBlock(int block) throws IOException JavaDoc {
268         ByteBuffer JavaDoc r = cache.get(block);
269         if (r != null) {
270             return r;
271         }
272
273         byte[] buf = new byte[BLOCK_SIZE];
274         input.position((block + 1) * BLOCK_SIZE);
275         IoUtils.readFully(input, buf);
276
277         r = ByteBuffer.wrap(buf);
278         r.order(ByteOrder.LITTLE_ENDIAN);
279         cache.put(block, r);
280         return r;
281     }
282
283
284     public SeekInputStream getRawInput() {
285         return input;
286     }
287 }
288
Popular Tags