KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > io > ByteReplayCharSequence


1 /* ByteReplayCharSequence
2  *
3  * (Re)Created on Dec 21, 2006
4  *
5  * Copyright (C) 2006 Internet Archive.
6  *
7  * This file is part of the Heritrix web crawler (crawler.archive.org).
8  *
9  * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22  */

23 package org.archive.io;
24
25 import java.io.IOException JavaDoc;
26 import java.io.RandomAccessFile JavaDoc;
27 import java.io.UnsupportedEncodingException JavaDoc;
28 import java.util.logging.Level JavaDoc;
29 import java.util.logging.Logger JavaDoc;
30
31 import org.archive.util.DevUtils;
32
33 /**
34  * Provides a (Replay)CharSequence view on recorded stream bytes (a prefix
35  * buffer and overflow backing file).
36  *
37  * Treats the byte stream as 8-bit.
38  *
39  * <p>Uses a wraparound rolling buffer of the last windowSize bytes read
40  * from disk in memory; as long as the 'random access' of a CharSequence
41  * user stays within this window, access should remain fairly efficient.
42  * (So design any regexps pointed at these CharSequences to work within
43  * that range!)
44  *
45  * <p>When rereading of a location is necessary, the whole window is
46  * recentered around the location requested. (TODO: More research
47  * into whether this is the best strategy.)
48  *
49  * <p>An implementation of a ReplayCharSequence done with ByteBuffers -- one
50  * to wrap the passed prefix buffer and the second, a memory-mapped
51  * ByteBuffer view into the backing file -- was consistently slower: ~10%.
52  * My tests did the following. Made a buffer filled w/ regular content.
53  * This buffer was used as the prefix buffer. The buffer content was
54  * written MULTIPLER times to a backing file. I then did accesses w/ the
55  * following pattern: Skip forward 32 bytes, then back 16 bytes, and then
56  * read forward from byte 16-32. Repeat. Though I varied the size of the
57  * buffer to the size of the backing file,from 3-10, the difference of 10%
58  * or so seemed to persist. Same if I tried to favor get() over get(index).
59  * I used a profiler, JMP, to study times taken (St.Ack did above comment).
60  *
61  * <p>TODO determine if memory mapped files are a better way to do this;
62  * probably not -- they don't offer the level of control over
63  * total memory used that this approach does.
64  *
65  * @author Gordon Mohr
66  * @version $Revision: 1.2.2.1 $, $Date: 2007/01/13 01:31:32 $
67  */

68 class ByteReplayCharSequence implements ReplayCharSequence {
69
70     protected static Logger JavaDoc logger =
71         Logger.getLogger(ByteReplayCharSequence.class.getName());
72
73     /**
74      * Buffer that holds the first bit of content.
75      *
76      * Once this is exhausted we go to the backing file.
77      */

78     private byte[] prefixBuffer;
79
80     /**
81      * Total length of character stream to replay minus the HTTP headers
82      * if present.
83      *
84      * Used to find EOS.
85      */

86     protected int length;
87
88     /**
89      * Absolute length of the stream.
90      *
91      * Includes HTTP headers. Needed doing calc. in the below figuring
92      * how much to load into buffer.
93      */

94     private int absoluteLength = -1;
95
96     /**
97      * Buffer window on to backing file.
98      */

99     private byte[] wraparoundBuffer;
100
101     /**
102      * Absolute index into underlying bytestream where wrap starts.
103      */

104     private int wrapOrigin;
105
106     /**
107      * Index in wraparoundBuffer that corresponds to wrapOrigin
108      */

109     private int wrapOffset;
110
111     /**
112      * Name of backing file we go to when we've exhausted content from the
113      * prefix buffer.
114      */

115     private String JavaDoc backingFilename;
116
117     /**
118      * Random access to the backing file.
119      */

120     private RandomAccessFile JavaDoc raFile;
121
122     /**
123      * Offset into prefix buffer at which content beings.
124      */

125     private int contentOffset;
126
127     /**
128      * 8-bit encoding used reading single bytes from buffer and
129      * stream.
130      */

131     private static final String JavaDoc DEFAULT_SINGLE_BYTE_ENCODING =
132         "ISO-8859-1";
133
134
135     /**
136      * Constructor.
137      *
138      * @param buffer In-memory buffer of recordings prefix. We read from
139      * here first and will only go to the backing file if <code>size</code>
140      * requested is greater than <code>buffer.length</code>.
141      * @param size Total size of stream to replay in bytes. Used to find
142      * EOS. This is total length of content including HTTP headers if
143      * present.
144      * @param responseBodyStart Where the response body starts in bytes.
145      * Used to skip over the HTTP headers if present.
146      * @param backingFilename Path to backing file with content in excess of
147      * whats in <code>buffer</code>.
148      *
149      * @throws IOException
150      */

151     public ByteReplayCharSequence(byte[] buffer, long size,
152             long responseBodyStart, String JavaDoc backingFilename)
153         throws IOException JavaDoc {
154
155         this.length = (int)(size - responseBodyStart);
156         this.absoluteLength = (int)size;
157         this.prefixBuffer = buffer;
158         this.contentOffset = (int)responseBodyStart;
159
160         // If amount to read is > than what is in our prefix buffer, then
161
// open the backing file.
162
if (size > buffer.length) {
163             this.backingFilename = backingFilename;
164             this.raFile = new RandomAccessFile JavaDoc(backingFilename, "r");
165             this.wraparoundBuffer = new byte[this.prefixBuffer.length];
166             this.wrapOrigin = this.prefixBuffer.length;
167             this.wrapOffset = 0;
168             loadBuffer();
169         }
170     }
171
172     /**
173      * @return Length of characters in stream to replay. Starts counting
174      * at the HTTP header/body boundary.
175      */

176     public int length() {
177         return this.length;
178     }
179
180     /**
181      * Get character at passed absolute position.
182      *
183      * Called by {@link #charAt(int)} which has a relative index into the
184      * content, one that doesn't account for HTTP header if present.
185      *
186      * @param index Index into content adjusted to accomodate initial offset
187      * to get us past the HTTP header if present (i.e.
188      * {@link #contentOffset}).
189      *
190      * @return Characater at offset <code>index</code>.
191      */

192     public char charAt(int index) {
193         int c = -1;
194         // Add to index start-of-content offset to get us over HTTP header
195
// if present.
196
index += this.contentOffset;
197         if (index < this.prefixBuffer.length) {
198             // If index is into our prefix buffer.
199
c = this.prefixBuffer[index];
200         } else if (index >= this.wrapOrigin &&
201             (index - this.wrapOrigin) < this.wraparoundBuffer.length) {
202             // If index is into our buffer window on underlying backing file.
203
c = this.wraparoundBuffer[
204                     ((index - this.wrapOrigin) + this.wrapOffset) %
205                         this.wraparoundBuffer.length];
206         } else {
207             // Index is outside of both prefix buffer and our buffer window
208
// onto the underlying backing file. Fix the buffer window
209
// location.
210
c = faultCharAt(index);
211         }
212         // Stream is treated as single byte. Make sure characters returned
213
// are not negative.
214
return (char)(c & 0xff);
215     }
216
217     /**
218      * Get a character that's outside the current buffers.
219      *
220      * will cause the wraparoundBuffer to be changed to
221      * cover a region including the index
222      *
223      * if index is higher than the highest index in the
224      * wraparound buffer, buffer is moved forward such
225      * that requested char is last item in buffer
226      *
227      * if index is lower than lowest index in the
228      * wraparound buffer, buffet is reset centered around
229      * index
230      *
231      * @param index Index of character to fetch.
232      * @return A character that's outside the current buffers
233      */

234     private int faultCharAt(int index) {
235         if(Thread.interrupted()) {
236             throw new RuntimeException JavaDoc("thread interrupted");
237         }
238         if(index >= this.wrapOrigin + this.wraparoundBuffer.length) {
239             // Moving forward
240
while (index >= this.wrapOrigin + this.wraparoundBuffer.length)
241             {
242                 // TODO optimize this
243
advanceBuffer();
244             }
245             return charAt(index - this.contentOffset);
246         }
247         // Moving backward
248
recenterBuffer(index);
249         return charAt(index - this.contentOffset);
250     }
251
252     /**
253      * Move the buffer window on backing file back centering current access
254      * position in middle of window.
255      *
256      * @param index Index of character to access.
257      */

258     private void recenterBuffer(int index) {
259         if (logger.isLoggable(Level.FINE)) {
260             logger.fine("Recentering around " + index + " in " +
261                 this.backingFilename);
262         }
263         this.wrapOrigin = index - (this.wraparoundBuffer.length / 2);
264         if(this.wrapOrigin < this.prefixBuffer.length) {
265             this.wrapOrigin = this.prefixBuffer.length;
266         }
267         this.wrapOffset = 0;
268         loadBuffer();
269     }
270
271     /**
272      * Load from backing file into the wrapper buffer.
273      */

274     private void loadBuffer()
275     {
276         long len = -1;
277         try {
278             len = this.raFile.length();
279             this.raFile.seek(this.wrapOrigin - this.prefixBuffer.length);
280             this.raFile.readFully(this.wraparoundBuffer, 0,
281                 Math.min(this.wraparoundBuffer.length,
282                      this.absoluteLength - this.wrapOrigin));
283         }
284
285         catch (IOException JavaDoc e) {
286             // TODO convert this to a runtime error?
287
DevUtils.logger.log (
288                 Level.SEVERE,
289                 "raFile.seek(" +
290                 (this.wrapOrigin - this.prefixBuffer.length) +
291                 ")\n" +
292                 "raFile.readFully(wraparoundBuffer,0," +
293                 (Math.min(this.wraparoundBuffer.length,
294                     this.length - this.wrapOrigin )) +
295                 ")\n"+
296                 "raFile.length()" + len + "\n" +
297                 DevUtils.extraInfo(),
298                 e);
299             throw new RuntimeException JavaDoc(e);
300         }
301     }
302
303     /**
304      * Roll the wraparound buffer forward one position
305      */

306     private void advanceBuffer() {
307         try {
308             this.wraparoundBuffer[this.wrapOffset] =
309                 (byte)this.raFile.read();
310             this.wrapOffset++;
311             this.wrapOffset %= this.wraparoundBuffer.length;
312             this.wrapOrigin++;
313         } catch (IOException JavaDoc e) {
314             DevUtils.logger.log(Level.SEVERE, "advanceBuffer()" +
315                 DevUtils.extraInfo(), e);
316             throw new RuntimeException JavaDoc(e);
317         }
318     }
319
320     public CharSequence JavaDoc subSequence(int start, int end) {
321         return new CharSubSequence(this, start, end);
322     }
323
324     /**
325      * Cleanup resources.
326      *
327      * @exception IOException Failed close of random access file.
328      */

329     public void close() throws IOException JavaDoc
330     {
331         this.prefixBuffer = null;
332         if (this.raFile != null) {
333             this.raFile.close();
334             this.raFile = null;
335         }
336     }
337
338     /* (non-Javadoc)
339      * @see java.lang.Object#finalize()
340      */

341     protected void finalize() throws Throwable JavaDoc
342     {
343         super.finalize();
344         close();
345     }
346
347     /* (non-Javadoc)
348      * @see org.archive.io.EnhancedCharSequence#substring(int, int)
349      */

350     public String JavaDoc substring(int offset, int len) {
351         StringBuffer JavaDoc ret = new StringBuffer JavaDoc(len);
352         // Add to offset start-of-content offset to get us over HTTP header
353
// if present.
354
offset += this.contentOffset;
355         if (offset < this.prefixBuffer.length) {
356             // Need something from the prefix buffer.
357
int from = offset;
358             // To the end of the buffer
359
int count = this.prefixBuffer.length - from;
360             if (offset + len < this.prefixBuffer.length) {
361                 count = len; // length falls within the buffer.
362
} else {
363                 // Will need more then is in the prefixBuffer.
364
offset = this.prefixBuffer.length + 1;
365                 len = len - count;
366             }
367             // Since we are dealing with a byte buffer we'll have to use
368
// a String and then wrap up in a StringBuffer to concat with
369
// the backing file. TODO: This can probably be optimized.
370
//
371
// Also, force an 8-bit encoding. Default jvm encoding is
372
// usually -- us context -- 7 bit ascii. If we don't force
373
// 8-bit, characters above 127 are considered rubbish.
374
try {
375                 ret.append(new String JavaDoc(this.prefixBuffer,from,count,
376                     DEFAULT_SINGLE_BYTE_ENCODING));
377             }
378             catch (UnsupportedEncodingException JavaDoc e) {
379                 logger.severe("Failed encoding string: " + e.getMessage());
380             }
381         }
382         if (offset >= this.prefixBuffer.length) {
383             // TODO: Maybe better performance can be gained by reading
384
// blocks from files.
385
int to = offset + len;
386             for(int i = offset ; i < to ; i++) {
387                 ret.append(charAt(i - this.contentOffset));
388             }
389         }
390
391         return ret.toString();
392     }
393     
394     public String JavaDoc toString() {
395         return substring(0, length());
396     }
397 }
Popular Tags