KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > io > MultiByteReplayCharSequence


1 /* MultiByteReplayCharSequenceFactory
2  *
3  * (Re)Created on Dec 21, 2006
4  *
5  * Copyright (C) 2006 Internet Archive.
6  *
7  * This file is part of the Heritrix web crawler (crawler.archive.org).
8  *
9  * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22  */

23 package org.archive.io;
24
25 import java.io.BufferedReader JavaDoc;
26 import java.io.BufferedWriter JavaDoc;
27 import java.io.File JavaDoc;
28 import java.io.FileInputStream JavaDoc;
29 import java.io.FileNotFoundException JavaDoc;
30 import java.io.FileOutputStream JavaDoc;
31 import java.io.IOException JavaDoc;
32 import java.io.InputStreamReader JavaDoc;
33 import java.io.OutputStreamWriter JavaDoc;
34 import java.io.Writer JavaDoc;
35 import java.nio.ByteBuffer JavaDoc;
36 import java.nio.CharBuffer JavaDoc;
37 import java.nio.channels.FileChannel JavaDoc;
38 import java.nio.charset.Charset JavaDoc;
39 import java.nio.charset.CharsetDecoder JavaDoc;
40 import java.nio.charset.CoderResult JavaDoc;
41 import java.nio.charset.CodingErrorAction JavaDoc;
42 import java.util.logging.Level JavaDoc;
43 import java.util.logging.Logger JavaDoc;
44
45 /**
46  * Provides a (Replay)CharSequence view on recorded streams (a prefix
47  * buffer and overflow backing file) that can handle streams of multibyte
48  * characters.
49  *
50  * If possible, use {@link ByteReplayCharSequence}. It performs better even
51  * for the single byte case (Decoding is an expensive process).
52  *
53  * <p>Call close on this class when done so can clean up resources.
54  *
55  * <p>Implementation currently works by checking to see if content to read
56  * all fits the in-memory buffer. If so, we decode into a CharBuffer and
57  * keep this around for CharSequence operations. This CharBuffer is
58  * discarded on close.
59  *
60  * <p>If content length is greater than in-memory buffer, we decode the
61  * buffer plus backing file into a new file named for the backing file w/
62  * a suffix of the encoding we write the file as. We then run w/ a
63  * memory-mapped CharBuffer against this file to implement CharSequence.
64  * Reasons for this implementation are that CharSequence wants to return the
65  * length of the CharSequence.
66  *
67  * <p>Obvious optimizations would keep around decodings whether the
68  * in-memory decoded buffer or the file of decodings written to disk but the
69  * general usage pattern processing URIs is that the decoding is used by one
70  * processor only. Also of note, files usually fit into the in-memory
71  * buffer.
72  *
73  * <p>We might also be able to keep up 3 windows that moved across the file
74  * decoding a window at a time trying to keep one of the buffers just in
75  * front of the regex processing returning it a length that would be only
76  * the length of current position to end of current block or else the length
77  * could be got by multiplying the backing file's length by the decoder's
78  * estimate of average character size. This would save us writing out the
79  * decoded file. We'd have to do the latter for files that are
80  * > Integer.MAX_VALUE.
81  *
82  * @author stack
83  * @version $Revision: 1.3.2.1 $, $Date: 2007/01/15 18:55:09 $
84  */

85 public class MultiByteReplayCharSequence implements ReplayCharSequence {
86
87     protected static Logger JavaDoc logger =
88         Logger.getLogger(MultiByteReplayCharSequence.class.getName());
89     
90     /**
91      * Name of the encoding we use writing out concatenated decoded prefix
92      * buffer and decoded backing file.
93      *
94      * <p>This define is also used as suffix for the file that holds the
95      * decodings. The name of the file that holds the decoding is the name
96      * of the backing file w/ this encoding for a suffix.
97      *
98      * <p>See <a ref="http://java.sun.com/j2se/1.4.2/docs/guide/intl/encoding.doc.html">Encoding</a>.
99      */

100     private static final String JavaDoc WRITE_ENCODING = "UTF-16BE";
101
102     /**
103      * CharBuffer of decoded content.
104      *
105      * Content of this buffer is unicode.
106      */

107     private CharBuffer JavaDoc content = null;
108
109     /**
110      * File that has decoded content.
111      *
112      * Keep it around so we can remove on close.
113      */

114     private File JavaDoc decodedFile = null;
115
116
117     /**
118      * Constructor for all in-memory operation.
119      *
120      * @param buffer In-memory buffer of recordings prefix. We read from
121      * here first and will only go to the backing file if <code>size</code>
122      * requested is greater than <code>buffer.length</code>.
123      * @param size Total size of stream to replay in bytes. Used to find
124      * EOS. This is total length of content including HTTP headers if
125      * present.
126      * @param responseBodyStart Where the response body starts in bytes.
127      * Used to skip over the HTTP headers if present.
128      * @param backingFilename Path to backing file with content in excess of
129      * whats in <code>buffer</code>.
130      * @param encoding Encoding to use reading the passed prefix buffer and
131      * backing file. For now, should be java canonical name for the
132      * encoding. (If null is passed, we will default to
133      * ByteReplayCharSequence).
134      *
135      * @throws IOException
136      */

137     public MultiByteReplayCharSequence(byte[] buffer, long size,
138             long responseBodyStart, String JavaDoc encoding)
139         throws IOException JavaDoc {
140         super();
141         this.content = decodeInMemory(buffer, size, responseBodyStart,
142                 encoding);
143      }
144
145     /**
146      * Constructor for overflow-to-disk-file operation.
147      *
148      * @param contentReplayInputStream inputStream of content
149      * @param backingFilename hint for name of temp file
150      * @param characterEncoding Encoding to use reading the stream.
151      * For now, should be java canonical name for the
152      * encoding.
153      *
154      * @throws IOException
155      */

156     public MultiByteReplayCharSequence(
157             ReplayInputStream contentReplayInputStream,
158             String JavaDoc backingFilename,
159             String JavaDoc characterEncoding)
160         throws IOException JavaDoc {
161         super();
162         this.content = decodeToFile(contentReplayInputStream,
163                 backingFilename, characterEncoding);
164     }
165
166     /**
167      * Decode passed buffer and backing file into a CharBuffer.
168      *
169      * This method writes a new file made of the decoded concatenation of
170      * the in-memory prefix buffer and the backing file. Returns a
171      * charSequence view onto this new file.
172      *
173      * @param buffer In-memory buffer of recordings prefix. We read from
174      * here first and will only go to the backing file if <code>size</code>
175      * requested is greater than <code>buffer.length</code>.
176      * @param size Total size of stream to replay in bytes. Used to find
177      * EOS. This is total length of content including HTTP headers if
178      * present.
179      * @param responseBodyStart Where the response body starts in bytes.
180      * Used to skip over the HTTP headers if present.
181      * @param backingFilename Path to backing file with content in excess of
182      * whats in <code>buffer</code>.
183      * @param encoding Encoding to use reading the passed prefix buffer and
184      * backing file. For now, should be java canonical name for the
185      * encoding. (If null is passed, we will default to
186      * ByteReplayCharSequence).
187      *
188      * @return A CharBuffer view on decodings of the contents of passed
189      * buffer.
190      * @throws IOException
191      */

192     private CharBuffer JavaDoc decodeToFile(ReplayInputStream inStream,
193             String JavaDoc backingFilename, String JavaDoc encoding)
194         throws IOException JavaDoc {
195
196         CharBuffer JavaDoc charBuffer = null;
197
198         BufferedReader JavaDoc reader = new BufferedReader JavaDoc(
199                 new InputStreamReader JavaDoc(inStream,encoding));
200         
201         this.decodedFile = new File JavaDoc(backingFilename + "." + WRITE_ENCODING);
202         BufferedWriter JavaDoc writer = new BufferedWriter JavaDoc(
203                 new OutputStreamWriter JavaDoc(
204                         new FileOutputStream JavaDoc(this.decodedFile),
205                         WRITE_ENCODING));
206
207         int c;
208         while((c = reader.read())>=0) {
209             writer.write(c);
210         }
211         writer.close();
212         
213         charBuffer = getReadOnlyMemoryMappedBuffer(this.decodedFile).
214             asCharBuffer();
215
216         return charBuffer;
217     }
218
219     /**
220      * Decode passed buffer into a CharBuffer.
221      *
222      * This method decodes a memory buffer returning a memory buffer.
223      *
224      * @param buffer In-memory buffer of recordings prefix. We read from
225      * here first and will only go to the backing file if <code>size</code>
226      * requested is greater than <code>buffer.length</code>.
227      * @param size Total size of stream to replay in bytes. Used to find
228      * EOS. This is total length of content including HTTP headers if
229      * present.
230      * @param responseBodyStart Where the response body starts in bytes.
231      * Used to skip over the HTTP headers if present.
232      * @param encoding Encoding to use reading the passed prefix buffer and
233      * backing file. For now, should be java canonical name for the
234      * encoding. (If null is passed, we will default to
235      * ByteReplayCharSequence).
236      *
237      * @return A CharBuffer view on decodings of the contents of passed
238      * buffer.
239      */

240     private CharBuffer JavaDoc decodeInMemory(byte[] buffer, long size,
241             long responseBodyStart, String JavaDoc encoding)
242     {
243         ByteBuffer JavaDoc bb = ByteBuffer.wrap(buffer);
244         // Move past the HTTP header if present.
245
bb.position((int)responseBodyStart);
246         // Set the end-of-buffer to be end-of-content.
247
bb.limit((int)size);
248         return (Charset.forName(encoding)).decode(bb).asReadOnlyBuffer();
249     }
250
251     /**
252      * Create read-only memory-mapped buffer onto passed file.
253      *
254      * @param file File to get memory-mapped buffer on.
255      * @return Read-only memory-mapped ByteBuffer view on to passed file.
256      * @throws IOException
257      */

258     private ByteBuffer JavaDoc getReadOnlyMemoryMappedBuffer(File JavaDoc file)
259         throws IOException JavaDoc {
260
261         ByteBuffer JavaDoc bb = null;
262         FileInputStream JavaDoc in = null;
263         FileChannel JavaDoc c = null;
264         assert file.exists(): "No file " + file.getAbsolutePath();
265
266         try {
267             in = new FileInputStream JavaDoc(file);
268             c = in.getChannel();
269             // TODO: Confirm the READ_ONLY works. I recall it not working.
270
// The buffers seem to always say that the buffer is writeable.
271
bb = c.map(FileChannel.MapMode.READ_ONLY, 0, c.size()).
272                 asReadOnlyBuffer();
273         }
274
275         finally {
276             if (c != null && c.isOpen()) {
277                 c.close();
278             }
279             if (in != null) {
280                 in.close();
281             }
282         }
283
284         return bb;
285     }
286
287     private void deleteFile(File JavaDoc fileToDelete) {
288         deleteFile(fileToDelete, null);
289     }
290
291     private void deleteFile(File JavaDoc fileToDelete, final Exception JavaDoc e) {
292         if (e != null) {
293             // Log why the delete to help with debug of java.io.FileNotFoundException:
294
// ....tt53http.ris.UTF-16BE.
295
logger.severe("Deleting " + fileToDelete + " because of "
296                 + e.toString());
297         }
298         if (fileToDelete != null && fileToDelete.exists()) {
299             fileToDelete.delete();
300         }
301     }
302
303     public void close()
304     {
305         this.content = null;
306         deleteFile(this.decodedFile);
307         // clear decodedFile -- so that double-close (as in
308
// finalize()) won't delete a later instance with same name
309
// see bug [ 1218961 ] "failed get of replay" in ExtractorHTML... usu: UTF-16BE
310
this.decodedFile = null;
311     }
312
313     protected void finalize() throws Throwable JavaDoc
314     {
315         super.finalize();
316         // Maybe TODO: eliminate close here, requiring explicit close instead
317
close();
318     }
319
320     public int length()
321     {
322         return this.content.limit();
323     }
324
325     public char charAt(int index)
326     {
327         return this.content.get(index);
328     }
329
330     public CharSequence JavaDoc subSequence(int start, int end) {
331         return new CharSubSequence(this, start, end);
332     }
333     
334     public String JavaDoc toString() {
335         StringBuffer JavaDoc sb = new StringBuffer JavaDoc(length());
336         // could use StringBuffer.append(CharSequence) if willing to do 1.5 & up
337
for (int i = 0;i<length();i++) {
338             sb.append(charAt(i));
339         }
340         return sb.toString();
341     }
342 }
Popular Tags