KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > htmlparser > lexer > InputStreamSource


// HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/InputStreamSource.java,v $
// $Author: derrickoswald $
// $Date: 2005/03/06 21:46:31 $
// $Revision: 1.5 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

27 package org.htmlparser.lexer;
28
29 import java.io.ByteArrayInputStream JavaDoc;
30 import java.io.IOException JavaDoc;
31 import java.io.InputStream JavaDoc;
32 import java.io.InputStreamReader JavaDoc;
33 import java.io.ObjectInputStream JavaDoc;
34 import java.io.ObjectOutputStream JavaDoc;
35 import java.io.UnsupportedEncodingException JavaDoc;
36
37 import org.htmlparser.util.EncodingChangeException;
38 import org.htmlparser.util.ParserException;
39
40 /**
41  * A source of characters based on an InputStream such as from a URLConnection.
42  */

43 public class InputStreamSource
44     extends
45         Source
46 {
47     /**
48      * An initial buffer size.
49      * Has a default value of 16384.
50      */

51     public static int BUFFER_SIZE = 16384;
52
53     /**
54      * The stream of bytes.
55      * Set to <code>null</code> when the source is closed.
56      */

57     protected transient InputStream JavaDoc mStream;
58
59     /**
60      * The character set in use.
61      */

62     protected String JavaDoc mEncoding;
63
64     /**
65      * The converter from bytes to characters.
66      */

67     protected transient InputStreamReader JavaDoc mReader;
68
69     /**
70      * The characters read so far.
71      */

72     public /*volatile*/ char[] mBuffer;
73
74     /**
75      * The number of valid bytes in the buffer.
76      */

77     public /*volatile*/ int mLevel;
78
79     /**
80      * The offset of the next byte returned by read().
81      */

82     public /*volatile*/ int mOffset;
83
84     /**
85      * The bookmark.
86      */

87     protected int mMark;
88
89     /**
90      * Create a source of characters using the default character set.
91      * @param stream The stream of bytes to use.
92      * @exception UnsupportedEncodingException If the default character set is unsupported.
93      */

94     public InputStreamSource (InputStream JavaDoc stream)
95         throws
96             UnsupportedEncodingException JavaDoc
97     {
98         this (stream, null, BUFFER_SIZE);
99     }
100
101     /**
102      * Create a source of characters.
103      * @param stream The stream of bytes to use.
104      * @param charset The character set used in encoding the stream.
105      * @exception UnsupportedEncodingException If the character set is unsupported.
106      */

107     public InputStreamSource (InputStream JavaDoc stream, String JavaDoc charset)
108         throws
109             UnsupportedEncodingException JavaDoc
110     {
111         this (stream, charset, BUFFER_SIZE);
112     }
113
114     /**
115      * Create a source of characters.
116      * @param stream The stream of bytes to use.
117      * @param charset The character set used in encoding the stream.
118      * @param buffer_size The initial character buffer size.
119      * @exception UnsupportedEncodingException If the character set is unsupported.
120      */

121     public InputStreamSource (InputStream JavaDoc stream, String JavaDoc charset, int buffer_size)
122         throws
123             UnsupportedEncodingException JavaDoc
124     {
125         if (null == stream)
126             stream = new Stream (null);
127         else
128             // bug #1044707 mark()/reset() issues
129
if (!stream.markSupported ())
130                 // wrap the stream so we can reset
131
stream = new Stream (stream);
132             // else
133
// just because mark is supported doesn't guarantee
134
// proper reset operation; there is no call to mark
135
// in this code, so if reset misbehaves there is an
136
// appropriate message in setEncoding() to suggest
137
// wraping it in a Stream.
138
// This was deemed better than an attempt to call
139
// reset at this point just to check if we would
140
// succeed later, or to call mark with an arbitrary
141
// lookahead size
142
mStream = stream;
143         if (null == charset)
144         {
145             mReader = new InputStreamReader JavaDoc (stream);
146             mEncoding = mReader.getEncoding ();
147         }
148         else
149         {
150             mEncoding = charset;
151             mReader = new InputStreamReader JavaDoc (stream, charset);
152         }
153         mBuffer = new char[buffer_size];
154         mLevel = 0;
155         mOffset = 0;
156         mMark = -1;
157     }
158
159     //
160
// Serialization support
161
//
162

163     private void writeObject (ObjectOutputStream JavaDoc out)
164         throws
165             IOException JavaDoc
166     {
167         int offset;
168         char[] buffer;
169
170         if (null != mStream)
171         {
172             // remember the offset, drain the input stream, restore the offset
173
offset = mOffset;
174             buffer = new char[4096];
175             while (EOF != read (buffer))
176                 ;
177             mOffset = offset;
178         }
179         
180         out.defaultWriteObject ();
181     }
182
183     private void readObject (ObjectInputStream JavaDoc in)
184         throws
185             IOException JavaDoc,
186             ClassNotFoundException JavaDoc
187     {
188         in.defaultReadObject ();
189         if (null != mBuffer) // buffer is null when destroy's been called
190
// pretend we're open, mStream goes null when exhausted
191
mStream = new ByteArrayInputStream JavaDoc (new byte[0]);
192     }
193
194     /**
195      * Get the input stream being used.
196      * @return The current input stream.
197      */

198     public InputStream JavaDoc getStream ()
199     {
200         return (mStream);
201     }
202
203     /**
204      * Get the encoding being used to convert characters.
205      * @return The current encoding.
206      */

207     public String JavaDoc getEncoding ()
208     {
209         return (mEncoding);
210     }
211
212     /**
213      * Begins reading from the source with the given character set.
214      * If the current encoding is the same as the requested encoding,
215      * this method is a no-op. Otherwise any subsequent characters read from
216      * this page will have been decoded using the given character set.<p>
217      * Some magic happens here to obtain this result if characters have already
218      * been consumed from this source.
219      * Since a Reader cannot be dynamically altered to use a different character
220      * set, the underlying stream is reset, a new Source is constructed
221      * and a comparison made of the characters read so far with the newly
222      * read characters up to the current position.
223      * If a difference is encountered, or some other problem occurs,
224      * an exception is thrown.
225      * @param character_set The character set to use to convert bytes into
226      * characters.
227      * @exception ParserException If a character mismatch occurs between
228      * characters already provided and those that would have been returned
229      * had the new character set been in effect from the beginning. An
230      * exception is also thrown if the underlying stream won't put up with
231      * these shenanigans.
232      */

233     public void setEncoding (String JavaDoc character_set)
234         throws
235             ParserException
236     {
237         String JavaDoc encoding;
238         InputStream JavaDoc stream;
239         char[] buffer;
240         int offset;
241         char[] new_chars;
242
243         encoding = getEncoding ();
244         if (!encoding.equalsIgnoreCase (character_set))
245         {
246             stream = getStream ();
247             try
248             {
249                 buffer = mBuffer;
250                 offset = mOffset;
251                 stream.reset ();
252                 try
253                 {
254                     mEncoding = character_set;
255                     mReader = new InputStreamReader JavaDoc (stream, character_set);
256                     mBuffer = new char[mBuffer.length];
257                     mLevel = 0;
258                     mOffset = 0;
259                     mMark = -1;
260                     if (0 != offset)
261                     {
262                         new_chars = new char[offset];
263                         if (offset != read (new_chars))
264                             throw new ParserException ("reset stream failed");
265                         for (int i = 0; i < offset; i++)
266                             if (new_chars[i] != buffer[i])
267                                 throw new EncodingChangeException ("character mismatch (new: "
268                                 + new_chars[i]
269                                 + " [0x"
270                                 + Integer.toString (new_chars[i], 16)
271                                 + "] != old: "
272                                 + " [0x"
273                                 + Integer.toString (buffer[i], 16)
274                                 + buffer[i]
275                                 + "]) for encoding change from "
276                                 + encoding
277                                 + " to "
278                                 + character_set
279                                 + " at character offset "
280                                 + i);
281                     }
282                 }
283                 catch (IOException JavaDoc ioe)
284                 {
285                     throw new ParserException (ioe.getMessage (), ioe);
286                 }
287             }
288             catch (IOException JavaDoc ioe)
289             { // bug #1044707 mark()/reset() issues
290
throw new ParserException ("Stream reset failed ("
291                     + ioe.getMessage ()
292                     + "), try wrapping it with a org.htmlparser.lexer.Stream",
293                     ioe);
294             }
295         }
296     }
297
298     /**
299      * Fetch more characters from the underlying reader.
300      * Has no effect if the underlying reader has been drained.
301      * @param min The minimum to read.
302      * @exception IOException If the underlying reader read() throws one.
303      */

304     protected void fill (int min)
305         throws
306             IOException JavaDoc
307     {
308         char[] buffer;
309         int size;
310         int read;
311
312         if (null != mReader) // mReader goes null when it's been sucked dry
313
{
314             size = mBuffer.length - mLevel; // available space
315
if (size < min) // oops, better get some buffer space
316
{
317                 // unknown length... keep doubling
318
size = mBuffer.length * 2;
319                 read = mLevel + min;
320                 if (size < read) // or satisfy min, whichever is greater
321
size = read;
322                 else
323                     min = size - mLevel; // read the max
324
buffer = new char[size];
325             }
326             else
327             {
328                 buffer = mBuffer;
329                 min = size;
330             }
331
332             // read into the end of the 'new' buffer
333
read = mReader.read (buffer, mLevel, min);
334             if (EOF == read)
335             {
336                 mReader.close ();
337                 mReader = null;
338             }
339             else
340             {
341                 if (mBuffer != buffer)
342                 { // copy the bytes previously read
343
System.arraycopy (mBuffer, 0, buffer, 0, mLevel);
344                     mBuffer = buffer;
345                 }
346                 mLevel += read;
347             }
348             // todo, should repeat on read shorter than original min
349
}
350     }
351
352     //
353
// Reader overrides
354
//
355

356     /**
357      * Does nothing.
358      * It's supposed to close the source, but use destroy() instead.
359      * @see #destroy
360      */

361     public void close () throws IOException JavaDoc
362     {
363     }
364
365     /**
366      * Read a single character.
367      * This method will block until a character is available,
368      * an I/O error occurs, or the end of the stream is reached.
369      * @return The character read, as an integer in the range 0 to 65535
370      * (<tt>0x00-0xffff</tt>), or {@link #EOF EOF} if the end of the stream has
371      * been reached
372      * @exception IOException If an I/O error occurs.
373      */

374     public int read () throws IOException JavaDoc
375     {
376         int ret;
377
378         if (mLevel - mOffset < 1)
379         {
380             if (null == mStream)
381                 throw new IOException JavaDoc ("source is closed");
382             fill (1);
383             if (mOffset >= mLevel)
384                 ret = EOF;
385             else
386                 ret = mBuffer[mOffset++];
387         }
388         else
389             ret = mBuffer[mOffset++];
390
391         return (ret);
392     }
393
394     /**
395      * Read characters into a portion of an array. This method will block
396      * until some input is available, an I/O error occurs, or the end of the
397      * stream is reached.
398      * @param cbuf Destination buffer
399      * @param off Offset at which to start storing characters
400      * @param len Maximum number of characters to read
401      * @return The number of characters read, or {@link #EOF EOF} if the end of
402      * the stream has been reached
403      * @exception IOException If an I/O error occurs.
404      */

405     public int read (char[] cbuf, int off, int len) throws IOException JavaDoc
406     {
407         int ret;
408
409         if (null == mStream)
410             throw new IOException JavaDoc ("source is closed");
411         if ((null == cbuf) || (0 > off) || (0 > len))
412             throw new IOException JavaDoc ("illegal argument read ("
413                 + ((null == cbuf) ? "null" : "cbuf")
414                 + ", " + off + ", " + len + ")");
415         if (mLevel - mOffset < len)
416             fill (len - (mLevel - mOffset)); // minimum to satisfy this request
417
if (mOffset >= mLevel)
418             ret = EOF;
419         else
420         {
421             ret = Math.min (mLevel - mOffset, len);
422             System.arraycopy (mBuffer, mOffset, cbuf, off, ret);
423             mOffset += ret;
424         }
425
426         return (ret);
427     }
428
429     /**
430      * Read characters into an array.
431      * This method will block until some input is available, an I/O error occurs,
432      * or the end of the stream is reached.
433      * @param cbuf Destination buffer.
434      * @return The number of characters read, or {@link #EOF EOF} if the end of
435      * the stream has been reached.
436      * @exception IOException If an I/O error occurs.
437      */

438     public int read (char[] cbuf) throws IOException JavaDoc
439     {
440         return (read (cbuf, 0, cbuf.length));
441     }
442
443     /**
444      * Reset the source.
445      * Repositions the read point to begin at zero.
446      * @exception IllegalStateException If the source has been closed.
447      */

448     public void reset ()
449     {
450         if (null == mStream)
451             throw new IllegalStateException JavaDoc ("source is closed");
452         if (-1 != mMark)
453             mOffset = mMark;
454         else
455             mOffset = 0;
456     }
457
458     /**
459      * Tell whether this source supports the mark() operation.
460      * @return <code>true</code>.
461      */

462     public boolean markSupported ()
463     {
464         return (true);
465     }
466
467     /**
468      * Mark the present position in the source.
469      * Subsequent calls to {@link #reset()}
470      * will attempt to reposition the source to this point.
471      * @param readAheadLimit <em>Not used.</em>
472      * @exception IOException If the source is closed.
473      *
474      */

475     public void mark (int readAheadLimit) throws IOException JavaDoc
476     {
477         if (null == mStream)
478             throw new IOException JavaDoc ("source is closed");
479         mMark = mOffset;
480     }
481
482     /**
483      * Tell whether this source is ready to be read.
484      * @return <code>true</code> if the next read() is guaranteed not to block
485      * for input, <code>false</code> otherwise.
486      * Note that returning false does not guarantee that the next read will block.
487      * @exception IOException If the source is closed.
488      */

489     public boolean ready () throws IOException JavaDoc
490     {
491         if (null == mStream)
492             throw new IOException JavaDoc ("source is closed");
493         return (mOffset < mLevel);
494     }
495
496     /**
497      * Skip characters.
498      * This method will block until some characters are available,
499      * an I/O error occurs, or the end of the stream is reached.
500      * <em>Note: n is treated as an int</em>
501      * @param n The number of characters to skip.
502      * @return The number of characters actually skipped
503      * @exception IllegalArgumentException If <code>n</code> is negative.
504      * @exception IOException If an I/O error occurs.
505      */

506     public long skip (long n) throws IOException JavaDoc
507     {
508         long ret;
509
510         if (null == mStream)
511             throw new IOException JavaDoc ("source is closed");
512         if (mLevel - mOffset < n)
513             fill ((int)(n - (mLevel - mOffset))); // minimum to satisfy this request
514
if (mOffset >= mLevel)
515             ret = EOF;
516         else
517         {
518             ret = Math.min (mLevel - mOffset, n);
519             mOffset += ret;
520         }
521
522         return (ret);
523     }
524
525     //
526
// Methods not in your Daddy's Reader
527
//
528

529     /**
530      * Undo the read of a single character.
531      * @exception IOException If the source is closed or no characters have
532      * been read.
533      */

534     public void unread () throws IOException JavaDoc
535     {
536         if (null == mStream)
537             throw new IOException JavaDoc ("source is closed");
538         if (0 < mOffset)
539             mOffset--;
540         else
541             throw new IOException JavaDoc ("can't unread no characters");
542     }
543
544     /**
545      * Retrieve a character again.
546      * @param offset The offset of the character.
547      * @return The character at <code>offset</code>.
548      * @exception IOException If the offset is beyond {@link #offset()} or the
549      * source is closed.
550      */

551     public char getCharacter (int offset) throws IOException JavaDoc
552     {
553         char ret;
554
555         if (null == mStream)
556             throw new IOException JavaDoc ("source is closed");
557         if (offset >= mBuffer.length)
558             throw new IOException JavaDoc ("illegal read ahead");
559         else
560             ret = mBuffer[offset];
561         
562         return (ret);
563     }
564
565     /**
566      * Retrieve characters again.
567      * @param array The array of characters.
568      * @param offset The starting position in the array where characters are to be placed.
569      * @param start The starting position, zero based.
570      * @param end The ending position
571      * (exclusive, i.e. the character at the ending position is not included),
572      * zero based.
573      * @exception IOException If the start or end is beyond {@link #offset()}
574      * or the source is closed.
575      */

576     public void getCharacters (char[] array, int offset, int start, int end) throws IOException JavaDoc
577     {
578         if (null == mStream)
579             throw new IOException JavaDoc ("source is closed");
580         System.arraycopy (mBuffer, start, array, offset, end - start);
581     }
582     
583     /**
584      * Retrieve a string.
585      * @param offset The offset of the first character.
586      * @param length The number of characters to retrieve.
587      * @return A string containing the <code>length</code> characters at <code>offset</code>.
588      * @exception IOException If the offset or (offset + length) is beyond
589      * {@link #offset()} or the source is closed.
590      */

591     public String JavaDoc getString (int offset, int length) throws IOException JavaDoc
592     {
593         String JavaDoc ret;
594
595         if (null == mStream)
596             throw new IOException JavaDoc ("source is closed");
597         if (offset + length >= mBuffer.length)
598             throw new IOException JavaDoc ("illegal read ahead");
599         else
600             ret = new String JavaDoc (mBuffer, offset, length);
601         
602         return (ret);
603     }
604
605     /**
606      * Append characters already read into a <code>StringBuffer</code>.
607      * @param buffer The buffer to append to.
608      * @param offset The offset of the first character.
609      * @param length The number of characters to retrieve.
610      * @exception IOException If the offset or (offset + length) is beyond
611      * {@link #offset()} or the source is closed.
612      */

613     public void getCharacters (StringBuffer JavaDoc buffer, int offset, int length) throws IOException JavaDoc
614     {
615         if (null == mStream)
616             throw new IOException JavaDoc ("source is closed");
617         buffer.append (mBuffer, offset, length);
618     }
619
620     /**
621      * Close the source.
622      * Once a source has been closed, further {@link #read() read},
623      * {@link #ready ready}, {@link #mark mark}, {@link #reset reset},
624      * {@link #skip skip}, {@link #unread unread},
625      * {@link #getCharacter getCharacter} or {@link #getString getString}
626      * invocations will throw an IOException.
627      * Closing a previously-closed source, however, has no effect.
628      * @exception IOException If an I/O error occurs
629      */

630     public void destroy () throws IOException JavaDoc
631     {
632         mStream = null;
633         if (null != mReader)
634             mReader.close ();
635         mReader = null;
636         mBuffer = null;
637         mLevel = 0;
638         mOffset = 0;
639         mMark = -1;
640     }
641
642     /**
643      * Get the position (in characters).
644      * @return The number of characters that have already been read, or
645      * {@link #EOF EOF} if the source is closed.
646      */

647     public int offset ()
648     {
649         int ret;
650
651         if (null == mStream)
652             ret = EOF;
653         else
654             ret = mOffset;
655
656         return (ret);
657     }
658
659     /**
660      * Get the number of available characters.
661      * @return The number of characters that can be read without blocking or
662      * zero if the source is closed.
663      */

664     public int available ()
665     {
666         int ret;
667
668         if (null == mStream)
669             ret = 0;
670         else
671             ret = mLevel - mOffset;
672
673         return (ret);
674     }
675 }
676
Popular Tags