Page


1   // HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
2   // http://sourceforge.org/projects/htmlparser
3   // Copyright (C) 2004 Derrick Oswald
4   //
5   // Revision Control Information
6   //
7   // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Page.java,v $
8   // $Author: derrickoswald $
9   // $Date: 2005/03/13 14:51:43 $
10  // $Revision: 1.48 $
11  //
12  // This library is free software; you can redistribute it and/or
13  // modify it under the terms of the GNU Lesser General Public
14  // License as published by the Free Software Foundation; either
15  // version 2.1 of the License, or (at your option) any later version.
16  //
17  // This library is distributed in the hope that it will be useful,
18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  // Lesser General Public License for more details.
21  //
22  // You should have received a copy of the GNU Lesser General Public
23  // License along with this library; if not, write to the Free Software
24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25  //
26  
27  package org.htmlparser.lexer;
28  
29  import java.io.InputStream  ;
30  import java.io.IOException  ;
31  import java.io.ObjectInputStream  ;
32  import java.io.ObjectOutputStream  ;
33  import java.io.Serializable  ;
34  import java.io.UnsupportedEncodingException  ;
35  import java.lang.reflect.InvocationTargetException  ;
36  import java.lang.reflect.Method  ;
37  import java.net.MalformedURLException  ;
38  import java.net.URL  ;
39  import java.net.URLConnection  ;
40  import java.net.UnknownHostException  ;
41  import java.util.zip.GZIPInputStream  ;
42  import java.util.zip.InflaterInputStream  ;
43  
44  import org.htmlparser.http.ConnectionManager;
45  import org.htmlparser.util.ParserException;
46  
47  /**
48   * Represents the contents of an HTML page.
49   * Contains the source of characters and an index of positions of line
50   * separators (actually the first character position on the next line).
51   */
52  public class Page
53      implements
54          Serializable  
55  {
56      /**
57       * The default charset.
58       * This should be <code>{@value}</code>,
59       * see RFC 2616 (http://www.ietf.org/rfc/rfc2616.txt?number=2616) section 3.7.1
60       * Another alias is "8859_1".
61       */
62      public static final String   DEFAULT_CHARSET = "ISO-8859-1";
63  
64      /**
65       * The default content type.
66       * In the absence of alternate information, assume html content ({@value}).
67       */
68      public static final String   DEFAULT_CONTENT_TYPE = "text/html";
69  
70      /**
71       * Character value when the page is exhausted.
72       * Has a value of {@value}.
73       */
74      public static final char EOF = (char)Source.EOF;
75  
76      /**
77       * The URL this page is coming from.
78       * Cached value of <code>getConnection().toExternalForm()</code> or
79       * <code>setUrl()</code>.
80       */
81      protected String   mUrl;
82  
83      /**
84       * The base URL for this page.
85       */
86      protected String   mBaseUrl;
87  
88      /**
89       * The source of characters.
90       */
91      protected Source mSource;
92  
93      /**
94       * Character positions of the first character in each line.
95       */
96      protected PageIndex mIndex;
97      
98      /**
99       * The connection this page is coming from or <code>null</code>.
100      */
101     protected transient URLConnection   mConnection;
102 
103     /**
104      * Connection control (proxy, cookies, authorization).
105      */
106     public static ConnectionManager mConnectionManager = new ConnectionManager ();
107 
108     /**
109      * Construct an empty page.
110      */
111     public Page ()
112     {
113         this ("");
114     }
115 
116     /**
117      * Construct a page reading from a URL connection.
118      * @param connection A fully conditioned connection. The connect()
119      * method will be called so it need not be connected yet.
120      * @exception ParserException An exception object wrapping a number of
121      * possible error conditions, some of which are outlined below.
122      * <li>IOException If an i/o exception occurs creating the
123      * source.</li>
124      * <li>UnsupportedEncodingException if the character set specified in the
125      * HTTP header is not supported.</li>
126      */
127     public Page (URLConnection   connection) throws ParserException
128     {
129         if (null == connection)
130             throw new IllegalArgumentException   ("connection cannot be null");
131         setConnection (connection);
132         mBaseUrl = null;
133     }
134 
135     /**
136      * Construct a page from a stream encoded with the given charset.
137      * @param stream The source of bytes.
138      * @param charset The encoding used.
139      * If null, defaults to the <code>DEFAULT_CHARSET</code>.
140      * @exception UnsupportedEncodingException If the given charset is not supported.
141      */
142     public Page (InputStream   stream, String   charset)
143         throws
144             UnsupportedEncodingException  
145     {
146         if (null == stream)
147             throw new IllegalArgumentException   ("stream cannot be null");
148         if (null == charset)
149             charset = DEFAULT_CHARSET;
150         mSource = new InputStreamSource (stream, charset);
151         mIndex = new PageIndex (this);
152         mConnection = null;
153         mUrl = null;
154         mBaseUrl = null;
155     }
156 
157     /**
158      * Construct a page from the given string.
159      * @param text The HTML text.
160      * @param charset <em>Optional</em>. The character set encoding that will
161      * be reported by {@link #getEncoding}. If charset is <code>null</code>
162      * the default character set is used.
163      */
164     public Page (String   text, String   charset)
165     {
166         if (null == text)
167             throw new IllegalArgumentException   ("text cannot be null");
168         if (null == charset)
169             charset = DEFAULT_CHARSET;
170         mSource = new StringSource (text, charset);
171         mIndex = new PageIndex (this);
172         mConnection = null;
173         mUrl = null;
174         mBaseUrl = null;
175     }
176 
177     /**
178      * Construct a page from the given string.
179      * The page will report that it is using an encoding of
180      * {@link #DEFAULT_CHARSET}.
181      * @param text The HTML text.
182      */
183     public Page (String   text)
184     {
185         this (text, null);
186     }
187 
188     //
189     // static methods
190     //
191 
192     /**
193      * Get the connection manager all Parsers use.
194      * @return The connection manager.
195      */
196     public static ConnectionManager getConnectionManager ()
197     {
198         return (mConnectionManager);
199     }
200 
201     /**
202      * Set the connection manager to use.
203      * @param manager The new connection manager.
204      */
205     public static void setConnectionManager (ConnectionManager manager)
206     {
207         mConnectionManager = manager;
208     }
209 
210     /**
211      * Get a CharacterSet name corresponding to a charset parameter.
212      * @param content A text line of the form:
213      * <pre>
214      * text/html; charset=Shift_JIS
215      * </pre>
216      * which is applicable both to the HTTP header field Content-Type and
217      * the meta tag http-equiv="Content-Type".
218      * Note this method also handles non-compliant quoted charset directives such as:
219      * <pre>
220      * text/html; charset="UTF-8"
221      * </pre>
222      * and
223      * <pre>
224      * text/html; charset='UTF-8'
225      * </pre>
226      * @return The character set name to use when reading the input stream.
227      * For JDKs that have the Charset class this is qualified by passing
228      * the name to findCharset() to render it into canonical form.
229      * If the charset parameter is not found in the given string, the default
230      * character set is returned.
231      * @see #findCharset
232      * @see #DEFAULT_CHARSET
233      */
234     public static String   getCharset (String   content)
235     {
236         final String   CHARSET_STRING = "charset";
237         int index;
238         String   ret;
239 
240         ret = DEFAULT_CHARSET;
241         if (null != content)
242         {
243             index = content.indexOf (CHARSET_STRING);
244 
245             if (index != -1)
246             {
247                 content = content.substring (index + CHARSET_STRING.length ()).trim ();
248                 if (content.startsWith ("="))
249                 {
250                     content = content.substring (1).trim ();
251                     index = content.indexOf (";");
252                     if (index != -1)
253                         content = content.substring (0, index);
254 
255                     //remove any double quotes from around charset string
256                     if (content.startsWith ("\"") && content.endsWith ("\"") && (1 < content.length ()))
257                         content = content.substring (1, content.length () - 1);
258 
259                     //remove any single quote from around charset string
260                     if (content.startsWith ("'") && content.endsWith ("'") && (1 < content.length ()))
261                         content = content.substring (1, content.length () - 1);
262 
263                     ret = findCharset (content, ret);
264 
265                     // Charset names are not case-sensitive;
266                     // that is, case is always ignored when comparing charset names.
267 //                    if (!ret.equalsIgnoreCase (content))
268 //                    {
269 //                        System.out.println (
270 //                            "detected charset \""
271 //                            + content
272 //                            + "\", using \""
273 //                            + ret
274 //                            + "\"");
275 //                    }
276                 }
277             }
278         }
279 
280         return (ret);
281     }
282 
283     /**
284      * Lookup a character set name.
285      * <em>Vacuous for JVM's without <code>java.nio.charset</code>.</em>
286      * This uses reflection so the code will still run under prior JDK's but
287      * in that case the default is always returned.
288      * @param name The name to look up. One of the aliases for a character set.
289      * @param _default The name to return if the lookup fails.
290      */
291     public static String   findCharset (String   name, String   _default)
292     {
293         String   ret;
294 
295         try
296         {
297             Class   cls;
298             Method   method;
299             Object   object;
300 
301             cls = Class.forName ("java.nio.charset.Charset");
302             method = cls.getMethod ("forName", new Class  [] { String  .class });
303             object = method.invoke (null, new Object  [] { name });
304             method = cls.getMethod ("name", new Class  [] { });
305             object = method.invoke (object, new Object  [] { });
306             ret = (String  )object;
307         }
308         catch (ClassNotFoundException   cnfe)
309         {
310             // for reflection exceptions, assume the name is correct
311             ret = name;
312         }
313         catch (NoSuchMethodException   nsme)
314         {
315             // for reflection exceptions, assume the name is correct
316             ret = name;
317         }
318         catch (IllegalAccessException   ia)
319         {
320             // for reflection exceptions, assume the name is correct
321             ret = name;
322         }
323         catch (InvocationTargetException   ita)
324         {
325             // java.nio.charset.IllegalCharsetNameException
326             // and java.nio.charset.UnsupportedCharsetException
327             // return the default
328             ret = _default;
329             System.out.println (
330                 "unable to determine cannonical charset name for "
331                 + name
332                 + " - using "
333                 + _default);
334         }
335 
336         return (ret);
337     }
338 
339     //
340     // Serialization support
341     //
342 
343     /**
344      * Serialize the page.
345      * There are two modes to serializing a page based on the connected state.
346      * If connected, the URL and the current offset is saved, while if
347      * disconnected, the underling source is saved.
348      * @param out The object stream to store this object in.
349      */
350     private void writeObject (ObjectOutputStream   out)
351         throws
352             IOException  
353     {
354         String   href;
355         Source source;
356         PageIndex index;
357 
358         // two cases, reading from a URL and not
359         if (null != getConnection ())
360         {
361             out.writeBoolean (true);
362             out.writeInt (mSource.offset ()); // need to preread this much
363             href = getUrl ();
364             out.writeObject (href);
365             setUrl (getConnection ().getURL ().toExternalForm ());
366             source = getSource ();
367             mSource = null; // don't serialize the source if we can avoid it
368             index = mIndex;
369             mIndex = null; // will get recreated; valid for the new page anyway?
370             out.defaultWriteObject ();
371             mSource = source;
372             mIndex = index;
373         }
374         else
375         {
376             out.writeBoolean (false);
377             href = getUrl ();
378             out.writeObject (href);
379             setUrl (null); // don't try and read a bogus URL
380             out.defaultWriteObject ();
381             setUrl (href);
382         }
383     }
384 
385     /**
386      * Deserialize the page.
387      * For details see <code>writeObject()</code>.
388      * @param in The object stream to decode.
389      */
390     private void readObject (ObjectInputStream   in)
391         throws
392             IOException  ,
393             ClassNotFoundException  
394     {
395         boolean fromurl;
396         int offset;
397         String   href;
398         URL   url;
399         Cursor cursor;
400 
401         fromurl = in.readBoolean ();
402         if (fromurl)
403         {
404             offset = in.readInt ();
405             href = (String  )in.readObject ();
406             in.defaultReadObject ();
407             // open the URL
408             if (null != getUrl ())
409             {
410                 url = new URL   (getUrl ());
411                 try
412                 {
413                     setConnection (url.openConnection ());
414                 }
415                 catch (ParserException pe)
416                 {
417                     throw new IOException   (pe.getMessage ());
418                 }
419             }
420             cursor = new Cursor (this, 0);
421             for (int i = 0; i < offset; i++)
422                 try
423                 {
424                     getCharacter (cursor);
425                 }
426                 catch (ParserException pe)
427                 {
428                     throw new IOException   (pe.getMessage ());
429                 }
430             setUrl (href);
431         }
432         else
433         {
434             href = (String  )in.readObject ();
435             in.defaultReadObject ();
436             setUrl (href);
437         }
438     }
439 
440     /**
441      * Reset the page by resetting the source of characters.
442      */
443     public void reset ()
444     {
445         getSource ().reset ();
446         mIndex = new PageIndex (this); // todo: is this really necessary?
447     }
448 
449     /**
450      * Close the page by destroying the source of characters.
451      */
452     public void close () throws IOException  
453     {
454         if (null != getSource ())
455             getSource ().destroy ();
456     }
457 
458     /**
459      * Clean up this page, releasing resources.
460      * Calls <code>close()</code>.
461      * @exception Throwable if <code>close()</code> throws an <code>IOException</code>.
462      */
463     protected void finalize () throws Throwable  
464     {
465         close ();
466     }
467     
468     /**
469      * Get the connection, if any.
470      * @return The connection object for this page, or null if this page
471      * is built from a stream or a string.
472      */
473     public URLConnection   getConnection ()
474     {
475         return (mConnection);
476     }
477 
478     /**
479      * Set the URLConnection to be used by this page.
480      * Starts reading from the given connection.
481      * This also resets the current url.
482      * @param connection The connection to use.
483      * It will be connected by this method.
484      * @exception ParserException If the <code>connect()</code> method fails,
485      * or an I/O error occurs opening the input stream or the character set
486      * designated in the HTTP header is unsupported.
487      */
488     public void setConnection (URLConnection   connection)
489         throws
490             ParserException
491     {
492         Stream stream;
493         String   type;
494         String   charset;
495         String   contentEncoding;
496 
497         mConnection = connection;
498         try
499         {
500             getConnection ().connect ();
501         }
502         catch (UnknownHostException   uhe)
503         {
504             throw new ParserException ("Connect to " + mConnection.getURL ().toExternalForm () + " failed.", uhe);
505         }
506         catch (IOException   ioe)
507         {
508             throw new ParserException ("Exception connecting to " + mConnection.getURL ().toExternalForm () + " (" + ioe.getMessage () + ").", ioe);
509         }
510         type = getContentType ();
511         charset = getCharset (type);
512         try
513         {
514             contentEncoding = connection.getContentEncoding();
515             if ((null != contentEncoding) && (-1 != contentEncoding.indexOf ("gzip")))
516             {
517                 stream = new Stream (new GZIPInputStream   (getConnection ().getInputStream ()));
518             }
519             else if ((null != contentEncoding) && (-1 != contentEncoding.indexOf ("deflate")))
520             {
521                 stream = new Stream (new InflaterInputStream   (getConnection ().getInputStream ()));
522             }
523             else
524             {
525                 stream = new Stream (getConnection ().getInputStream ());
526             }
527 
528             try
529             {
530                 mSource = new InputStreamSource (stream, charset);
531             }
532             catch (UnsupportedEncodingException   uee)
533             {
534 //                StringBuffer msg;
535 //
536 //                msg = new StringBuffer (1024);
537 //                msg.append (getConnection ().getURL ().toExternalForm ());
538 //                msg.append (" has an encoding (");
539 //                msg.append (charset);
540 //                msg.append (") which is not supported, using ");
541 //                msg.append (DEFAULT_CHARSET);
542 //                System.out.println (msg.toString ());
543                 charset = DEFAULT_CHARSET;
544                 mSource = new InputStreamSource (stream, charset);
545             }
546         }
547         catch (IOException   ioe)
548         {
549             throw new ParserException ("Exception getting input stream from " + mConnection.getURL ().toExternalForm () + " (" + ioe.getMessage () + ").", ioe);
550         }
551         mUrl = connection.getURL ().toExternalForm ();
552         mIndex = new PageIndex (this);
553     }
554 
555     /**
556      * Get the URL for this page.
557      * This is only available if the page has a connection
558      * (<code>getConnection()</code> returns non-null), or the document base has
559      * been set via a call to <code>setUrl()</code>.
560      * @return The url for the connection, or <code>null</code> if there is
561      * no conenction or the document base has not been set.
562      */
563     public String   getUrl ()
564     {
565         return (mUrl);
566     }
567 
568     /**
569      * Set the URL for this page.
570      * This doesn't affect the contents of the page, just the interpretation
571      * of relative links from this point forward.
572      * @param url The new URL.
573      */
574     public void setUrl (String   url)
575     {
576         mUrl = url;
577     }
578 
579     /**
580      * Gets the baseUrl.
581      * @return The base URL for this page, or <code>null</code> if not set.
582      */
583     public String   getBaseUrl ()
584     {
585         return (mBaseUrl);
586     }
587 
588     /**
589      * Sets the baseUrl.
590      * @param url The base url for this page.
591      */
592     public void setBaseUrl (String   url)
593     {
594         mBaseUrl = url;
595     }
596     
597     /**
598      * Get the source this page is reading from.
599      */
600     public Source getSource ()
601     {
602         return (mSource);
603     }
604 
605     /**
606      * Try and extract the content type from the HTTP header.
607      * @return The content type.
608      */
609     public String   getContentType ()
610     {
611         URLConnection   connection;
612         String   content;
613         String   ret;
614 
615         ret = DEFAULT_CONTENT_TYPE;
616         connection = getConnection ();
617         if (null != connection)
618         {
619             content = connection.getContentType ();
620             if (null != content)
621                 ret = content;
622         }
623 
624         return (ret);
625     }
626 
627     /**
628      * Read the character at the cursor position.
629      * The cursor position can be behind or equal to the current source position.
630      * Returns end of lines (EOL) as \n, by converting \r and \r\n to \n,
631      * and updates the end-of-line index accordingly
632      * Advances the cursor position by one (or two in the \r\n case).
633      * @param cursor The position to read at.
634      * @return The character at that position, and modifies the cursor to
635      * prepare for the next read. If the source is exhausted a zero is returned.
636      * @exception ParserException If an IOException on the underlying source
637      * occurs, or an attemp is made to read characters in the future (the
638      * cursor position is ahead of the underlying stream)
639      */
640     public char getCharacter (Cursor cursor)
641         throws
642             ParserException
643     {
644         int i;
645         char ret;
646 
647         i = cursor.getPosition ();
648         if (mSource.offset () < i)
649             // hmmm, we could skip ahead, but then what about the EOL index
650             throw new ParserException ("attempt to read future characters from source");
651         else if (mSource.offset () == i)
652             try
653             {
654                 i = mSource.read ();
655                 if (Source.EOF == i)
656                     ret = EOF;
657                 else
658                 {
659                     ret = (char)i;
660                     cursor.advance ();
661                 }
662             }
663             catch (IOException   ioe)
664             {
665                 throw new ParserException (
666                     "problem reading a character at position "
667                     + cursor.getPosition (), ioe);
668             }
669         else
670         {
671             // historic read
672             try
673             {
674                 ret = mSource.getCharacter (i);
675             }
676             catch (IOException   ioe)
677             {
678                 throw new ParserException (
679                     "can't read a character at position "
680                     + i, ioe);
681             }
682             cursor.advance ();
683         }
684 
685         // handle \r
686         if ('\r' == ret)
687         {   // switch to single character EOL
688             ret = '\n';
689 
690             // check for a \n in the next position
691             if (mSource.offset () == cursor.getPosition ())
692                 try
693                 {
694                     i = mSource.read ();
695                     if (Source.EOF == i)
696                     {
697                         // do nothing
698                     }
699                     else if ('\n' == (char)i)
700                         cursor.advance ();
701                     else
702                         try
703                         {
704                             mSource.unread ();
705                         }
706                         catch (IOException   ioe)
707                         {
708                             throw new ParserException (
709                                 "can't unread a character at position "
710                                 + cursor.getPosition (), ioe);
711                         }
712                 }
713                 catch (IOException   ioe)
714                 {
715                     throw new ParserException (
716                         "problem reading a character at position "
717                         + cursor.getPosition (), ioe);
718                 }
719             else
720                 try
721                 {
722                     if ('\n' == mSource.getCharacter (cursor.getPosition ()))
723                         cursor.advance ();
724                 }
725                 catch (IOException   ioe)
726                 {
727                     throw new ParserException (
728                         "can't read a character at position "
729                         + cursor.getPosition (), ioe);
730                 }
731         }
732         if ('\n' == ret)
733             // update the EOL index in any case
734             mIndex.add (cursor);
735 
736         return (ret);
737     }
738 
739     /**
740      * Get the current encoding being used.
741      * @return The encoding used to convert characters.
742      */
743     public String   getEncoding ()
744     {
745         return (getSource ().getEncoding ());
746     }
747 
748     /**
749      * Begins reading from the source with the given character set.
750      * If the current encoding is the same as the requested encoding,
751      * this method is a no-op. Otherwise any subsequent characters read from
752      * this page will have been decoded using the given character set.<p>
753      * Some magic happens here to obtain this result if characters have already
754      * been consumed from this page.
755      * Since a Reader cannot be dynamically altered to use a different character
756      * set, the underlying stream is reset, a new Source is constructed
757      * and a comparison made of the characters read so far with the newly
758      * read characters up to the current position.
759      * If a difference is encountered, or some other problem occurs,
760      * an exception is thrown. 
761      * @param character_set The character set to use to convert bytes into
762      * characters.
763      * @exception ParserException If a character mismatch occurs between
764      * characters already provided and those that would have been returned
765      * had the new character set been in effect from the beginning. An
766      * exception is also thrown if the underlying stream won't put up with
767      * these shenanigans.
768      */
769     public void setEncoding (String   character_set)
770         throws
771             ParserException
772     {
773         getSource ().setEncoding (character_set);
774     }
775 
776     /**
777      * Build a URL from the link and base provided.
778      * @param link The (relative) URI.
779      * @param base The base URL of the page, either from the &lt;BASE&gt; tag
780      * or, if none, the URL the page is being fetched from.
781      * @return An absolute URL.
782      */
783     public URL   constructUrl (String   link, String   base)
784         throws MalformedURLException  
785     {
786         String   path;
787         boolean modified;
788         boolean absolute;
789         int index;
790         URL   url; // constructed URL combining relative link and base
791 
792         url = new URL   (new URL   (base), link);
793         path = url.getFile ();
794         modified = false;
795         absolute = link.startsWith ("/");
796         if (!absolute)
797         {   // we prefer to fix incorrect relative links
798             // this doesn't fix them all, just the ones at the start
799             while (path.startsWith ("/."))
800             {
801                 if (path.startsWith ("/../"))
802                 {
803                     path = path.substring (3);
804                     modified = true;
805                 }
806                 else if (path.startsWith ("/./") || path.startsWith("/."))
807                 {
808                     path = path.substring (2);
809                     modified = true;
810                 }
811                 else
812                     break;
813             }
814         }
815         // fix backslashes
816         while (-1 != (index = path.indexOf ("/\\")))
817         {
818             path = path.substring (0, index + 1) + path.substring (index + 2);
819             modified = true;
820         }
821         if (modified)
822             url = new URL   (url, path);
823 
824         return (url);
825     }
826 
827     /**
828      * Create an absolute URL from a relative link.
829      * @param link The reslative portion of a URL.
830      * @return The fully qualified URL or the original link if it was absolute
831      * already or a failure occured.
832      */
833     public String   getAbsoluteURL (String   link)
834     {
835         String   base;
836         URL   url;
837         String   ret;
838 
839         if ((null == link) || ("".equals (link)))
840             ret = "";
841         else
842             try
843             {
844                 base =  getBaseUrl ();
845                 if (null == base)
846                     base = getUrl ();
847                 if (null == base)
848                     ret = link;
849                 else
850                 {
851                     url = constructUrl (link, base);
852                     ret = url.toExternalForm ();
853                 }
854             }
855             catch (MalformedURLException   murle)
856             {
857                 ret = link;
858             }
859 
860         return (ret);
861     }
862 
863     /**
864      * Get the line number for a cursor.
865      * @param cursor The character offset into the page.
866      * @return The line number the character is in.
867      */
868     public int row (Cursor cursor)
869     {
870         return (mIndex.row (cursor));
871     }
872 
873     /**
874      * Get the line number for a cursor.
875      * @param position The character offset into the page.
876      * @return The line number the character is in.
877      */
878     public int row (int position)
879     {
880         return (mIndex.row (position));
881     }
882 
883     /**
884      * Get the column number for a cursor.
885      * @param cursor The character offset into the page.
886      * @return The character offset into the line this cursor is on.
887      */
888     public int column (Cursor cursor)
889     {
890         return (mIndex.column (cursor));
891     }
892 
893     /**
894      * Get the column number for a cursor.
895      * @param position The character offset into the page.
896      * @return The character offset into the line this cursor is on.
897      */
898     public int column (int position)
899     {
900         return (mIndex.column (position));
901     }
902 
903     /**
904      * Get the text identified by the given limits.
905      * @param start The starting position, zero based.
906      * @param end The ending position
907      * (exclusive, i.e. the character at the ending position is not included),
908      * zero based.
909      * @return The text from <code>start</code> to <code>end</code>.
910      * @see #getText(StringBuffer, int, int)
911      * @exception IllegalArgumentException If an attempt is made to get
912      * characters ahead of the current source offset (character position).
913      */
914     public String   getText (int start, int end)
915     {
916         String   ret;
917         
918         try
919         {
920             ret = mSource.getString (start, end - start);
921         }
922         catch (IOException   ioe)
923         {
924             throw new IllegalArgumentException   (
925                 "can't get the "
926                 + (end - start)
927                 + "characters at position "
928                 + start
929                 + " - "
930                 + ioe.getMessage ());
931         }
932         
933         return (ret);
934     }
935 
936     /**
937      * Put the text identified by the given limits into the given buffer.
938      * @param buffer The accumulator for the characters.
939      * @param start The starting position, zero based.
940      * @param end The ending position
941      * (exclusive, i.e. the character at the ending position is not included),
942      * zero based.
943      * @exception IllegalArgumentException If an attempt is made to get
944      * characters ahead of the current source offset (character position).
945      */
946     public void getText (StringBuffer   buffer, int start, int end)
947     {
948         int length;
949 
950         if ((mSource.offset () < start) || (mSource.offset () < end))
951             throw new IllegalArgumentException   ("attempt to extract future characters from source");
952         if (end < start)
953         {
954             length = end;
955             end = start;
956             start = length;
957         }
958         length = end - start;
959         try
960         {
961             mSource.getCharacters (buffer, start, length);
962         }
963         catch (IOException   ioe)
964         {
965             throw new IllegalArgumentException   (
966                 "can't get the "
967                 + (end - start)
968                 + "characters at position "
969                 + start
970                 + " - "
971                 + ioe.getMessage ());
972         }
973     }
974 
975     /**
976      * Get all text read so far from the source.
977      * @return The text from the source.
978      * @see #getText(StringBuffer)
979      */
980     public String   getText ()
981     {
982         return (getText (0, mSource.offset ()));
983     }
984 
985     /**
986      * Put all text read so far from the source into the given buffer.
987      * @param buffer The accumulator for the characters.
988      * @see #getText(StringBuffer,int,int)
989      */
990     public void getText (StringBuffer   buffer)
991     {
992         getText (buffer, 0, mSource.offset ());
993     }
994 
995     /**
996      * Put the text identified by the given limits into the given array at the specified offset.
997      * @param array The array of characters.
998      * @param offset The starting position in the array where characters are to be placed.
999      * @param start The starting position, zero based.
1000     * @param end The ending position
1001     * (exclusive, i.e. the character at the ending position is not included),
1002     * zero based.
1003     * @exception IllegalArgumentException If an attempt is made to get
1004     * characters ahead of the current source offset (character position).
1005     */
1006    public void getText (char[] array, int offset, int start, int end)
1007    {
1008        int length;
1009
1010        if ((mSource.offset () < start) || (mSource.offset () < end))
1011            throw new IllegalArgumentException   ("attempt to extract future characters from source");
1012        if (end < start)
1013        {   // swap
1014            length = end;
1015            end = start;
1016            start = length;
1017        }
1018        length = end - start;
1019        try
1020        {
1021            mSource.getCharacters (array, offset, start, end);
1022        }
1023        catch (IOException   ioe)
1024        {
1025            throw new IllegalArgumentException   (
1026                "can't get the "
1027                + (end - start)
1028                + "characters at position "
1029                + start
1030                + " - "
1031                + ioe.getMessage ());
1032        }
1033    }
1034
1035    /**
1036     * Get the text line the position of the cursor lies on.
1037     * @param cursor The position to calculate for.
1038     * @return The contents of the URL or file corresponding to the line number
1039     * containg the cursor position.
1040     */
1041    public String   getLine (Cursor cursor)
1042    {
1043        int line;
1044        int size;
1045        int start;
1046        int end;
1047
1048        line = row (cursor);
1049        size = mIndex.size ();
1050        if (line < size)
1051        {
1052            start = mIndex.elementAt (line);
1053            line++;
1054            if (line <= size)
1055                end = mIndex.elementAt (line);
1056            else
1057                end = mSource.offset ();
1058        }
1059        else // current line
1060        {
1061            start = mIndex.elementAt (line - 1);
1062            end = mSource.offset ();
1063        }
1064        
1065            
1066        return (getText (start,  end));
1067    }
1068
1069    /**
1070     * Get the text line the position of the cursor lies on.
1071     * @param position The position to calculate for.
1072     * @return The contents of the URL or file corresponding to the line number
1073     * containg the cursor position.
1074     */
1075    public String   getLine (int position)
1076    {
1077        return (getLine (new Cursor (this, position)));
1078    }
1079    
1080    /**
1081     * Display some of this page as a string.
1082     * @return The last few characters the source read in.
1083     */
1084    public String   toString ()
1085    {
1086        StringBuffer   buffer;
1087        int start;
1088        String   ret;
1089
1090        if (mSource.offset () > 0)
1091        {
1092            buffer = new StringBuffer   (43);
1093            start = mSource.offset () - 40;
1094            if (0 > start)
1095                start = 0;
1096            else
1097                buffer.append ("...");
1098            getText (buffer, start, mSource.offset ());
1099            ret = buffer.toString ();
1100        }
1101        else
1102            ret = super.toString ();
1103        
1104        return (ret);
1105    }
1106}
1107
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags