KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > htmlparser > lexer > Page


// HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Page.java,v $
// $Author: derrickoswald $
// $Date: 2005/03/13 14:51:43 $
// $Revision: 1.48 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

27 package org.htmlparser.lexer;
28
29 import java.io.InputStream JavaDoc;
30 import java.io.IOException JavaDoc;
31 import java.io.ObjectInputStream JavaDoc;
32 import java.io.ObjectOutputStream JavaDoc;
33 import java.io.Serializable JavaDoc;
34 import java.io.UnsupportedEncodingException JavaDoc;
35 import java.lang.reflect.InvocationTargetException JavaDoc;
36 import java.lang.reflect.Method JavaDoc;
37 import java.net.MalformedURLException JavaDoc;
38 import java.net.URL JavaDoc;
39 import java.net.URLConnection JavaDoc;
40 import java.net.UnknownHostException JavaDoc;
41 import java.util.zip.GZIPInputStream JavaDoc;
42 import java.util.zip.InflaterInputStream JavaDoc;
43
44 import org.htmlparser.http.ConnectionManager;
45 import org.htmlparser.util.ParserException;
46
47 /**
48  * Represents the contents of an HTML page.
49  * Contains the source of characters and an index of positions of line
50  * separators (actually the first character position on the next line).
51  */

52 public class Page
53     implements
54         Serializable JavaDoc
55 {
56     /**
57      * The default charset.
58      * This should be <code>{@value}</code>,
59      * see RFC 2616 (http://www.ietf.org/rfc/rfc2616.txt?number=2616) section 3.7.1
60      * Another alias is "8859_1".
61      */

62     public static final String JavaDoc DEFAULT_CHARSET = "ISO-8859-1";
63
64     /**
65      * The default content type.
66      * In the absence of alternate information, assume html content ({@value}).
67      */

68     public static final String JavaDoc DEFAULT_CONTENT_TYPE = "text/html";
69
70     /**
71      * Character value when the page is exhausted.
72      * Has a value of {@value}.
73      */

74     public static final char EOF = (char)Source.EOF;
75
76     /**
77      * The URL this page is coming from.
78      * Cached value of <code>getConnection().toExternalForm()</code> or
79      * <code>setUrl()</code>.
80      */

81     protected String JavaDoc mUrl;
82
83     /**
84      * The base URL for this page.
85      */

86     protected String JavaDoc mBaseUrl;
87
88     /**
89      * The source of characters.
90      */

91     protected Source mSource;
92
93     /**
94      * Character positions of the first character in each line.
95      */

96     protected PageIndex mIndex;
97     
98     /**
99      * The connection this page is coming from or <code>null</code>.
100      */

101     protected transient URLConnection JavaDoc mConnection;
102
103     /**
104      * Connection control (proxy, cookies, authorization).
105      */

106     public static ConnectionManager mConnectionManager = new ConnectionManager ();
107
108     /**
109      * Construct an empty page.
110      */

111     public Page ()
112     {
113         this ("");
114     }
115
116     /**
117      * Construct a page reading from a URL connection.
118      * @param connection A fully conditioned connection. The connect()
119      * method will be called so it need not be connected yet.
120      * @exception ParserException An exception object wrapping a number of
121      * possible error conditions, some of which are outlined below.
122      * <li>IOException If an i/o exception occurs creating the
123      * source.</li>
124      * <li>UnsupportedEncodingException if the character set specified in the
125      * HTTP header is not supported.</li>
126      */

127     public Page (URLConnection JavaDoc connection) throws ParserException
128     {
129         if (null == connection)
130             throw new IllegalArgumentException JavaDoc ("connection cannot be null");
131         setConnection (connection);
132         mBaseUrl = null;
133     }
134
135     /**
136      * Construct a page from a stream encoded with the given charset.
137      * @param stream The source of bytes.
138      * @param charset The encoding used.
139      * If null, defaults to the <code>DEFAULT_CHARSET</code>.
140      * @exception UnsupportedEncodingException If the given charset is not supported.
141      */

142     public Page (InputStream JavaDoc stream, String JavaDoc charset)
143         throws
144             UnsupportedEncodingException JavaDoc
145     {
146         if (null == stream)
147             throw new IllegalArgumentException JavaDoc ("stream cannot be null");
148         if (null == charset)
149             charset = DEFAULT_CHARSET;
150         mSource = new InputStreamSource (stream, charset);
151         mIndex = new PageIndex (this);
152         mConnection = null;
153         mUrl = null;
154         mBaseUrl = null;
155     }
156
157     /**
158      * Construct a page from the given string.
159      * @param text The HTML text.
160      * @param charset <em>Optional</em>. The character set encoding that will
161      * be reported by {@link #getEncoding}. If charset is <code>null</code>
162      * the default character set is used.
163      */

164     public Page (String JavaDoc text, String JavaDoc charset)
165     {
166         if (null == text)
167             throw new IllegalArgumentException JavaDoc ("text cannot be null");
168         if (null == charset)
169             charset = DEFAULT_CHARSET;
170         mSource = new StringSource (text, charset);
171         mIndex = new PageIndex (this);
172         mConnection = null;
173         mUrl = null;
174         mBaseUrl = null;
175     }
176
177     /**
178      * Construct a page from the given string.
179      * The page will report that it is using an encoding of
180      * {@link #DEFAULT_CHARSET}.
181      * @param text The HTML text.
182      */

183     public Page (String JavaDoc text)
184     {
185         this (text, null);
186     }
187
    //
    // static methods
    //

192     /**
193      * Get the connection manager all Parsers use.
194      * @return The connection manager.
195      */

196     public static ConnectionManager getConnectionManager ()
197     {
198         return (mConnectionManager);
199     }
200
201     /**
202      * Set the connection manager to use.
203      * @param manager The new connection manager.
204      */

205     public static void setConnectionManager (ConnectionManager manager)
206     {
207         mConnectionManager = manager;
208     }
209
210     /**
211      * Get a CharacterSet name corresponding to a charset parameter.
212      * @param content A text line of the form:
213      * <pre>
214      * text/html; charset=Shift_JIS
215      * </pre>
216      * which is applicable both to the HTTP header field Content-Type and
217      * the meta tag http-equiv="Content-Type".
218      * Note this method also handles non-compliant quoted charset directives such as:
219      * <pre>
220      * text/html; charset="UTF-8"
221      * </pre>
222      * and
223      * <pre>
224      * text/html; charset='UTF-8'
225      * </pre>
226      * @return The character set name to use when reading the input stream.
227      * For JDKs that have the Charset class this is qualified by passing
228      * the name to findCharset() to render it into canonical form.
229      * If the charset parameter is not found in the given string, the default
230      * character set is returned.
231      * @see #findCharset
232      * @see #DEFAULT_CHARSET
233      */

234     public static String JavaDoc getCharset (String JavaDoc content)
235     {
236         final String JavaDoc CHARSET_STRING = "charset";
237         int index;
238         String JavaDoc ret;
239
240         ret = DEFAULT_CHARSET;
241         if (null != content)
242         {
243             index = content.indexOf (CHARSET_STRING);
244
245             if (index != -1)
246             {
247                 content = content.substring (index + CHARSET_STRING.length ()).trim ();
248                 if (content.startsWith ("="))
249                 {
250                     content = content.substring (1).trim ();
251                     index = content.indexOf (";");
252                     if (index != -1)
253                         content = content.substring (0, index);
254
255                     //remove any double quotes from around charset string
256
if (content.startsWith ("\"") && content.endsWith ("\"") && (1 < content.length ()))
257                         content = content.substring (1, content.length () - 1);
258
259                     //remove any single quote from around charset string
260
if (content.startsWith ("'") && content.endsWith ("'") && (1 < content.length ()))
261                         content = content.substring (1, content.length () - 1);
262
263                     ret = findCharset (content, ret);
264
265                     // Charset names are not case-sensitive;
266
// that is, case is always ignored when comparing charset names.
267
// if (!ret.equalsIgnoreCase (content))
268
// {
269
// System.out.println (
270
// "detected charset \""
271
// + content
272
// + "\", using \""
273
// + ret
274
// + "\"");
275
// }
276
}
277             }
278         }
279
280         return (ret);
281     }
282
283     /**
284      * Lookup a character set name.
285      * <em>Vacuous for JVM's without <code>java.nio.charset</code>.</em>
286      * This uses reflection so the code will still run under prior JDK's but
287      * in that case the default is always returned.
288      * @param name The name to look up. One of the aliases for a character set.
289      * @param _default The name to return if the lookup fails.
290      */

291     public static String JavaDoc findCharset (String JavaDoc name, String JavaDoc _default)
292     {
293         String JavaDoc ret;
294
295         try
296         {
297             Class JavaDoc cls;
298             Method JavaDoc method;
299             Object JavaDoc object;
300
301             cls = Class.forName ("java.nio.charset.Charset");
302             method = cls.getMethod ("forName", new Class JavaDoc[] { String JavaDoc.class });
303             object = method.invoke (null, new Object JavaDoc[] { name });
304             method = cls.getMethod ("name", new Class JavaDoc[] { });
305             object = method.invoke (object, new Object JavaDoc[] { });
306             ret = (String JavaDoc)object;
307         }
308         catch (ClassNotFoundException JavaDoc cnfe)
309         {
310             // for reflection exceptions, assume the name is correct
311
ret = name;
312         }
313         catch (NoSuchMethodException JavaDoc nsme)
314         {
315             // for reflection exceptions, assume the name is correct
316
ret = name;
317         }
318         catch (IllegalAccessException JavaDoc ia)
319         {
320             // for reflection exceptions, assume the name is correct
321
ret = name;
322         }
323         catch (InvocationTargetException JavaDoc ita)
324         {
325             // java.nio.charset.IllegalCharsetNameException
326
// and java.nio.charset.UnsupportedCharsetException
327
// return the default
328
ret = _default;
329             System.out.println (
330                 "unable to determine cannonical charset name for "
331                 + name
332                 + " - using "
333                 + _default);
334         }
335
336         return (ret);
337     }
338
    //
    // Serialization support
    //

343     /**
344      * Serialize the page.
345      * There are two modes to serializing a page based on the connected state.
346      * If connected, the URL and the current offset is saved, while if
347      * disconnected, the underling source is saved.
348      * @param out The object stream to store this object in.
349      */

350     private void writeObject (ObjectOutputStream JavaDoc out)
351         throws
352             IOException JavaDoc
353     {
354         String JavaDoc href;
355         Source source;
356         PageIndex index;
357
358         // two cases, reading from a URL and not
359
if (null != getConnection ())
360         {
361             out.writeBoolean (true);
362             out.writeInt (mSource.offset ()); // need to preread this much
363
href = getUrl ();
364             out.writeObject (href);
365             setUrl (getConnection ().getURL ().toExternalForm ());
366             source = getSource ();
367             mSource = null; // don't serialize the source if we can avoid it
368
index = mIndex;
369             mIndex = null; // will get recreated; valid for the new page anyway?
370
out.defaultWriteObject ();
371             mSource = source;
372             mIndex = index;
373         }
374         else
375         {
376             out.writeBoolean (false);
377             href = getUrl ();
378             out.writeObject (href);
379             setUrl (null); // don't try and read a bogus URL
380
out.defaultWriteObject ();
381             setUrl (href);
382         }
383     }
384
385     /**
386      * Deserialize the page.
387      * For details see <code>writeObject()</code>.
388      * @param in The object stream to decode.
389      */

390     private void readObject (ObjectInputStream JavaDoc in)
391         throws
392             IOException JavaDoc,
393             ClassNotFoundException JavaDoc
394     {
395         boolean fromurl;
396         int offset;
397         String JavaDoc href;
398         URL JavaDoc url;
399         Cursor cursor;
400
401         fromurl = in.readBoolean ();
402         if (fromurl)
403         {
404             offset = in.readInt ();
405             href = (String JavaDoc)in.readObject ();
406             in.defaultReadObject ();
407             // open the URL
408
if (null != getUrl ())
409             {
410                 url = new URL JavaDoc (getUrl ());
411                 try
412                 {
413                     setConnection (url.openConnection ());
414                 }
415                 catch (ParserException pe)
416                 {
417                     throw new IOException JavaDoc (pe.getMessage ());
418                 }
419             }
420             cursor = new Cursor (this, 0);
421             for (int i = 0; i < offset; i++)
422                 try
423                 {
424                     getCharacter (cursor);
425                 }
426                 catch (ParserException pe)
427                 {
428                     throw new IOException JavaDoc (pe.getMessage ());
429                 }
430             setUrl (href);
431         }
432         else
433         {
434             href = (String JavaDoc)in.readObject ();
435             in.defaultReadObject ();
436             setUrl (href);
437         }
438     }
439
440     /**
441      * Reset the page by resetting the source of characters.
442      */

443     public void reset ()
444     {
445         getSource ().reset ();
446         mIndex = new PageIndex (this); // todo: is this really necessary?
447
}
448
449     /**
450      * Close the page by destroying the source of characters.
451      */

452     public void close () throws IOException JavaDoc
453     {
454         if (null != getSource ())
455             getSource ().destroy ();
456     }
457
458     /**
459      * Clean up this page, releasing resources.
460      * Calls <code>close()</code>.
461      * @exception Throwable if <code>close()</code> throws an <code>IOException</code>.
462      */

463     protected void finalize () throws Throwable JavaDoc
464     {
465         close ();
466     }
467     
468     /**
469      * Get the connection, if any.
470      * @return The connection object for this page, or null if this page
471      * is built from a stream or a string.
472      */

473     public URLConnection JavaDoc getConnection ()
474     {
475         return (mConnection);
476     }
477
478     /**
479      * Set the URLConnection to be used by this page.
480      * Starts reading from the given connection.
481      * This also resets the current url.
482      * @param connection The connection to use.
483      * It will be connected by this method.
484      * @exception ParserException If the <code>connect()</code> method fails,
485      * or an I/O error occurs opening the input stream or the character set
486      * designated in the HTTP header is unsupported.
487      */

488     public void setConnection (URLConnection JavaDoc connection)
489         throws
490             ParserException
491     {
492         Stream stream;
493         String JavaDoc type;
494         String JavaDoc charset;
495         String JavaDoc contentEncoding;
496
497         mConnection = connection;
498         try
499         {
500             getConnection ().connect ();
501         }
502         catch (UnknownHostException JavaDoc uhe)
503         {
504             throw new ParserException ("Connect to " + mConnection.getURL ().toExternalForm () + " failed.", uhe);
505         }
506         catch (IOException JavaDoc ioe)
507         {
508             throw new ParserException ("Exception connecting to " + mConnection.getURL ().toExternalForm () + " (" + ioe.getMessage () + ").", ioe);
509         }
510         type = getContentType ();
511         charset = getCharset (type);
512         try
513         {
514             contentEncoding = connection.getContentEncoding();
515             if ((null != contentEncoding) && (-1 != contentEncoding.indexOf ("gzip")))
516             {
517                 stream = new Stream (new GZIPInputStream JavaDoc (getConnection ().getInputStream ()));
518             }
519             else if ((null != contentEncoding) && (-1 != contentEncoding.indexOf ("deflate")))
520             {
521                 stream = new Stream (new InflaterInputStream JavaDoc (getConnection ().getInputStream ()));
522             }
523             else
524             {
525                 stream = new Stream (getConnection ().getInputStream ());
526             }
527
528             try
529             {
530                 mSource = new InputStreamSource (stream, charset);
531             }
532             catch (UnsupportedEncodingException JavaDoc uee)
533             {
534 // StringBuffer msg;
535
//
536
// msg = new StringBuffer (1024);
537
// msg.append (getConnection ().getURL ().toExternalForm ());
538
// msg.append (" has an encoding (");
539
// msg.append (charset);
540
// msg.append (") which is not supported, using ");
541
// msg.append (DEFAULT_CHARSET);
542
// System.out.println (msg.toString ());
543
charset = DEFAULT_CHARSET;
544                 mSource = new InputStreamSource (stream, charset);
545             }
546         }
547         catch (IOException JavaDoc ioe)
548         {
549             throw new ParserException ("Exception getting input stream from " + mConnection.getURL ().toExternalForm () + " (" + ioe.getMessage () + ").", ioe);
550         }
551         mUrl = connection.getURL ().toExternalForm ();
552         mIndex = new PageIndex (this);
553     }
554
555     /**
556      * Get the URL for this page.
557      * This is only available if the page has a connection
558      * (<code>getConnection()</code> returns non-null), or the document base has
559      * been set via a call to <code>setUrl()</code>.
560      * @return The url for the connection, or <code>null</code> if there is
561      * no conenction or the document base has not been set.
562      */

563     public String JavaDoc getUrl ()
564     {
565         return (mUrl);
566     }
567
568     /**
569      * Set the URL for this page.
570      * This doesn't affect the contents of the page, just the interpretation
571      * of relative links from this point forward.
572      * @param url The new URL.
573      */

574     public void setUrl (String JavaDoc url)
575     {
576         mUrl = url;
577     }
578
579     /**
580      * Gets the baseUrl.
581      * @return The base URL for this page, or <code>null</code> if not set.
582      */

583     public String JavaDoc getBaseUrl ()
584     {
585         return (mBaseUrl);
586     }
587
588     /**
589      * Sets the baseUrl.
590      * @param url The base url for this page.
591      */

592     public void setBaseUrl (String JavaDoc url)
593     {
594         mBaseUrl = url;
595     }
596     
597     /**
598      * Get the source this page is reading from.
599      */

600     public Source getSource ()
601     {
602         return (mSource);
603     }
604
605     /**
606      * Try and extract the content type from the HTTP header.
607      * @return The content type.
608      */

609     public String JavaDoc getContentType ()
610     {
611         URLConnection JavaDoc connection;
612         String JavaDoc content;
613         String JavaDoc ret;
614
615         ret = DEFAULT_CONTENT_TYPE;
616         connection = getConnection ();
617         if (null != connection)
618         {
619             content = connection.getContentType ();
620             if (null != content)
621                 ret = content;
622         }
623
624         return (ret);
625     }
626
627     /**
628      * Read the character at the cursor position.
629      * The cursor position can be behind or equal to the current source position.
630      * Returns end of lines (EOL) as \n, by converting \r and \r\n to \n,
631      * and updates the end-of-line index accordingly
632      * Advances the cursor position by one (or two in the \r\n case).
633      * @param cursor The position to read at.
634      * @return The character at that position, and modifies the cursor to
635      * prepare for the next read. If the source is exhausted a zero is returned.
636      * @exception ParserException If an IOException on the underlying source
637      * occurs, or an attemp is made to read characters in the future (the
638      * cursor position is ahead of the underlying stream)
639      */

640     public char getCharacter (Cursor cursor)
641         throws
642             ParserException
643     {
644         int i;
645         char ret;
646
647         i = cursor.getPosition ();
648         if (mSource.offset () < i)
649             // hmmm, we could skip ahead, but then what about the EOL index
650
throw new ParserException ("attempt to read future characters from source");
651         else if (mSource.offset () == i)
652             try
653             {
654                 i = mSource.read ();
655                 if (Source.EOF == i)
656                     ret = EOF;
657                 else
658                 {
659                     ret = (char)i;
660                     cursor.advance ();
661                 }
662             }
663             catch (IOException JavaDoc ioe)
664             {
665                 throw new ParserException (
666                     "problem reading a character at position "
667                     + cursor.getPosition (), ioe);
668             }
669         else
670         {
671             // historic read
672
try
673             {
674                 ret = mSource.getCharacter (i);
675             }
676             catch (IOException JavaDoc ioe)
677             {
678                 throw new ParserException (
679                     "can't read a character at position "
680                     + i, ioe);
681             }
682             cursor.advance ();
683         }
684
685         // handle \r
686
if ('\r' == ret)
687         { // switch to single character EOL
688
ret = '\n';
689
690             // check for a \n in the next position
691
if (mSource.offset () == cursor.getPosition ())
692                 try
693                 {
694                     i = mSource.read ();
695                     if (Source.EOF == i)
696                     {
697                         // do nothing
698
}
699                     else if ('\n' == (char)i)
700                         cursor.advance ();
701                     else
702                         try
703                         {
704                             mSource.unread ();
705                         }
706                         catch (IOException JavaDoc ioe)
707                         {
708                             throw new ParserException (
709                                 "can't unread a character at position "
710                                 + cursor.getPosition (), ioe);
711                         }
712                 }
713                 catch (IOException JavaDoc ioe)
714                 {
715                     throw new ParserException (
716                         "problem reading a character at position "
717                         + cursor.getPosition (), ioe);
718                 }
719             else
720                 try
721                 {
722                     if ('\n' == mSource.getCharacter (cursor.getPosition ()))
723                         cursor.advance ();
724                 }
725                 catch (IOException JavaDoc ioe)
726                 {
727                     throw new ParserException (
728                         "can't read a character at position "
729                         + cursor.getPosition (), ioe);
730                 }
731         }
732         if ('\n' == ret)
733             // update the EOL index in any case
734
mIndex.add (cursor);
735
736         return (ret);
737     }
738
739     /**
740      * Get the current encoding being used.
741      * @return The encoding used to convert characters.
742      */

743     public String JavaDoc getEncoding ()
744     {
745         return (getSource ().getEncoding ());
746     }
747
748     /**
749      * Begins reading from the source with the given character set.
750      * If the current encoding is the same as the requested encoding,
751      * this method is a no-op. Otherwise any subsequent characters read from
752      * this page will have been decoded using the given character set.<p>
753      * Some magic happens here to obtain this result if characters have already
754      * been consumed from this page.
755      * Since a Reader cannot be dynamically altered to use a different character
756      * set, the underlying stream is reset, a new Source is constructed
757      * and a comparison made of the characters read so far with the newly
758      * read characters up to the current position.
759      * If a difference is encountered, or some other problem occurs,
760      * an exception is thrown.
761      * @param character_set The character set to use to convert bytes into
762      * characters.
763      * @exception ParserException If a character mismatch occurs between
764      * characters already provided and those that would have been returned
765      * had the new character set been in effect from the beginning. An
766      * exception is also thrown if the underlying stream won't put up with
767      * these shenanigans.
768      */

769     public void setEncoding (String JavaDoc character_set)
770         throws
771             ParserException
772     {
773         getSource ().setEncoding (character_set);
774     }
775
776     /**
777      * Build a URL from the link and base provided.
778      * @param link The (relative) URI.
779      * @param base The base URL of the page, either from the &lt;BASE&gt; tag
780      * or, if none, the URL the page is being fetched from.
781      * @return An absolute URL.
782      */

783     public URL JavaDoc constructUrl (String JavaDoc link, String JavaDoc base)
784         throws MalformedURLException JavaDoc
785     {
786         String JavaDoc path;
787         boolean modified;
788         boolean absolute;
789         int index;
790         URL JavaDoc url; // constructed URL combining relative link and base
791

792         url = new URL JavaDoc (new URL JavaDoc (base), link);
793         path = url.getFile ();
794         modified = false;
795         absolute = link.startsWith ("/");
796         if (!absolute)
797         { // we prefer to fix incorrect relative links
798
// this doesn't fix them all, just the ones at the start
799
while (path.startsWith ("/."))
800             {
801                 if (path.startsWith ("/../"))
802                 {
803                     path = path.substring (3);
804                     modified = true;
805                 }
806                 else if (path.startsWith ("/./") || path.startsWith("/."))
807                 {
808                     path = path.substring (2);
809                     modified = true;
810                 }
811                 else
812                     break;
813             }
814         }
815         // fix backslashes
816
while (-1 != (index = path.indexOf ("/\\")))
817         {
818             path = path.substring (0, index + 1) + path.substring (index + 2);
819             modified = true;
820         }
821         if (modified)
822             url = new URL JavaDoc (url, path);
823
824         return (url);
825     }
826
827     /**
828      * Create an absolute URL from a relative link.
829      * @param link The reslative portion of a URL.
830      * @return The fully qualified URL or the original link if it was absolute
831      * already or a failure occured.
832      */

833     public String JavaDoc getAbsoluteURL (String JavaDoc link)
834     {
835         String JavaDoc base;
836         URL JavaDoc url;
837         String JavaDoc ret;
838
839         if ((null == link) || ("".equals (link)))
840             ret = "";
841         else
842             try
843             {
844                 base = getBaseUrl ();
845                 if (null == base)
846                     base = getUrl ();
847                 if (null == base)
848                     ret = link;
849                 else
850                 {
851                     url = constructUrl (link, base);
852                     ret = url.toExternalForm ();
853                 }
854             }
855             catch (MalformedURLException JavaDoc murle)
856             {
857                 ret = link;
858             }
859
860         return (ret);
861     }
862
863     /**
864      * Get the line number for a cursor.
865      * @param cursor The character offset into the page.
866      * @return The line number the character is in.
867      */

868     public int row (Cursor cursor)
869     {
870         return (mIndex.row (cursor));
871     }
872
873     /**
874      * Get the line number for a cursor.
875      * @param position The character offset into the page.
876      * @return The line number the character is in.
877      */

878     public int row (int position)
879     {
880         return (mIndex.row (position));
881     }
882
883     /**
884      * Get the column number for a cursor.
885      * @param cursor The character offset into the page.
886      * @return The character offset into the line this cursor is on.
887      */

888     public int column (Cursor cursor)
889     {
890         return (mIndex.column (cursor));
891     }
892
893     /**
894      * Get the column number for a cursor.
895      * @param position The character offset into the page.
896      * @return The character offset into the line this cursor is on.
897      */

898     public int column (int position)
899     {
900         return (mIndex.column (position));
901     }
902
903     /**
904      * Get the text identified by the given limits.
905      * @param start The starting position, zero based.
906      * @param end The ending position
907      * (exclusive, i.e. the character at the ending position is not included),
908      * zero based.
909      * @return The text from <code>start</code> to <code>end</code>.
910      * @see #getText(StringBuffer, int, int)
911      * @exception IllegalArgumentException If an attempt is made to get
912      * characters ahead of the current source offset (character position).
913      */

914     public String JavaDoc getText (int start, int end)
915     {
916         String JavaDoc ret;
917         
918         try
919         {
920             ret = mSource.getString (start, end - start);
921         }
922         catch (IOException JavaDoc ioe)
923         {
924             throw new IllegalArgumentException JavaDoc (
925                 "can't get the "
926                 + (end - start)
927                 + "characters at position "
928                 + start
929                 + " - "
930                 + ioe.getMessage ());
931         }
932         
933         return (ret);
934     }
935
936     /**
937      * Put the text identified by the given limits into the given buffer.
938      * @param buffer The accumulator for the characters.
939      * @param start The starting position, zero based.
940      * @param end The ending position
941      * (exclusive, i.e. the character at the ending position is not included),
942      * zero based.
943      * @exception IllegalArgumentException If an attempt is made to get
944      * characters ahead of the current source offset (character position).
945      */

946     public void getText (StringBuffer JavaDoc buffer, int start, int end)
947     {
948         int length;
949
950         if ((mSource.offset () < start) || (mSource.offset () < end))
951             throw new IllegalArgumentException JavaDoc ("attempt to extract future characters from source");
952         if (end < start)
953         {
954             length = end;
955             end = start;
956             start = length;
957         }
958         length = end - start;
959         try
960         {
961             mSource.getCharacters (buffer, start, length);
962         }
963         catch (IOException JavaDoc ioe)
964         {
965             throw new IllegalArgumentException JavaDoc (
966                 "can't get the "
967                 + (end - start)
968                 + "characters at position "
969                 + start
970                 + " - "
971                 + ioe.getMessage ());
972         }
973     }
974
975     /**
976      * Get all text read so far from the source.
977      * @return The text from the source.
978      * @see #getText(StringBuffer)
979      */

980     public String JavaDoc getText ()
981     {
982         return (getText (0, mSource.offset ()));
983     }
984
985     /**
986      * Put all text read so far from the source into the given buffer.
987      * @param buffer The accumulator for the characters.
988      * @see #getText(StringBuffer,int,int)
989      */

990     public void getText (StringBuffer JavaDoc buffer)
991     {
992         getText (buffer, 0, mSource.offset ());
993     }
994
995     /**
996      * Put the text identified by the given limits into the given array at the specified offset.
997      * @param array The array of characters.
998      * @param offset The starting position in the array where characters are to be placed.
999      * @param start The starting position, zero based.
1000     * @param end The ending position
1001     * (exclusive, i.e. the character at the ending position is not included),
1002     * zero based.
1003     * @exception IllegalArgumentException If an attempt is made to get
1004     * characters ahead of the current source offset (character position).
1005     */

1006    public void getText (char[] array, int offset, int start, int end)
1007    {
1008        int length;
1009
1010        if ((mSource.offset () < start) || (mSource.offset () < end))
1011            throw new IllegalArgumentException JavaDoc ("attempt to extract future characters from source");
1012        if (end < start)
1013        { // swap
1014
length = end;
1015            end = start;
1016            start = length;
1017        }
1018        length = end - start;
1019        try
1020        {
1021            mSource.getCharacters (array, offset, start, end);
1022        }
1023        catch (IOException JavaDoc ioe)
1024        {
1025            throw new IllegalArgumentException JavaDoc (
1026                "can't get the "
1027                + (end - start)
1028                + "characters at position "
1029                + start
1030                + " - "
1031                + ioe.getMessage ());
1032        }
1033    }
1034
1035    /**
1036     * Get the text line the position of the cursor lies on.
1037     * @param cursor The position to calculate for.
1038     * @return The contents of the URL or file corresponding to the line number
1039     * containg the cursor position.
1040     */

1041    public String JavaDoc getLine (Cursor cursor)
1042    {
1043        int line;
1044        int size;
1045        int start;
1046        int end;
1047
1048        line = row (cursor);
1049        size = mIndex.size ();
1050        if (line < size)
1051        {
1052            start = mIndex.elementAt (line);
1053            line++;
1054            if (line <= size)
1055                end = mIndex.elementAt (line);
1056            else
1057                end = mSource.offset ();
1058        }
1059        else // current line
1060
{
1061            start = mIndex.elementAt (line - 1);
1062            end = mSource.offset ();
1063        }
1064        
1065            
1066        return (getText (start, end));
1067    }
1068
1069    /**
1070     * Get the text line the position of the cursor lies on.
1071     * @param position The position to calculate for.
1072     * @return The contents of the URL or file corresponding to the line number
1073     * containg the cursor position.
1074     */

1075    public String JavaDoc getLine (int position)
1076    {
1077        return (getLine (new Cursor (this, position)));
1078    }
1079    
1080    /**
1081     * Display some of this page as a string.
1082     * @return The last few characters the source read in.
1083     */

1084    public String JavaDoc toString ()
1085    {
1086        StringBuffer JavaDoc buffer;
1087        int start;
1088        String JavaDoc ret;
1089
1090        if (mSource.offset () > 0)
1091        {
1092            buffer = new StringBuffer JavaDoc (43);
1093            start = mSource.offset () - 40;
1094            if (0 > start)
1095                start = 0;
1096            else
1097                buffer.append ("...");
1098            getText (buffer, start, mSource.offset ());
1099            ret = buffer.toString ();
1100        }
1101        else
1102            ret = super.toString ();
1103        
1104        return (ret);
1105    }
1106}
1107
Popular Tags