KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > websphinx > Page


1 /*
2  * WebSphinx web-crawling toolkit
3  *
4  * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
5  * reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  *
14  * 2. Redistributions in binary form must reproduce the above copyright
15  * notice, this list of conditions and the following disclaimer in
16  * the documentation and/or other materials provided with the
17  * distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
20  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
23  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  */

32
33 package websphinx;
34
35 import java.net.URL JavaDoc;
36 import java.net.URLConnection JavaDoc;
37 //#ifdef JDK1.1
38
import java.net.HttpURLConnection JavaDoc;
39 //#endif JDK1.1
40
import java.io.IOException JavaDoc;
41 import java.io.InputStream JavaDoc;
42 import rcm.util.Str;
43
44 /**
45  * A Web page. Although a Page can represent any MIME type, it mainly
46  * supports HTML pages, which are automatically parsed. The parsing produces
47  * a list of tags, a list of words, an HTML parse tree, and a list of links.
48  */

49 public class Page extends Region {
50
51     // typical page length, to optimize downloads
52
static final int TYPICAL_LENGTH = 20240;
53
54     // Permanent content
55
Link origin;
56     long lastModified = 0;
57     long expiration = 0;
58     String JavaDoc contentType;
59     String JavaDoc contentEncoding;
60     int responseCode = -1;
61     String JavaDoc responseMessage = null;
62     URL JavaDoc base;
63     String JavaDoc title;
64     Link[] links;
65
66     int contentLock;
67         // If page was downloaded from Net, represents number of
68
// callers who want to keep the content.
69
// If page was created from a string, set to -1.
70

71     // Discardable content (thrown away when contentLock falls to 0)
72
byte[] contentBytes;
73     String JavaDoc content;
74     Region[] tokens;
75     Text[] words;
76     Tag[] tags;
77     Element[] elements;
78     Element root;
79     String JavaDoc canonicalTags;
80
81     /**
82      * Make a Page by downloading and parsing a Link.
83      * @param link Link to download
84      */

85     public Page (Link link) throws IOException JavaDoc {
86         this (link, DownloadParameters.NO_LIMITS, new HTMLParser ());
87     }
88
89     /**
90      * Make a Page by downloading a Link.
91      * @param link Link to download
92      * @param dp Download parameters to use
93      */

94     public Page (Link link, DownloadParameters dp) throws IOException JavaDoc {
95         this (link, dp, new HTMLParser ());
96     }
97
98     /**
99      * Make a Page by downloading a Link.
100      * @param link Link to download
101      * @param parser HTML parser to use
102      */

103     public Page (Link link, DownloadParameters dp, HTMLParser parser) throws IOException JavaDoc {
104         super (null, 0, 0);
105         source = this;
106         origin = link;
107         base = getURL ();
108         download (dp, parser);
109         link.setPage (this);
110     }
111
112     /**
113      * Make a Page from a URL and a string of HTML.
114      * The created page has no originating link, so calls to getURL(), getProtocol(), etc. will fail.
115      * @param url URL to use as a base for relative links on the page
116      * @param html the HTML content of the page
117      */

118     public Page (URL JavaDoc url, String JavaDoc html) {
119         this (url, html, new HTMLParser ());
120     }
121
122     /**
123      * Make a Page from a URL and a string of HTML.
124      * The created page has no originating link, so calls to getURL(), getProtocol(), etc. will fail.
125      * @param url URL to use as a base for relative links on the page
126      * @param html the HTML content of the page
127      * @param parser HTML parser to use
128      */

129     public Page (URL JavaDoc url, String JavaDoc html, HTMLParser parser) {
130         super (null, 0, html.length());
131         source = this;
132         base = url;
133         this.content = html;
134         this.contentBytes = html.getBytes ();
135         contentLock = -1;
136         parse (parser);
137     }
138
139     /**
140      * Make a Page from a string of content. The content is not parsed.
141      * The created page has no originating link, so calls to getURL(), getProtocol(), etc. will fail.
142      * @param content HTML content of the page */

143     public Page (String JavaDoc content) {
144         super (null, 0, content.length());
145         // FIX: don't think base==null will work
146
source = this;
147         this.content = content;
148         this.contentBytes = content.getBytes ();
149         contentLock = -1;
150     }
151
152     /**
153      * Make a Page from a byte array of content. The content is not parsed.
154      * The created page has no originating link, so calls to getURL(), getProtocol(), etc. will fail.
155      * @param content byte content of the page */

156     public Page (byte[] content) {
157         super (null, 0, content.length);
158         // FIX: don't think base==null will work
159
source = this;
160         this.contentBytes = new byte[content.length];
161         System.arraycopy (content, 0, this.contentBytes, 0, content.length);
162         this.content = new String JavaDoc (content);
163         contentLock = -1;
164     }
165
166     //
167
// Downloading
168
//
169

170     // This code generates SecurityExceptions in Netscape 4.0,
171
// and it doesn't seem to be necessary anyway: redirects are followed
172
// by Netscape and JDK by default, despite the fact that the JDK
173
// docs claim that setFollowRedirects() defaults to false
174

175     //static {
176
//try {
177
// HttpURLConnection.setFollowRedirects (true);
178
//} catch (Throwable t) { }
179
//}
180

181     /*
182      * Download the page. The downloaded page is parsed
183      * if its MIME type is HTML or unspecified.
184      * @param parser HTML parser to use
185      * @exception IOException if an error occurs in downloading the page
186      */

187     public void download (DownloadParameters dp, HTMLParser parser) throws IOException JavaDoc {
188         URLConnection JavaDoc conn =
189             Access.getAccess ().openConnection (origin);
190         
191         // fetch and store final redirected URL and response headers
192
InputStream JavaDoc in = conn.getInputStream ();
193         base = conn.getURL ();
194         lastModified = conn.getLastModified ();
195         expiration = conn.getExpiration ();
196         contentType = conn.getContentType ();
197         contentEncoding = conn.getContentEncoding ();
198
199 //#ifdef JDK1.1
200
// get HTTP response codes
201
if (conn instanceof HttpURLConnection JavaDoc) {
202             HttpURLConnection JavaDoc httpconn = (HttpURLConnection JavaDoc)conn;
203
204             responseCode = httpconn.getResponseCode ();
205             responseMessage = httpconn.getResponseMessage ();
206             if (responseMessage == null)
207                 responseMessage = "unknown error";
208             
209             if (responseCode >= 300)
210                 // HTTP failure
211
throw new IOException JavaDoc (responseCode + " " + responseMessage);
212         }
213 //#endif JDK1.1
214

215 // System.err.println ("Original URL: " + origin.getURL());
216
// System.err.println ("Final URL: " + conn.getURL());
217

218         // download content
219
int maxKB = dp.getMaxPageSize ();
220         int maxBytes = (maxKB > 0) ? maxKB * 1024 : Integer.MAX_VALUE;
221         int expectedLength = conn.getContentLength ();
222         if (expectedLength > maxBytes)
223             throw new IOException JavaDoc ("Page greater than "
224                                    + maxBytes + " bytes");
225         if (expectedLength == -1)
226             expectedLength = TYPICAL_LENGTH;
227         byte[] buf = new byte[expectedLength];
228         int n;
229         int total = 0;
230
231         while ((n = in.read (buf, total, buf.length - total)) != -1) {
232             total += n;
233             if (total > maxBytes)
234                 throw new IOException JavaDoc ("Page greater than "
235                                        + maxBytes + " bytes");
236             if (total == buf.length) {
237                 // try to read one more character
238
int c = in.read ();
239                 if (c == -1)
240                     break; // EOF, we're done
241
else {
242                     // need more space in array. Double the array, but don't make
243
// it bigger than maxBytes.
244
byte[] newbuf = new byte[Math.min (buf.length * 2, maxBytes)];
245                     System.arraycopy (buf, 0, newbuf, 0, buf.length);
246                     buf = newbuf;
247                     buf[total++] = (byte) c;
248                 }
249             }
250         }
251         in.close ();
252         
253         if (total != buf.length) {
254             // resize the array to be precisely total bytes long
255
byte[] newbuf = new byte[total];
256             System.arraycopy (buf, 0, newbuf, 0, total);
257             buf = newbuf;
258         }
259  
260         contentBytes = buf;
261         content = new String JavaDoc (buf);
262         start = 0;
263         end = total;
264         contentLock = 1;
265
266         // parse the response
267
if (contentType == null
268             || contentType.startsWith ("text/html")
269             || contentType.startsWith ("content/unknown"))
270             parse (parser);
271     }
272
273     void downloadSafely () {
274       try {
275           download (new DownloadParameters (), new HTMLParser ());
276       } catch (Throwable JavaDoc e) {
277       }
278     }
279
280     //
281
// Parsing
282
//
283

284     /**
285      * Parse the page. Assumes the page has already been downloaded.
286      * @param parser HTML parser to use
287      * @exception RuntimeException if an error occurs in downloading the page
288      */

289     public void parse (HTMLParser parser) {
290         if (!hasContent())
291             downloadSafely ();
292         try {
293             parser.parse (this);
294         } catch (IOException JavaDoc e) {
295             throw new RuntimeException JavaDoc (e.toString());
296         }
297     }
298     
299     /**
300      * Test whether page has been parsed. Pages are parsed during
301      * download only if its MIME type is HTML or unspecified.
302      * @return true if page was parsed, false if not
303      */

304     public boolean isParsed () {
305         return tokens != null;
306     }
307
308     /**
309      * Test whether page is HTML.
310      * @return true if page is HTML.
311      */

312     public boolean isHTML () {
313         return root != null;
314     }
315
316     /**
317      * Test whether page is a GIF or JPEG image.
318      * @return true if page is a GIF or JPEG image, false if not
319      */

320     public boolean isImage () {
321         byte[] bytes = getContentBytes ();
322         return startsWith (bytes, GIF_MAGIC) || startsWith (bytes, JPG_MAGIC);
323     }
324
325     private static final byte[] GIF_MAGIC = {
326         (byte) 'G', (byte)'I', (byte)'F', (byte)'8'
327     };
328     private static final byte[] JPG_MAGIC = {
329         (byte) 0377, (byte) 0330, (byte) 0377,
330         (byte) 0340, (byte) 0, (byte) 020,
331         (byte) 'J', (byte) 'F', (byte) 'I', (byte) 'F'
332     };
333
334     private boolean startsWith (byte[] bytes, byte[] prefix) {
335         if (prefix.length > bytes.length)
336             return false;
337         for (int i = 0, n = prefix.length; i < n; ++i)
338             if (bytes[i] != prefix[i])
339                 return false;
340         return true;
341     }
342
343     //
344
// Content management
345
//
346

347     /**
348      * Lock the page's content (to prevent it from being discarded).
349      * This method increments a lock counter, representing all the
350      * callers interested in preserving the content. The lock
351      * counter is set to 1 when the page is initially downloaded.
352      */

353     public void keepContent () {
354         if (contentLock > 0)
355             ++contentLock;
356     }
357
358     /**
359      * Unlock the page's content (allowing it to be garbage-collected, to
360      * save space during a Web crawl). This method decrements a lock counter.
361      * If the counter falls to
362      * 0 (meaning no callers are interested in the content),
363      * the content is released. At least the following
364      * fields are discarded: content, tokens, tags, words, elements, and
365      * root. After the content has been discarded, calling getContent()
366      * (or getTokens(), getTags(), etc.) will force the page to be downloaded
367      * again. Hopefully the download will come from the cache, however.
368      * <P> Links are not considered part of the content, and are not subject to
369      * discarding by this method. Also, if the page was created from a string
370      * (rather than by downloading), its content is not subject to discarding
371      * (since there would be no way to recover it).
372      */

373     public void discardContent () {
374         if (contentLock == 0) // already discarded
375
return;
376             
377         if (--contentLock > 0) // somebody else still has a lock on the content
378
return;
379             
380         if (origin == null)
381             return; // without an origin, we'd have no way to recover this page
382

383         //System.err.println ("discarding content of " + toDescription());
384
contentBytes = null;
385         content = null;
386         tokens = null;
387         tags = null;
388         words = null;
389         elements = null;
390         root = null;
391         canonicalTags = null;
392
393         // keep links, but isolate them from the element tree
394
if (links != null) {
395             for (int i=0; i<links.length; ++i)
396                 if (links[i] instanceof Link)
397                     ((Link)links[i]).discardContent ();
398         }
399         
400         // FIX: debugging only: disconnect this page from its parent
401
//origin.page = null;
402
//origin = null;
403

404         contentLock = 0;
405     }
406
407     /**
408      * Test if page content is available.
409      * @return true if content is downloaded and available, false if content has not been downloaded
410      * or has been discarded.
411      */

412     public final boolean hasContent () {
413         return contentLock != 0;
414     }
415
416     //
417
// Page accessors
418
//
419

420     /**
421      * Get depth of page in crawl.
422      * @return depth of page from root (depth of page is same as depth of its originating link)
423      */

424     public int getDepth () {
425         return origin != null ? origin.getDepth () : 0;
426     }
427     
428     /**
429      * Get the Link that points to this page.
430      * @return the Link object that was used to download this page.
431      */

432     public Link getOrigin () {
433         return origin;
434     }
435
436     /**
437      * Get the base URL, relative to which the page's links were interpreted.
438      * The base URL defaults to the URL of the
439      * Link that was used to download the page. If any redirects occur
440      * while downloading the page, the final location becomes the new base
441      * URL. Lastly, if a <BASE> element is found in the page, that
442      * becomes the new base URL.
443      * @return the page's base URL.
444      */

445     public URL JavaDoc getBase () {
446         return base;
447     }
448
449     /**
450      * Get the URL.
451      * @return the URL of the link that was used to download this page
452      */

453     public URL JavaDoc getURL () {
454         return origin != null ? origin.getURL() : null;
455     }
456
457     /**
458      * Get the title of the page.
459      * @return the page's title, or null if the page hasn't been parsed.
460      */

461     public String JavaDoc getTitle () {
462         return title;
463     }
464
465     /**
466      * Get the content of the page as a String. May not work properly for
467      * binary data like images; use getContentBytes instead.
468      * @return the String content of the page.
469      */

470     public String JavaDoc getContent () {
471         if (!hasContent())
472             downloadSafely ();
473         return content;
474     }
475
476     /**
477      * Get the content of the page as an array of bytes.
478      * @return the content of the page in binary form.
479      */

480     public byte[] getContentBytes () {
481         if (!hasContent())
482             downloadSafely ();
483         return contentBytes;
484     }
485
486     /**
487      * Get the token sequence of the page. Tokens are tags and whitespace-delimited text.
488      * @return token regions in the page, or null if the page hasn't been downloaded or parsed.
489      */

490     public Region[] getTokens() {
491         if (!hasContent ())
492             downloadSafely ();
493         return tokens;
494     }
495
496     /**
497      * Get the tag sequence of the page.
498      * @return tags in the page, or null if the page hasn't been downloaded or parsed.
499      */

500     public Tag[] getTags () {
501         if (!hasContent ())
502             downloadSafely ();
503         return tags;
504     }
505
506     /**
507      * Get the words in the page. Words are whitespace- and tag-delimited text.
508      * @return words in the page, or null if the page hasn't been downloaded or parsed.
509      */

510     public Text[] getWords () {
511         if (!hasContent ())
512             downloadSafely ();
513         return words;
514     }
515
516     /**
517      * Get the HTML elements in the page. All elements in the page
518      * are included in the list, in the order they would appear in
519      * an inorder traversal of the HTML parse tree.
520      * @return HTML elements in the page ordered by inorder, or null if the page
521      * hasn't been downloaded or parsed.
522      */

523     public Element[] getElements () {
524         if (!hasContent ())
525             downloadSafely ();
526         return elements;
527     }
528     
529     /**
530      * Get the root HTML element of the page.
531      * @return first top-level HTML element in the page, or null
532      * if the page hasn't been downloaded or parsed.
533      */

534     public Element getRootElement () {
535         if (!hasContent ())
536             downloadSafely ();
537         return root;
538     }
539
540     /**
541      * Get the links found in the page.
542      * @return links in the page, or null
543      * if the page hasn't been downloaded or parsed.
544      */

545     public Link[] getLinks() {
546         return links;
547     }
548
549     /**
550      * Convert the link's URL to a String
551      * @return the URL represented as a string
552      */

553     public String JavaDoc toURL () {
554         return origin != null ? origin.toURL () : null;
555     }
556
557     /**
558      * Generate a human-readable description of the page.
559      * @return a description of the link, in the form "title [url]".
560      */

561     public String JavaDoc toDescription () {
562         return (title != null && title.length() > 0 ? title + " " : "") + "[" + getURL() + "]";
563     }
564
565     /**
566      * Get page containing the region.
567      * @return page containing the region
568      */

569     public String JavaDoc toString () {
570         return getContent ();
571     }
572
573     /**
574      * Get last-modified date of page.
575      * @return the date when the page was last modified, or 0 if not known.
576      * The value is number of seconds since January 1, 1970 GMT
577      */

578     public long getLastModified () {
579         return lastModified;
580     }
581     /**
582      * Set last-modified date of page.
583      * @param last the date when the page was last modified, or 0 if not known.
584      * The value is number of seconds since January 1, 1970 GMT
585      */

586     public void setLastModified (long last) {
587         lastModified = last;
588     }
589
590     /**
591      * Get expiration date of page.
592      * @return the expiration date of the page, or 0 if not known.
593      * The value is number of seconds since January 1, 1970 GMT.
594      */

595     public long getExpiration () {
596         return expiration;
597     }
598     /**
599      * Set expiration date of page.
600      * @param expire the expiration date of the page, or 0 if not known.
601      * The value is number of seconds since January 1, 1970 GMT.
602      */

603     public void setExpiration (long expire) {
604         expiration = expire;
605     }
606
607     /**
608      * Get MIME type of page.
609      * @return the MIME type of page, such as "text/html", or null if not known.
610      */

611     public String JavaDoc getContentType () {
612         return contentType;
613     }
614     /**
615      * Set MIME type of page.
616      * @param type the MIME type of page, such as "text/html", or null if not known.
617      */

618     public void setContentType (String JavaDoc type) {
619         contentType = type;
620     }
621
622     /**
623      * Get content encoding of page.
624      * @return the encoding type of page, such as "base-64", or null if not known.
625      */

626     public String JavaDoc getContentEncoding () {
627         return contentEncoding;
628     }
629     /**
630      * Set content encoding of page.
631      * @param encoding the encoding type of page, such as "base-64", or null if not known.
632      */

633     public void setContentEncoding (String JavaDoc encoding) {
634         contentEncoding = encoding;
635     }
636
637     /**
638      * Get response code returned by the Web server. For list of
639      * possible values, see java.net.HttpURLConnection.
640      * @return response code, such as 200 (for OK) or 404 (not found).
641      * Code is -1 if unknown.
642      * @see java.net.HttpURLConnection
643      */

644     public int getResponseCode () {
645         return responseCode;
646     }
647
648     /**
649      * Get response message returned by the Web server.
650      * @return response message, such as "OK" or "Not Found". The response message is null if the page failed to be fetched or not known.
651      */

652     public String JavaDoc getResponseMessage () {
653         return responseMessage;
654     }
655
656     /**
657      * Get raw content found in a region.
658      * @param start starting offset of region
659      * @param end ending offset of region
660      * @return raw HTML contained in the region
661      */

662     public String JavaDoc substringContent (int start, int end) {
663         return getContent ().substring (start, end);
664     }
665
666     /**
667      * Get HTML found in a region.
668      * @param start starting offset of region
669      * @param end ending offset of region
670      * @return representation of region as HTML
671      */

672     public String JavaDoc substringHTML (int start, int end) {
673         String JavaDoc s = getContent ().substring (start, end);
674         if (!isHTML ()) {
675             s = Str.replace (s, "&", "&amp;");
676             s = Str.replace (s, "<", "&lt;");
677             s = Str.replace (s, ">", "&gt;");
678             s = "<PRE>" + s + "</PRE>";
679         }
680         return s;
681     }
682
683     /**
684      * Get tagless text found in a region.
685      * Runs of whitespace and tags are reduced to a single space character.
686      * @param start starting offset of region
687      * @param end ending offset of region
688      * @return tagless text contained in the region
689      */

690     public String JavaDoc substringText (int start, int end) {
691         if (words == null)
692             return ""; // page is not parsed
693

694         // FIX: find some other mapping
695
StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
696         for (int j = findStart (words, start); j<words.length; ++j) {
697             if (words[j].end > end)
698                 break;
699             else {
700                 if (buf.length() > 0)
701                     buf.append (' ');
702                 buf.append (words[j].text);
703             }
704         }
705         return buf.toString();
706     }
707
708     /**
709      * Get HTML tags found in a region. Whitespace and text among the
710      * tags are deleted.
711      * @param start starting offset of region
712      * @param end ending offset of region
713      * @return tags contained in the region
714      */

715     public String JavaDoc substringTags (int start, int end) {
716         if (tags == null)
717             return ""; // page is not parsed
718

719         // FIX: find some other mapping
720
StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
721         for (int j = findStart (tags, start); j<tags.length; ++j) {
722             if (tags[j].end > end)
723                 break;
724             else {
725                 if (buf.length() > 0)
726                     buf.append (' ');
727                 buf.append (getContent ().substring (tags[j].start, tags[j].end));
728             }
729         }
730         return buf.toString();
731     }
732
733     /**
734      * Get canonicalized HTML tags found in a region.
735      * A canonicalized tag looks like the following:
736      * <PRE>
737      * &lt;tagname#index attr=value attr=value attr=value ...&gt
738      * <PRE>
739      * where tagname and attr are all lowercase, index is the tag's
740      * index in the page's tokens array. Attributes are sorted in
741      * increasing order by attribute name. Attributes without values
742      * omit the entire "=value" portion. Values are delimited by a
743      * space. All occurences of &lt, &gt, space, and % characters
744      * in a value are URL-encoded (e.g., space is converted to %20).
745      * Thus the only occurences of these characters in the canonical
746      * tag are the tag delimiters.
747      *
748      * <P>For example, raw HTML that looks like:
749      * <PRE>
750      * &lt;IMG SRC="http://foo.com/map&lt;&gt;.gif" ISMAP&gt;Image&lt;/IMG&gt;
751      * </PRE>
752      * would be canonicalized to:
753      * <PRE>
754      * &lt;img ismap SRC=http://foo.com/map%3C%3E.gif&gt;&lt;/img&gt;
755      * </PRE>
756      * <P>
757      * Comment and declaration tags (whose tag name is !) are omitted
758      * from the canonicalization.
759      *
760      * @param start starting offset of region
761      * @param end ending offset of region
762      * @return canonicalized tags contained in the region
763      */

764     public String JavaDoc substringCanonicalTags (int start, int end) {
765         if (tokens == null)
766             return ""; // page is not parsed
767

768         boolean all = (start == this.start && end == this.end);
769
770         if (all && canonicalTags != null)
771             return canonicalTags;
772
773         // FIX: find some other mapping
774
StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
775         for (int j = findStart (tokens, start); j<tokens.length; ++j) {
776             if (tokens[j].end > end)
777                 break;
778             else if (tokens[j] instanceof Tag)
779                 Tagexp.canonicalizeTag (buf, (Tag)tokens[j], j);
780         }
781
782         String JavaDoc result = buf.toString ();
783         if (all)
784             canonicalTags = result;
785         return result;
786     }
787
788     public static void main (String JavaDoc[] args) throws Exception JavaDoc {
789         int method = Link.GET;
790
791         for (int i=0; i<args.length; ++i) {
792             if (args[i].equals ("-post"))
793                 method = Link.POST;
794             else if (args[i].equals ("-get"))
795                 method = Link.GET;
796             else {
797                 Link link = method == Link.GET
798                              ? new Link (args[i])
799                              : new Link (args[i]); // FIX: POST?
800
try {
801                     Page p = new Page (link);
802                     System.out.write (p.getContentBytes ());
803                 } catch (IOException JavaDoc e) {
804                     System.out.println (e);
805                 }
806             }
807         }
808     }
809
810 }
811
Popular Tags