KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > websphinx > Link


/*
 * WebSphinx web-crawling toolkit
 *
 * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

32
33 package websphinx;
34
35 import java.util.Enumeration JavaDoc;
36 import java.io.File JavaDoc;
37 import java.net.URL JavaDoc;
38 import java.net.MalformedURLException JavaDoc;
39 import rcm.util.Prioritized;
40
41 /**
42  * Link to a Web page.
43  *
44  * @author Rob Miller
45  * @see Page
46  */

47 public class Link extends Element implements Prioritized {
48
49     protected URL JavaDoc url;
50     
51     private String JavaDoc directory;
52     private String JavaDoc filename;
53     private String JavaDoc query;
54     private String JavaDoc ref;
55     private Page page;
56     private int depth;
57     private String JavaDoc text = ""; // stored text of link anchor
58
private int status = LinkEvent.NONE;
59     private float priority;
60     private DownloadParameters dp;
61         // timeouts, etc. to use when downloading this link
62

/**
 * Make a Link from a start tag and end tag and a base URL (for relative references).
 * The tags must be on the same page. The new link's depth is one more than
 * the depth reported by the start tag's source.
 * @param startTag Start tag of element
 * @param endTag End tag of element
 * @param base Base URL used for relative references
 * @exception java.net.MalformedURLException if the tag's reference cannot be
 * turned into a URL
 */
public Link (Tag startTag, Tag endTag, URL base) throws MalformedURLException {
    super (startTag, endTag);
    this.url = urlFromHref (startTag, base);
    this.depth = startTag.getSource ().getDepth () + 1;
}
75
/**
 * Make a Link from a URL. The link is not backed by real markup: it is
 * attached to a placeholder start tag on an empty page, has no end tag,
 * and has depth 0.
 * @param url URL the link points to
 */
public Link (URL url) {
    super (new Tag (new Page (""), 0, 0, "", true), null);
    this.url = url;
    this.depth = 0;
}
84
/**
 * Make a Link from a File. The file is converted with FileToURL and the
 * result is handled exactly like a link made from a URL.
 * @param file local file the link points to
 * @exception java.net.MalformedURLException if the file's path cannot be
 * converted to a URL
 */
public Link (File file) throws MalformedURLException {
    this (FileToURL (file));
}
91
/**
 * Make a Link from a string URL. The resulting link has depth 0, which the
 * delegated URL constructor already sets.
 * @param href absolute URL in string form
 * @exception java.net.MalformedURLException if the URL is invalid
 */
public Link (String href) throws MalformedURLException {
    this (new URL (href));
}
100
101     /**
102      * Eliminate all references to page content.
103      */

104     public void discardContent () {
105         parent = null;
106         child = null;
107         sibling = null;
108     }
109
110     /**
111      * Disconnect this link from its downloaded page (throwing away the page).
112      */

113     public void disconnect () {
114         page = null;
115         status = LinkEvent.NONE;
116     }
117
118     /**
119      * Get depth of link in crawl.
120      * @return depth of link from root (depth of roots is 0)
121      */

122     public int getDepth () {
123         return depth;
124     }
125     
126     /**
127      * Get the URL.
128      * @return the URL of the link
129      */

130     public URL JavaDoc getURL () {
131         return url;
132     }
133
134     /**
135      * Get the network protocol of the link, like "ftp" or "http".
136      * @return the protocol portion of the link's URL
137      */

138     public String JavaDoc getProtocol () {
139         return getURL().getProtocol ();
140     }
141
142     /**
143      * Get the hostname of the link, like "www.cs.cmu.edu".
144      * @return the hostname portion of the link's URL
145      */

146     public String JavaDoc getHost () {
147         return getURL().getHost ();
148     }
149
150     /**
151      * Get the port number of the link.
152      * @return the port number of the link's URL, or -1 if no port number
153      * is explicitly specified in the URL
154      */

155     public int getPort () {
156         return getURL().getPort ();
157     }
158
159     /**
160      * Get the filename part of the link, which includes the pathname
161      * and query but not the anchor reference.
162      * Equivalent to getURL().getFile().
163      * @return the filename portion of the link's URL
164      */

165     public String JavaDoc getFile () {
166         return getURL().getFile ();
167     }
168
169     /**
170      * Get the directory part of the link, like "/home/dir/".
171      * Always starts and ends with '/'.
172      * @return the directory portion of the link's URL
173      */

174     public String JavaDoc getDirectory () {
175         if (directory == null)
176             parseURL ();
177         return directory;
178     }
179
180     /**
181      * Get the filename part of the link, like "index.html".
182      * Never contains '/'; may be the empty string.
183      * @return the filename portion of the link's URL
184      */

185     public String JavaDoc getFilename () {
186         if (filename == null)
187             parseURL ();
188         return filename;
189     }
190
191     /**
192      * Get the query part of the link.
193      * Either starts with a '?', or is empty.
194      * @return the query portion of the link's URL
195      */

196     public String JavaDoc getQuery () {
197         if (query == null)
198             parseURL ();
199         return query;
200     }
201
202     /**
203      * Get the anchor reference of the link, like "#ref".
204      * Either starts with '#', or is empty.
205      * @return the anchor reference portion of the link's URL
206      */

207     public String JavaDoc getRef () {
208         if (ref == null)
209             parseURL ();
210         return ref;
211     }
212
213     /**
214      * Get the URL of a page, omitting any anchor reference (like #ref).
215      * @return the URL sans anchor reference
216      */

217     public URL JavaDoc getPageURL () {
218         return getPageURL (getURL());
219     }
220
221     /**
222      * Get the URL of a page, omitting any anchor reference (like #ref).
223      * @return the URL sans anchor reference
224      */

225     public static URL JavaDoc getPageURL (URL JavaDoc url) {
226         String JavaDoc href = url.toExternalForm ();
227         int i = href.indexOf ('#');
228         try {
229             return (i != -1) ? new URL JavaDoc(href.substring (0, i)) : url;
230         } catch (MalformedURLException JavaDoc e) {
231             return url;
232         }
233     }
234
235     /**
236      * Get the URL of a Web service, omitting any query or anchor reference.
237      * @return the URL sans query and anchor reference
238      */

239     public URL JavaDoc getServiceURL () {
240         return getServiceURL (getURL());
241     }
242     
243     
244     /**
245      * Get the URL of a Web service, omitting any query or anchor reference.
246      * @return the URL sans query and anchor reference
247      */

248     public static URL JavaDoc getServiceURL (URL JavaDoc url) {
249         String JavaDoc href = url.toExternalForm ();
250         int i = href.indexOf ('?');
251         try {
252             return (i != -1 && url.getProtocol().equals ("http"))
253                 ? new URL JavaDoc(href.substring (0, i))
254                 : getPageURL(url);
255         } catch (MalformedURLException JavaDoc e) {
256             return url;
257         }
258     }
259
260     /**
261      * Get the URL of a page's directory.
262      * @return the URL sans filename, query and anchor reference
263      */

264     public URL JavaDoc getDirectoryURL () {
265         return getDirectoryURL (getURL());
266     }
267     
268     
269     /**
270      * Get the URL of a page's directory.
271      * @return the URL sans filename, query and anchor reference
272      */

273     public static URL JavaDoc getDirectoryURL (URL JavaDoc url) {
274         String JavaDoc file = url.getFile();
275         int qmark = file.indexOf ('?');
276         if (qmark == -1 || !url.getProtocol().equals ("http"))
277             qmark = file.length();
278         // find pivotal separator (between directory and filename)
279
int pivot = file.lastIndexOf ('/', Math.max(qmark-1, 0));
280         try {
281             if (pivot == -1)
282                 return new URL JavaDoc (url, "/");
283             else if (pivot == file.length()-1)
284                 return url;
285             else
286                 return new URL JavaDoc (url, file.substring (0, pivot+1));
287         } catch (MalformedURLException JavaDoc e) {
288             return url;
289         }
290     }
291
292     /**
293      * Get the URL of a page's parent directory.
294      * @return the URL sans filename, query and anchor reference
295      */

296     public URL JavaDoc getParentURL () {
297         return getParentURL (getURL());
298     }
299     
300     
301     /**
302      * Get the URL of a page's parent directory.
303      * @return the URL sans filename, query and anchor reference
304      */

305     public static URL JavaDoc getParentURL (URL JavaDoc url) {
306         URL JavaDoc dirURL = getDirectoryURL (url);
307         if (!dirURL.equals (url))
308             return dirURL;
309
310         String JavaDoc dir = dirURL.getFile ();
311         int lastSlash = dir.length()-1;
312         if (lastSlash == 0)
313             return dirURL;
314             
315         int penultSlash = dir.lastIndexOf ('/', lastSlash-1);
316
317         if (penultSlash == -1)
318             return dirURL;
319
320         try {
321             return new URL JavaDoc (url, dir.substring (0, penultSlash+1));
322         } catch (MalformedURLException JavaDoc e) {
323             return dirURL;
324         }
325     }
326     
327     // computes relative HREF for URL <there> when the current location
328
// is URL <here>
329
public static String JavaDoc relativeTo (URL JavaDoc here, URL JavaDoc there) {
330         if (here == null)
331             return there.toString();
332         //System.err.println ("From: " + here);
333
//System.err.println ("To: " + there);
334
if (here.getProtocol().equals (there.getProtocol())
335             && here.getHost().equals (there.getHost ())
336             && here.getPort() == there.getPort ()) {
337             String JavaDoc fn = relativeTo (here.getFile (),
338                                     there.getFile ());
339             String JavaDoc ref = there.getRef ();
340             return (ref != null) ? fn+ref : fn;
341         }
342         else {
343           //System.err.println ("Use: " + there);
344
return there.toString ();
345         }
346     }
347
348     // computes relative HREF for URL <there> when the current location
349
// is URL <here>
350
public static String JavaDoc relativeTo (URL JavaDoc here, String JavaDoc there) {
351         if (here == null)
352             return there;
353       try {
354         return relativeTo (here, new URL JavaDoc (here, there));
355       } catch (MalformedURLException JavaDoc e) {
356         return there;
357       }
358     }
359
360     // computes relative HREF for filename <there> when the current location
361
// is filename <here>
362
private static String JavaDoc relativeTo (String JavaDoc here, String JavaDoc there) {
363         StringBuffer JavaDoc result = new StringBuffer JavaDoc ();
364
365         int lcp = 0;
366
367         while (true) {
368             int i = here.indexOf ('/', lcp);
369             int j = there.indexOf ('/', lcp);
370
371             if (i == -1 || i != j || !here.regionMatches (lcp, there, lcp, i-lcp))
372                 break;
373             lcp = i+1;
374         }
375
376         // assert: first lcp characters of here and there are identical
377
// and (lcp==0 or here[lcp-1] == '/')
378

379         // here[0..lcp-1] is the common ancestor directory of here and there
380

381         // count hops up from here to the common ancestor
382
for (int i = here.indexOf ('/', lcp);
383              i != -1;
384              i = here.indexOf ('/', i+1)) {
385             result.append ("..");
386             result.append ('/');
387         }
388
389         // append path down from common ancestor to there
390
result.append (there.substring (lcp));
391
392         //System.out.println ("Use: " + result);
393
//System.out.println ();
394

395         return result.toString ();
396     }
397
398     /**
399      * Convert a local filename to a URL.
400      * For example, if the filename is "C:\FOO\BAR\BAZ",
401      * the resulting URL is "file:/C:/FOO/BAR/BAZ".
402      * @param file File to convert
403      * @return URL corresponding to file
404      */

405     public static URL JavaDoc FileToURL (File JavaDoc file) throws MalformedURLException JavaDoc {
406         return new URL JavaDoc ("file:" + toURLDelimiters (file.getAbsolutePath ()));
407     }
408     
409     /**
410      * Convert a file: URL to a filename appropriate to the
411      * current system platform. For example, on MS Windows,
412      * if the URL is "file:/FOO/BAR/BAZ", the resulting
413      * filename is "\FOO\BAR\BAZ".
414      * @param url URL to convert
415      * @return File corresponding to url
416      * @exception MalformedURLException if url is not a
417      * file: URL.
418      */

419     public static File JavaDoc URLToFile (URL JavaDoc url) throws MalformedURLException JavaDoc {
420         if (!url.getProtocol().equals ("file"))
421             throw new MalformedURLException JavaDoc ();
422             
423         String JavaDoc path = url.getFile ();
424         path = path.replace ('/', File.separatorChar);
425         // for MSWindows: change pathnames of the
426
// form /X:/ to X:/
427
if (path.length () > 3
428             && path.charAt (0) == File.separatorChar
429             && path.charAt(2) == ':'
430             && path.charAt (3) == File.separatorChar)
431             path = path.substring (1);
432             
433         return new File JavaDoc (path);
434     }
435     
436     public static String JavaDoc toURLDelimiters (String JavaDoc path) {
437         path = path.replace ('\\', '/');
438         if (!path.startsWith ("/"))
439             path = "/" + path;
440         return path;
441     }
442
443     /**
444      * Get the downloaded page to which the link points.
445      * @return the Page object, or null if the page hasn't been downloaded.
446      */

447     public Page getPage () {
448         return page;
449     }
450     /**
451      * Set the page corresponding to this link.
452      * @param page Page to which this link points
453      */

454     public void setPage (Page page) {
455         this.page = page;
456     }
457
458     /**
459      * Use the HTTP GET method to download this link.
460      */

461     public static final int GET = 0;
462     /**
463      * Use the HTTP POST method to access this link.
464      */

465     public static final int POST = 1;
466
467     /**
468      * Get the method used to access this link.
469      * @return GET or POST.
470      */

471     public int getMethod () {
472         return GET;
473     }
474
475     /**
476      * Convert the link's URL to a String
477      * @return the URL represented as a string
478      */

479     public String JavaDoc toURL () {
480         return getURL().toExternalForm ();
481     }
482
483     /**
484      * Generate a human-readable description of the link.
485      * @return a description of the link, in the form "[url]".
486      */

487     public String JavaDoc toDescription () {
488         return (text.length() > 0 ? text + " " : "") + "[" + getURL() + "]";
489     }
490
491     /**
492      * Convert the region to tagless text.
493      * @return a string consisting of the text in the page contained by this region
494      */

495     public String JavaDoc toText () {
496         return text;
497     }
498     
499     /**
500      * Set the tagless-text representation of this region.
501      * @param text a string consisting of the text in the page contained by this region
502      */

503     public void setText (String JavaDoc text) {
504         this.text = text;
505     }
506
507     private void parseURL () {
508         String JavaDoc protocol = getProtocol();
509         String JavaDoc file = getFile();
510         
511         int qmark = file.indexOf ('?');
512         if (qmark == -1 || !protocol.equals ("http")) {
513             query = "";
514             qmark = file.length();
515         }
516         else {
517             query = file.substring (qmark+1);
518             file = file.substring (0, qmark);
519         }
520     
521         int slash = file.lastIndexOf ('/', Math.max(qmark-1, 0));
522         if (slash == -1) {
523             directory = "";
524             filename = file;
525         }
526         else {
527             directory = file.substring (0, slash+1);
528             filename = file.substring (slash+1);
529         }
530
531         ref = getURL().getRef ();
532         if (ref == null)
533             ref = "";
534     }
535
536     /**
537      * Construct the URL for a link element, from its start tag and a base URL (for relative references).
538      * @param tag Start tag of link, such as &lt;A HREF="../../foo/index.html"&gt;.
539      * @param base Base URL used for relative references
540      * @return URL to which the link points
541      */

542     protected URL JavaDoc urlFromHref (Tag tag, URL JavaDoc base) throws MalformedURLException JavaDoc {
543         // element is a link -- make an instance of Link.
544
String JavaDoc hrefAttr = getHrefAttributeName (tag);
545         String JavaDoc href = tag.getHTMLAttribute (hrefAttr);
546         if (tag.tagName == Tag.APPLET) {
547             String JavaDoc codebase = tag.getHTMLAttribute ("codebase");
548             if (codebase != null)
549                 base = new URL JavaDoc (base, codebase);
550         }
551         return new URL JavaDoc (base, href);
552     }
553
554     /**
555      * Copy the link's start tag, replacing the URL. Note that the name of the attribute containing the URL
556      * varies from tag to tag: sometimes it is called HREF, sometimes SRC, sometimes CODE, etc.
557      * This method changes the appropriate attribute for this tag.
558      * @param newHref New URL or relative reference; e.g. "http://www.cs.cmu.edu/" or "/foo/index.html".
559      * @return copy of this link's start tag with its URL attribute replaced. The copy is
560      * a region of a fresh page containing only the tag.
561      */

562     public Tag replaceHref (String JavaDoc newHref) {
563         Tag tag = startTag;
564         
565         if (tag.getTagName() == Tag.APPLET) {
566             int i = newHref.lastIndexOf ('/');
567             if (i != -1) {
568                 tag = startTag.replaceHTMLAttribute ("codebase", newHref.substring (0, i+1));
569                 newHref = newHref.substring (i+1);
570             }
571         }
572         String JavaDoc hrefAttrName = getHrefAttributeName (tag);
573         if (hrefAttrName == null)
574             return tag;
575         return tag.replaceHTMLAttribute (hrefAttrName, newHref);
576     }
577     
578     private static String JavaDoc getHrefAttributeName (Tag tag) {
579         return (String JavaDoc)HTMLParser.linktag.get (tag.getTagName ());
580     }
581
582     /**
583      * Get the status of the link. Possible values are defined in LinkEvent.
584      * @return last event that happened to this link
585      */

586     public int getStatus () {
587         return status;
588     }
589
590     /**
591      * Set the status of the link. Possible values are defined in LinkEvent.
592      * @param event the event that just happened to this link
593      */

594     public void setStatus (int event) {
595         status = event;
596     }
597     
598     /**
599      * Get the priority of the link in the crawl.
600      */

601     public float getPriority () {
602         return priority;
603     }
604
605     /**
606      * Set the priority of the link in the crawl.
607      */

608     public void setPriority (float priority) {
609         this.priority = priority;
610     }
611     
612     /**
613      * Get the download parameters used for this link. Default is null.
614      */

615     public DownloadParameters getDownloadParameters () {
616         return dp;
617     }
618
619     /**
620      * Set the download parameters used for this link.
621      */

622     public void setDownloadParameters (DownloadParameters dp) {
623         this.dp = dp;
624     }
625
626     /*
627      * Testing
628      *
629      
630   public static void main (String[] args) throws Exception {
631     if (args[0].equals ("file"))
632       System.out.println (Link.FileToURL (new File (args[1])));
633     else if (args[0].equals ("url"))
634       System.out.println (Link.URLToFile (new URL (args[1])));
635   }
636      *
637      *
638      */

639 }
640
Popular Tags