KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > extractor > Link


/* Link
 *
 * $Id: Link.java,v 1.5.4.1 2007/01/13 01:31:17 stack-sf Exp $
 *
 * Created on Mar 7, 2005
 *
 * Copyright (C) 2005 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.archive.crawler.extractor;

import java.io.Serializable;

/**
 * Link represents one discovered "edge" of the web graph: the source
 * URI, the destination URI, and the type of reference (represented by the
 * context in which it was found).
 *
 * As such, it is a suitably generic item to be returned from generic
 * link-extraction utility code.
 *
 * Instances expose no setters; all four fields are assigned once in the
 * constructor.
 *
 * @author gojomo
 */
public class Link implements Serializable {

    private static final long serialVersionUID = 7660959085498739376L;

    /* contexts for when another syntax (XPath-like or header-based)
     * is unavailable */

    /** stand-in value for embeds without other context */
    public static final String EMBED_MISC = "=EMBED_MISC".intern();
    /** stand-in value for js-discovered urls without other context */
    public static final String JS_MISC = "=JS_MISC".intern();
    /** stand-in value for navlink urls without other context */
    public static final String NAVLINK_MISC = "=NAVLINK_MISC".intern();
    /** stand-in value for speculative/aggressively extracted urls without other context */
    public static final String SPECULATIVE_MISC = "=SPECULATIVE_MISC".intern();
    /** stand-in value for prerequisite without other context */
    public static final String PREREQ_MISC = "=PREREQ_MISC".intern();

    /* hop types */

    /** navigation links, like A/@HREF */
    public static final char NAVLINK_HOP = 'L'; // TODO: change to 'N' to avoid 'L'ink confusion?
    /** implied prerequisite links, like dns or robots */
    public static final char PREREQ_HOP = 'P';
    /** embedded links necessary to render the page, like IMG/@SRC */
    public static final char EMBED_HOP = 'E';
    /** speculative/aggressively extracted links, perhaps embed or nav, as in javascript */
    public static final char SPECULATIVE_HOP = 'X';
    /** referral/redirect links, like header 'Location:' on a 301/302 response */
    public static final char REFER_HOP = 'R';

    /** URI where this Link was discovered */
    private final CharSequence source;
    /** URI (absolute) where this Link points */
    private final CharSequence destination;
    /** context of discovery -- will be an XPath-like element[/@attribute]
     * fragment for HTML URIs, a header name with trailing ':' for header
     * values, or one of the stand-in constants when other context is
     * unavailable */
    private final CharSequence context;
    /** hop-type, as character abbreviation */
    private final char hopType;

    /**
     * Create a Link with the given fields. The arguments are stored as
     * given; no validation or copying is performed.
     *
     * @param source URI where the link was discovered
     * @param destination URI the link points to
     * @param context context of discovery (see {@link #elementContext} and
     *        the {@code *_MISC} stand-in constants)
     * @param hopType hop-type character, e.g. one of the {@code *_HOP}
     *        constants
     */
    public Link(CharSequence source, CharSequence destination,
            CharSequence context, char hopType) {
        this.source = source;
        this.destination = destination;
        this.context = context;
        this.hopType = hopType;
    }

    /**
     * @return Returns the context.
     */
    public CharSequence getContext() {
        return context;
    }

    /**
     * @return Returns the destination.
     */
    public CharSequence getDestination() {
        return destination;
    }

    /**
     * @return Returns the source.
     */
    public CharSequence getSource() {
        return source;
    }

    /**
     * @return char hopType
     */
    public char getHopType() {
        return hopType;
    }

    /**
     * Create a suitable XPath-like context from an element name and optional
     * attribute name.
     *
     * When {@code attribute} is non-null the result is
     * {@code element + "/@" + attribute}. When {@code attribute} is
     * {@code null} the result is the empty string, i.e. the element name is
     * not included.
     * NOTE(review): dropping the element for a null attribute looks
     * surprising but is preserved here for compatibility -- confirm intent.
     *
     * @param element element name
     * @param attribute attribute name, or {@code null} if there is none
     * @return CharSequence context
     */
    public static CharSequence elementContext(CharSequence element, CharSequence attribute) {
        if (attribute == null) {
            return "";
        }
        return element + "/@" + attribute;
    }

    /**
     * @return the destination, hop-type character and context, separated by
     *         single spaces
     */
    @Override
    public String toString() {
        return this.destination + " " + this.hopType + " " + this.context;
    }
}
140
Popular Tags