KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > websphinx > LinkTransformer


1 /*
2  * WebSphinx web-crawling toolkit
3  *
4  * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
5  * reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  *
14  * 2. Redistributions in binary form must reproduce the above copyright
15  * notice, this list of conditions and the following disclaimer in
16  * the documentation and/or other materials provided with the
17  * distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
20  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
23  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  */

32
33 package websphinx;
34
35 import java.io.*;
36 import java.net.URL JavaDoc;
37 import java.net.MalformedURLException JavaDoc;
38 import java.util.Hashtable JavaDoc;
39
40 /**
41  * Transformer that remaps URLs in links.
42  * <P>
43  * The default LinkTransformer simply converts all links
44  * to absolute URLs. Other common effects are easy to
45  * achieve:
46  * <UL>
47  * <LI>To make all links relative to a base URL, use
48  * setBase() to set a base URL.
49  * <LI>To replace certain URLs with different ones,
50  * use map() to set up the mappings.
51  * </UL>
52  * The default LinkTransformer strips out &lt;BASE&gt;
53  * elements. Instead, it can output a &lt;BASE&gt;
54  * element with a user-specified URL. Use setBase() to set
55  * the URL and setEmitBaseElement() to indicate that it
56  * should be emitted.
57  */

58 public class LinkTransformer extends HTMLTransformer {
59     protected Hashtable JavaDoc map;
60     protected URL JavaDoc base = null;
61     boolean emitBaseElement = false;
62     
63     boolean needToEmitBase = false;
64
65     /**
66      * Make a LinkTransformer writing to a file.
67      * @param filename Filename to write to
68      */

69     public LinkTransformer (String JavaDoc filename) throws IOException {
70         super (filename);
71     }
72
73     /**
74      * Make a LinkTransformer that writes pages to a
75      * file.
76      * @param filename Name of file to receive HTML output
77      * @param seekable True if file should be opened for random access
78      */

79     public LinkTransformer (String JavaDoc filename, boolean seekable) throws IOException {
80         super (filename, seekable);
81     }
82     
83     /**
84      * Make a LinkTransformer writing to a stream.
85      * @param out stream to write to
86      */

87     public LinkTransformer (OutputStream out) {
88         super (out);
89     }
90
91     /**
92      * Make a LinkTransformer writing to another HTMLTransformer
93      * @param next next transformer in filter chain
94      */

95     public LinkTransformer (HTMLTransformer next) {
96         super (next);
97     }
98
99     /**
100      * Get the base URL used by the LinkTransformer.
101      * A transformed link's URL is written out relative
102      * to this URL. For instance, if the base URL is
103      * http://www.yahoo.com/Entertainment/, then a link
104      * URL http://www.yahoo.com/News/Current/
105      * would be written out as ../News/Current/.
106      * @return base URL, or null if no base URL is set. Default is null.
107      */

108     public URL JavaDoc getBase () {
109         return base;
110     }
111
112     /**
113      * Set the base URL used by the LinkTransformer.
114      * A transformed link's URL is written out relative
115      * to this URL. For instance, if the base URL is
116      * http://www.yahoo.com/Entertainment/, then a link
117      * URL http://www.yahoo.com/News/Current/
118      * would be written out as ../News/Current/.
119      * @param base base URL, or null if no base URL should be used.
120      */

121     public synchronized void setBase (URL JavaDoc base) {
122         this.base = base;
123     }
124
125     /**
126      * Test whether the LinkTransformer should emit a
127      * &lt;BASE&gt; element pointing to the base URL.
128      * @return true if a &lt;BASE&gt; element should be
129      * emitted with each page.
130      */

131     public boolean getEmitBaseElement () {
132         return emitBaseElement;
133     }
134
135     /**
136      * Set whether the LinkTransformer should emit a
137      * &lt;BASE&gt; element pointing to the base URL.
138      * @param emitBase true if a &lt;BASE&gt; element should be
139      * emitted with each page.
140      */

141     public synchronized void setEmitBaseElement (boolean emitBase) {
142         emitBaseElement = emitBase;
143     }
144
145     /**
146      * Look up the href for a URL, taking any mapping
147      * into account.
148      * @param base base URL (or null if an absolute URL is desired)
149      * @param url URL of interest
150      * @return relative href for url from base
151      */

152     public String JavaDoc lookup (URL JavaDoc base, URL JavaDoc url) {
153         if (map != null) {
154             Object JavaDoc obj = map.get (url);
155             if (obj instanceof URL JavaDoc)
156                 return base != null
157                     ? Link.relativeTo (base, (URL JavaDoc)obj)
158                     : obj.toString ();
159             else if (obj instanceof String JavaDoc)
160                 return base != null
161                     ? Link.relativeTo (base, (String JavaDoc)obj)
162                     : obj.toString ();
163         }
164
165         return base != null
166             ? Link.relativeTo (base, url)
167             : url.toString ();
168     }
169
170     /**
171      * Map a URL to an href. For example, Concatenator
172      * uses this call to map page URLs to their corresponding
173      * anchors in the concatenation.
174      * @param url URL of interest
175      * @param href href which should be returned by lookup (null, url)
176      */

177     public synchronized void map (URL JavaDoc url, String JavaDoc href) {
178         if (map == null)
179             map = new Hashtable JavaDoc ();
180         map.put (url, href);
181     }
182
183     /**
184      * Map a URL to a new URL. For example, Mirror
185      * uses this call to map remote URLs to their corresponding
186      * local URLs.
187      * @param url URL of interest
188      * @param newURL URL which should be returned by lookup (null, url)
189      */

190     public synchronized void map (URL JavaDoc url, URL JavaDoc newURL) {
191         if (map == null)
192             map = new Hashtable JavaDoc ();
193         map.put (url, newURL);
194     }
195
196     /**
197      * Test whether a URL is mapped.
198      * @param url URL of interest
199      * @return true if map () was called to remap url
200      */

201     public boolean isMapped (URL JavaDoc url) {
202         return map != null && map.containsKey (url);
203     }
204
205     /**
206      * Write a page through the transformer. If
207      * getEmitBaseElement() is true and getBase() is
208      * non-null, then the transformer
209      * outputs a &lt;BASE&gt; element either inside the
210      * page's &lt;HEAD&gt; element (if present) or before
211      * the first tag that belongs in &lt;BODY&gt;.
212      * @param page Page to write
213      */

214     public synchronized void writePage (Page page) throws IOException {
215         needToEmitBase = emitBaseElement && base != null;
216         super.writePage (page);
217         needToEmitBase = false;
218     }
219
220     /**
221      * Handle an element written through the transformer.
222      * Remaps attributes that contain URLs.
223      * @param elem Element to transform
224      */

225     protected void handleElement (Element elem) throws IOException {
226         Tag tag = elem.getStartTag ();
227         String JavaDoc tagName = elem.getTagName ();
228
229         if (needToEmitBase && tag.isBodyTag ()) {
230             emit ("<BASE HREF=\"" + base.toString () + "\">");
231             needToEmitBase = false;
232         }
233
234         if (elem instanceof Link)
235             handleLink ((Link)elem);
236         else if (tagName == Tag.BASE)
237             handleBase (elem);
238         else if (needToEmitBase && tagName == Tag.HEAD) {
239             // put BASE at the end of HEAD, if we don't find it earlier
240
emit (elem.getStartTag ());
241             transformContents (elem);
242             if (needToEmitBase) {
243                 emit ("<BASE HREF=\"" + base.toString () + "\">");
244                 needToEmitBase = false;
245             }
246             if (elem.getEndTag () != null)
247                 emit (elem.getEndTag ());
248         }
249         else
250             super.handleElement (elem);
251     }
252     
253     /**
254      * Handle a Link's transformation.
255      * Default implementation replaces the link's URL
256      * with lookup(URL).
257      * @param link Link to transform
258      */

259     protected void handleLink (Link link) throws IOException {
260         emit ( link.replaceHref (lookup (base, link.getURL())) );
261         transformContents (link);
262         if (link.getEndTag () != null)
263            emit (link.getEndTag ());
264     }
265
266     /**
267      * Handle the BASE element.
268      * Default implementation removes if if EmitBaseElement
269      * is false, or changes its URL to Base if EmitBaseElement
270      * is true.
271      * @param elem BASE element to transform
272      */

273     protected void handleBase (Element elem) throws IOException {
274         Tag tag = elem.getStartTag ();
275         if (needToEmitBase) {
276             emit (tag.replaceHTMLAttribute ("href", base.toString()));
277             needToEmitBase = false;
278         }
279         else if (tag.hasHTMLAttribute ("href")
280                 && tag.countHTMLAttributes () > 1)
281             // tag has other attributes that we want to preserve
282
emit (tag.removeHTMLAttribute ("href"));
283         // otherwise skip the BASE element
284
}
285     
286     /*
287      * Testing
288      *
289     public static void main (String[] args) throws Exception {
290         OutputStream out = (args.length >= 2)
291             ? (OutputStream)new java.io.FileOutputStream (args[1])
292             : (OutputStream)System.out;
293         HTMLTransformer unparser = new LinkTransformer (out);
294
295         Link link = new Link (args[0]);
296         Page page = new Page (link);
297
298         unparser.write (page);
299         unparser.close ();
300     }
301      */

302
303 }
304
Popular Tags