KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > matuschek > spider > docfilter > LinkLocalizer


1 package net.matuschek.spider.docfilter;
2
3 /************************************************
4     Copyright (c) 2001/2002 by Daniel Matuschek
5 *************************************************/

6
7
8 import java.io.ByteArrayInputStream JavaDoc;
9 import java.io.ByteArrayOutputStream JavaDoc;
10 import java.io.PrintWriter JavaDoc;
11 import java.net.MalformedURLException JavaDoc;
12 import java.net.URL JavaDoc;
13 import java.util.StringTokenizer JavaDoc;
14
15 import net.matuschek.http.HttpDoc;
16 import net.matuschek.util.NullWriter;
17
18 import org.w3c.dom.Node JavaDoc;
19 import org.w3c.dom.Element JavaDoc;
20 import org.w3c.dom.NodeList JavaDoc;
21 import org.w3c.dom.Document JavaDoc;
22
23 import org.w3c.tidy.Tidy;
24
25
26 /**
27  * Localizer tries to replace absolute links by relative links
28  * and should allow offline browsing.
29  *
30  * It uses JTidy to parse the file.
31  *
32  * @author Daniel Matuschek
33  * @version $Revision: 1.11 $
34  */

35 public class LinkLocalizer implements DocumentFilter
36 {
37   /** processing enabled ? */
38   protected boolean enabled=true;
39
40   /**
41    * This method processes the file and will replace
42    * absolute links by relative.
43    *
44    * @return the old document, if the ContentType is not
45    * text/html, a new (localized) document otherwise.
46    */

47   public HttpDoc process(HttpDoc input)
48     throws FilterException
49   {
50     if (input == null) {
51       return null;
52     }
53
54     if (! input.isHTML()) {
55       return input;
56     }
57
58     if (! enabled) {
59       return input;
60     }
61
62     // okay, parse the HTML code
63
ByteArrayInputStream JavaDoc bis = new ByteArrayInputStream JavaDoc(input.getContent());
64     Tidy tidy = new Tidy();
65     tidy.setUpperCaseTags(false);
66     tidy.setUpperCaseAttrs(false);
67     tidy.setErrout(new PrintWriter JavaDoc(new NullWriter()));
68
69     Document JavaDoc doc = tidy.parseDOM(bis,null);
70
71     rewriteDOM(doc,input.getURL());
72
73     ByteArrayOutputStream JavaDoc bos = new ByteArrayOutputStream JavaDoc();
74     tidy.pprint(doc,bos);
75
76     input.setContent(bos.toByteArray());
77     
78     return input;
79   }
80
81
82   /**
83    * Enable processing, the will parse the document and try to
84    * replace absolute by relative links.
85    */

86   public void enable() {
87     this.enabled=true;
88   }
89
90
91   /**
92    * Disable processing, the filter will not change the document
93    * content.
94    */

95   public void disable() {
96     this.enabled=false;
97   }
98
99
100   /**
101    * Is the link processing enabled ?
102    *
103    * @return true, if the filter processes links, false otherwise
104    */

105   public boolean isEnabled() {
106     return this.enabled;
107   }
108
109
110   /**
111    * Rewrite this DOM with relative URLs. Will process the whole DOM
112    *
113    * @param node root node of the DOM to modify
114    * @param url base URL of teh document itself (for relative addressing)
115    */

116   private void rewriteDOM(Node JavaDoc node, URL JavaDoc url)
117     throws FilterException
118   {
119
120     // this should not happen !
121
if (node==null) {
122       throw new FilterException("Got a null node");
123     }
124
125     // ELEMENT ?
126
if (node instanceof Element JavaDoc) {
127       String JavaDoc name = node.getNodeName();
128       if (name.equals("a")
129       || name.equals("area")) {
130     localizeAttrib(node,"href",url);
131
132       } else if (name.equals("img")
133          || name.equals("frame")) {
134     localizeAttrib(node,"src",url);
135
136       }
137     }
138
139     // recursive travel through all childs
140
NodeList JavaDoc childs = node.getChildNodes();
141
142     for (int i=0; i<childs.getLength(); i++) {
143       rewriteDOM(childs.item(i),url);
144     }
145     
146
147   }
148
149
150   /**
151    * Localize a given attribute for a Element. <br />
152    * Thanks to Paul Tan for the feedback
153    *
154    * @param node an element node that should be localized
155    * @param attribute name of the attribute that should be localized
156    * @param context an URL that is the context for relative
157    * addressing (base address)
158    */

159   private void localizeAttrib(Node JavaDoc node,
160                   String JavaDoc attribute,
161                   URL JavaDoc context)
162   {
163     Element JavaDoc el = (Element JavaDoc)node;
164     String JavaDoc oldValue = el.getAttribute(attribute);
165
166     // only localize if the attribute exists
167
// only localize if the file is in another directory
168
if (!oldValue.equals("") && oldValue.indexOf("/")!=-1) {
169       String JavaDoc newValue = localizeURL(oldValue,context);
170       el.setAttribute(attribute, newValue);
171     } // end of if ()
172

173   }
174
175
176
177   /**
178    * Localize a given URL.
179    *
180    * Thanks to Paul Tan and Laurent Salinas for the feedback.
181    *
182    * @param urlStr a String containing a URL, can be relative
183    * (e.g. ../index.html) or absolute ("http://myserver/")
184    * @param context an URL that a the context URL for relative URLs
185    *
186    * @return a String containing an URL that will be relative to the given
187    * context if both URLs are on the same host, otherwise it will simply
188    * return urlStr
189    */

190   private String JavaDoc localizeURL(String JavaDoc urlStr, URL JavaDoc context) {
191     URL JavaDoc url;
192     try {
193       url = new URL JavaDoc(context, urlStr);
194     } catch (MalformedURLException JavaDoc e) {
195       return urlStr;
196     }
197
198     // only localize "http:" links
199
if (! url.getProtocol().equalsIgnoreCase("http")) {
200       return urlStr;
201     }
202
203     // only localize if new URL is on the same host !
204

205     if ((context != null)
206     && (context.getHost().equalsIgnoreCase(url.getHost()))) {
207       String JavaDoc ref = url.getRef();
208       String JavaDoc path = url.getPath();
209       
210       // Already relative
211
// this should only happen if the context
212
// is null
213
if (path.startsWith("../")) {
214     return urlStr;
215       }
216
217       // URL references
218
if ((ref != null) && (! ref.equals(""))) {
219     path = path+"#"+ref;
220       }
221
222       // implied index.html
223
if ((path.length()>0) && (path.charAt(path.length()-1)) == '/') {
224     path = path+"index.html";
225       }
226     
227       return localizePath(url.getPath(),context.getPath());
228     } else {
229       return urlStr;
230     }
231   }
232
233
234   /**
235    * Localize a given path. Very dumb, but it works ;-)
236    *
237    * @param path path to localize
238    * @param context reference path
239    * @return a path that is given relative
240    *
241    * Example: <br />
242    * path="/images/test.gif" <br />
243    * context="/test/index.html"<br />
244    * result="../images/test.gif"
245    */

246   private String JavaDoc localizePath(String JavaDoc path, String JavaDoc context) {
247     StringTokenizer JavaDoc st = new StringTokenizer JavaDoc(context,"/");
248     int depth = st.countTokens();
249     if (! context.endsWith("/")) {
250       depth--;
251     }
252
253     StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
254     if (depth>0) {
255       for (int i=0; i<depth; i++) {
256     sb.append("/..");
257       }
258       sb.deleteCharAt(0);
259     } else {
260       if (path.startsWith("/")) {
261     // delete first character (absolute path);
262
path=path.substring(1);
263       }
264     }
265     sb.append(path);
266
267     return sb.toString();
268   }
269
270 }
271
272
Popular Tags