KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > websphinx > Mirror


1 /*
2  * WebSphinx web-crawling toolkit
3  *
4  * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
5  * reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  *
14  * 2. Redistributions in binary form must reproduce the above copyright
15  * notice, this list of conditions and the following disclaimer in
16  * the documentation and/or other materials provided with the
17  * distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
20  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
23  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  */

32
33 package websphinx;
34
35 import java.io.*;
36 import java.net.URL JavaDoc;
37 import java.net.MalformedURLException JavaDoc;
38 import java.util.Vector JavaDoc;
39
40 /**
41  * Offline mirror of a Web site. Web pages written to
42  * a mirror are stored as files on the local disk in a directory
43  * structure mirroring their URLs.
44  * <P>
45  */

46  
47 // FIX: discards ALL anchors (for some reason)
48
public class Mirror extends LinkTransformer {
49     String JavaDoc root;
50         // Root directory represented as file:/<dir>/
51

52     Vector JavaDoc files = new Vector JavaDoc ();
53         // collection of RewritableLinkTransformers, one for each
54
// file in the mirror
55

56     boolean needRewrite = false;
57
58     String JavaDoc defaultFilename = "index.html";
59         // name given to a directory URL (like http://foo.com/)
60
// when it is saved to disk
61

62     /**
63      * Make a new Mirror.
64      * @param directory Root directory (on local disk
65      * relative to which the mirror pages are stored)
66      */

67     public Mirror (String JavaDoc directory) throws IOException {
68         super ((HTMLTransformer)null);
69         if (!directory.endsWith ("/"))
70             directory += "/";
71         File rootFile = new File (directory);
72         if (!rootFile.isAbsolute ())
73             rootFile = new File (rootFile.getAbsolutePath ());
74         URL JavaDoc rootURL = Link.FileToURL (rootFile);
75         root = rootURL.toExternalForm ();
76     }
77     
78     /**
79      * Get the filename used for directory URLs.
80      * For example, if the default filename is "index.html",
81      * then the remote URL "http://www.xxx.com/path/" would
82      * map to the local pathname "www.xxx.com/path/index.html".
83      * @return default filename. Default is "index.html".
84      */

85     public String JavaDoc getDefaultFilename () {
86         return defaultFilename;
87     }
88
89     /**
90      * Set the filename used for directory URLs.
91      * For example, if the default filename is "index.html",
92      * then the remote URL "http://www.xxx.com/path/" would
93      * map to the local pathname "www.xxx.com/path/index.html".
94      * @param filename Default filename.
95      */

96     public synchronized void setDefaultFilename (String JavaDoc filename) {
97         defaultFilename = filename;
98     }
99
100     /**
101      * Get number of pages written to this mirror.
102      * @return number of calls to writePage() on this mirror
103      */

104     public synchronized int getPageCount () {
105         return files.size ();
106     }
107
108     public void write (Region region) throws IOException {
109         throw new IOException ("write(Region) not supported by Mirror");
110     }
111
112     public void write (String JavaDoc string) throws IOException {
113         throw new IOException ("write(String) not supported by Mirror");
114     }
115
116     /**
117      * Write a page to the mirror. Stores the page on the local
118      * disk, fixing up its links to point to the local
119      * copies of any pages already stored to this mirror.
120      * @param page Page to write
121      */

122     public synchronized void writePage (Page page) throws IOException {
123         URL JavaDoc url = page.getURL ();
124         String JavaDoc local = toLocalFileURL (url);
125         URL JavaDoc localURL = new URL JavaDoc (local);
126         File localFile = Link.URLToFile (localURL);
127
128         File parent = new File (localFile.getParent ());
129         if (parent != null)
130             Access.getAccess ().makeDir (parent);
131
132         MirrorTransformer out = new MirrorTransformer (this, localFile);
133         out.setBase (localURL);
134         out.setEmitBaseElement (getEmitBaseElement ());
135         out.writePage (page);
136         out.close ();
137         
138         needRewrite = !files.isEmpty ();
139         files.addElement (out);
140     }
141
142     /**
143      * Close the mirror. Makes sure that links point to local versions of
144      * pages wherever possible.
145      */

146     public synchronized void close () throws IOException {
147         rewrite ();
148     }
149
150     /**
151      * Rewrite the mirror to make local links consistent.
152      */

153     public synchronized void rewrite () throws IOException {
154         if (needRewrite) {
155             for (int i=0, n = files.size (); i < n; ++i) {
156                 RewritableLinkTransformer r =
157                     (RewritableLinkTransformer)files.elementAt (i);
158                 r.rewrite ();
159             }
160             needRewrite = false;
161         }
162     }
163     
164     // maps a remote URL to a local file URL ("<root>/<host>/<filename>")
165
// resulting URL is never slash-terminated
166
private String JavaDoc toLocalFileURL (URL JavaDoc remoteURL) {
167         if (isMapped (remoteURL))
168             return lookup (null, remoteURL);
169             
170         String JavaDoc remote = remoteURL.toExternalForm ();
171         URL JavaDoc remoteDirURL = Link.getDirectoryURL (remoteURL);
172         String JavaDoc remoteDir = remoteDirURL.toExternalForm();
173         String JavaDoc remoteFile = (remote.length() > remoteDir.length()) ? encode (remote.substring (remoteDir.length())) : defaultFilename;
174         String JavaDoc localDir = toLocalDirURL (remoteDirURL);
175         String JavaDoc local = localDir + remoteFile;
176         
177         map (remoteURL, local);
178         return local;
179     }
180
181     // Maps a remote directory URL (slash-terminated) to a local
182
// directory URL (slash-terminated)
183
private String JavaDoc toLocalDirURL (URL JavaDoc remoteURL) {
184         if (isMapped (remoteURL))
185             return lookupDir (null, remoteURL);
186
187         String JavaDoc remote = remoteURL.toExternalForm ();
188         String JavaDoc local;
189         URL JavaDoc remoteParentURL = Link.getParentURL (remoteURL);
190         
191         if (remoteParentURL.equals (remoteURL)) {
192             // we've reached http://host/
193
String JavaDoc host = remoteURL.getHost ();
194             int port = remoteURL.getPort ();
195             local = root
196                     + encode ((port != -1) ? host + ":" + port : host)
197                     + '/';
198         }
199         else {
200             String JavaDoc remoteParent = remoteParentURL.toExternalForm();
201             String JavaDoc remoteFile = encode (remote.substring (remoteParent.length(),
202                                                           remote.length()-1));
203             String JavaDoc localDir = toLocalDirURL (remoteParentURL);
204             local = localDir + remoteFile + "/";
205         }
206             
207         map (remoteURL, local);
208         return local;
209     }
210
211     /**
212      * Map a directory URL (of the form http://host/path/) to
213      * a local directory.
214      * @param url Directory URL. Must end with a slash.
215      * @param dir Local directory relative to which descendents of
216      * url should be saved.
217      */

218     public synchronized void mapDir (URL JavaDoc url, String JavaDoc dir) throws MalformedURLException JavaDoc {
219         if (!dir.endsWith ("/"))
220             dir += "/";
221         map (Link.getDirectoryURL (url),
222              Link.FileToURL (new File (dir + defaultFilename))
223              .toString());
224     }
225     
226     /**
227      * Lookup the local directory to which a remote directory
228      * URL maps.
229      * @param base local file URL to use as a base. If non-null,
230      * then the returned pathname is relative to this URL. If
231      * null, the returned pathname is an absolute URL (file:/path/).
232      * @param url remote directory URL to look up. Must end in slash.
233      */

234     public String JavaDoc lookupDir (URL JavaDoc base, URL JavaDoc url) {
235         String JavaDoc href = lookup (base, url);
236         int lastSlash = href.lastIndexOf ('/');
237         return href.substring(0, lastSlash+1);
238     }
239
240     private static String JavaDoc canonicalDir (String JavaDoc dir) {
241         dir = dir.replace ('\\', '/');
242         if (!dir.endsWith ("/"))
243             dir += "/";
244         if (!dir.startsWith ("/"))
245             dir = "/" + dir;
246         return dir;
247     }
248     
249     private static String JavaDoc encode (String JavaDoc component) {
250         char[] chars = component.toCharArray ();
251
252         for (int i=0; i<chars.length; ++i)
253             switch (chars[i]) {
254                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
255                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
256                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
257                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
258                 case 'Y': case 'Z':
259
260                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
261                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
262                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
263                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
264                 case 'y': case 'z':
265
266                 case '0': case '1': case '2': case '3': case '4':
267                 case '5': case '6': case '7': case '8': case '9':
268
269                 case '.': case '-': case '_': case '~':
270
271                     break;
272
273                 default:
274                     chars[i] = '_';
275                     break;
276             }
277
278         return new String JavaDoc (chars);
279     }
280
281     /*
282      * Testing
283      *
284      */

285     public static void main (String JavaDoc[] args) throws Exception JavaDoc {
286         String JavaDoc directory = args[args.length-1];
287         Mirror out = new Mirror (directory);
288         out.mapDir (new URL JavaDoc (args[0]), directory);
289         for (int i=0; i<args.length-1; ++i) {
290             Link link = new Link (args[i]);
291             Page page = new Page (link);
292             out.writePage (page);
293         }
294         out.close ();
295     }
296
297 }
298
299 class MirrorTransformer extends RewritableLinkTransformer {
300     Mirror mirror; // on the wall?
301

302     public MirrorTransformer (Mirror mirror, File file) throws IOException {
303         super (file.toString());
304         this.mirror = mirror;
305     }
306     
307     public String JavaDoc lookup (URL JavaDoc base, URL JavaDoc url) {
308         return mirror.lookup (base, url);
309     }
310     
311     public void map (URL JavaDoc remoteURL, String JavaDoc href) {
312         mirror.map (remoteURL, href);
313     }
314     
315     public void map (URL JavaDoc remoteURL, URL JavaDoc url) {
316         mirror.map (remoteURL, url);
317     }
318     
319     public boolean isMapped (URL JavaDoc url) {
320         return mirror.isMapped (url);
321     }
322 }
323     
324
Popular Tags