KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > websphinx > RewritableLinkTransformer


1 /*
2  * WebSphinx web-crawling toolkit
3  *
4  * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
5  * reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  *
14  * 2. Redistributions in binary form must reproduce the above copyright
15  * notice, this list of conditions and the following disclaimer in
16  * the documentation and/or other materials provided with the
17  * distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
20  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
23  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  */

32
33 package websphinx;
34
35 import java.io.*;
36 import java.net.URL JavaDoc;
37
38 /**
39  * Transformer that remaps URLs in links in such a way
40  * that if the URL mapping changes during (or after) some
41  * HTML has been transformed, the HTML can be fixed up after
42  * the fact. This class is used by Concatenator and Mirror,
43  * since in those operations, the URL mapping function
44  * changes as each page is written to the concatenation or
45  * mirror.
46  */

47 public class RewritableLinkTransformer extends LinkTransformer {
48
49     private RewriteRegion head, tail;
50     private File file;
51     private boolean closed = false;
52
53     /**
54      * Make a RewritableLinkTransformer.
55      * @param filename Filename to write to
56      */

57     public RewritableLinkTransformer (String JavaDoc filename) throws IOException {
58         super (filename, true);
59         file = new File (filename);
60     }
61
62     public void close () throws IOException {
63         super.close ();
64         closed = true;
65     }
66
67     static final String JavaDoc PLACEHOLDER = "@WEBSPHINX@";
68
69     protected void handleLink (Link link) throws IOException {
70         URL JavaDoc url = link.getURL ();
71
72         Tag t = link.replaceHref (PLACEHOLDER);
73         String JavaDoc s = t.toString();
74         int prefix = s.indexOf (PLACEHOLDER);
75         if (prefix != -1) {
76             int postfix = prefix + PLACEHOLDER.length();
77             
78             emit (s.substring (0, prefix));
79             
80             String JavaDoc href = lookup (base, url);
81             RewriteRegion node = addURL (url, getFilePointer(), href.length());
82             emit (href);
83             
84             emit (s.substring (postfix));
85         } else {
86             emit (s);
87         }
88
89         transformContents (link);
90         if (link.getEndTag () != null)
91            emit (link.getEndTag ());
92     }
93
94     private RewriteRegion addURL (URL JavaDoc url, long offset, int len) {
95         RewriteRegion node = new RewriteRegion ();
96         node.url = url;
97         node.offset = offset;
98         node.len = len;
99
100         if (tail == null) {
101             head = tail = node;
102         }
103         else {
104             node.next = tail.next;
105             tail.next = node;
106             node.prev = tail;
107             if (node.next != null)
108                 node.next.prev = node;
109             tail = node;
110         }
111
112         return node;
113     }
114
115     static final int BUFFER_SIZE = 8;
116
117     /**
118      * Rewrite the file, remapping all the URLs according to their
119      * current values from lookup().
120      */

121     public void rewrite () throws IOException {
122         flush ();
123         
124         if (head == null)
125             // no links to rewrite
126
return;
127
128         RandomAccessFile raf = closed
129             ? Access.getAccess ().readWriteFile (file)
130             : getRandomAccessFile ();
131
132         byte buf[] = new byte[BUFFER_SIZE];
133         long end = raf.length ();
134         long src = 0;
135         long dest = 0;
136         long left;
137         int n;
138         int growth = 0;
139         int shrinkage = 0;
140
141         // Forward pass
142
// Rewrite only URLs which are becoming shorter
143
raf.seek (dest);
144         for (RewriteRegion loc = head; loc != null; loc = loc.next) {
145             // loop invariant: file[0..dest-1] is rewritten,
146
// and next byte to copy to file[dest] is from file[src]
147
// and raf.getFilePointer() == dest
148
long diff = dest - src;
149
150             String JavaDoc href = lookup (base, loc.url);
151             loc.newHref = href;
152             loc.newLen = href.length ();
153
154             if (loc.newLen > loc.len) {
155                 // new URL is longer than old URL
156
// must postpone rewriting this until the backward pass
157
growth += loc.newLen - loc.len;
158                 loc.offset += diff;
159                 continue;
160             }
161             else
162                 shrinkage += loc.len - loc.newLen;
163
164             // rewrite up to loc
165
left = loc.offset - src;
166             while (left > BUFFER_SIZE) {
167                 raf.seek (src);
168                 raf.read (buf);
169                 raf.seek (dest);
170                 raf.write (buf);
171                 src += BUFFER_SIZE;
172                 dest += BUFFER_SIZE;
173                 left -= BUFFER_SIZE;
174             }
175             if (left > 0) {
176                 n = (int)left;
177                 raf.seek (src);
178                 raf.read (buf, 0, n);
179                 raf.seek (dest);
180                 raf.write (buf, 0, n);
181                 src += n;
182                 dest += n;
183                 left -= n;
184             }
185
186             // write loc
187
raf.writeBytes (href);
188
189             dest += loc.newLen;
190             src += loc.len;
191
192             loc.offset += diff;
193             loc.len = loc.newLen;
194         }
195
196         if (src > dest) {
197             // rewrite rest of file
198
while (true) {
199                 raf.seek (src);
200                 if ((n = raf.read (buf)) == -1)
201                     break;
202                 raf.seek (dest);
203                 raf.write (buf, 0, n);
204                 src += n;
205                 dest += n;
206             }
207         }
208         else
209             src = dest = end;
210
211         src = dest;
212         dest += growth;
213         for (RewriteRegion loc = tail; loc != null; loc = loc.prev) {
214             // loop invariant: file[dest...end-1] is rewritten,
215
// and next byte to copy to file[dest] is from file[src]
216
long diff = dest - src;
217
218             if (loc.newLen <= loc.len) {
219                 loc.offset += diff;
220                 continue;
221             }
222
223             // rewrite back to loc
224
left = src - (loc.offset + loc.len);
225             while (left > BUFFER_SIZE) {
226                 src -= BUFFER_SIZE;
227                 dest -= BUFFER_SIZE;
228                 left -= BUFFER_SIZE;
229                 raf.seek (src);
230                 raf.read (buf);
231                 raf.seek (dest);
232                 raf.write (buf);
233             }
234             if (left > 0) {
235                 n = (int)left;
236                 src -= n;
237                 dest -= n;
238                 raf.seek (src);
239                 raf.read (buf, 0, n);
240                 raf.seek (dest);
241                 raf.write (buf, 0, n);
242             }
243
244             // write loc
245
dest -= loc.newLen;
246             src -= loc.len;
247             raf.seek (dest);
248             raf.writeBytes (loc.newHref);
249
250             loc.offset = dest;
251             loc.len = loc.newLen;
252         }
253
254         if (src != dest)
255             System.err.println ("ASSERTION FAILURE: SRC=" + src + "!=dest=" + dest);
256
257         if (shrinkage > growth) {
258             // overwrite the rest of the file with spaces
259
for (int i=0; i<BUFFER_SIZE; ++i)
260                 buf[i] = (byte)' ';
261             left = shrinkage - growth;
262             raf.seek (end - left);
263             while (left > BUFFER_SIZE) {
264                 raf.write (buf);
265                 left -= BUFFER_SIZE;
266             }
267             if (left > 0)
268                 raf.write (buf, 0, (int)left);
269         }
270         else
271             raf.seek (end + (growth - shrinkage));
272
273
274         if (closed)
275             raf.close ();
276     }
277
278     /*
279      * Testing
280      *
281     public static void main (String[] args) throws Exception {
282         RewritableLinkTransformer unparser = new TestILTransformer (args[1]);
283         Link link = new Link (args[0]);
284         Page page = new Page (link);
285         System.out.println ("Writing " + page.toDescription());
286         unparser.writePage (page);
287         System.out.println ("Rewriting while open");
288         unparser.rewrite ();
289         unparser.close ();
290         System.out.println ("Rewriting after close");
291         unparser.rewrite ();
292     }
293      */

294 }
295
/**
 * One rewritable href in the output file: a node in the doubly-linked
 * list of regions maintained by RewritableLinkTransformer.
 */
class RewriteRegion {
    /** URL whose mapped href occupies this region. */
    URL url;
    /** File offset at which the href starts. */
    long offset;
    /** Length of the href currently in the file. */
    int len;

    /** Replacement href computed during a rewrite. */
    String newHref;
    /** Length of the replacement href. */
    int newLen;

    /** Following region in the list, or null if this is the last. */
    RewriteRegion next;
    /** Preceding region in the list, or null if this is the first. */
    RewriteRegion prev;
}
307
308 /*
309  * Testing
310  *
311 class TestILTransformer extends RewritableLinkTransformer {
312     public TestILTransformer (String filename) throws IOException {
313         super (filename);
314     }
315
316     final static String BIG_STRING =
317         "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
318        +"@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
319        +"@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
320        +"@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@";
321
322     public String lookup (URL base, URL url) {
323         if (closed)
324             return super.lookup (base, url);
325         else if (Math.random() > 0.5)
326             return BIG_STRING.substring (0, url.toString().length()*2);
327         else
328             return "";
329     }
330 }
331  */

332
Popular Tags