KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > websphinx > HTMLTransformer


1 /*
2  * WebSphinx web-crawling toolkit
3  *
4  * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
5  * reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  *
14  * 2. Redistributions in binary form must reproduce the above copyright
15  * notice, this list of conditions and the following disclaimer in
16  * the documentation and/or other materials provided with the
17  * distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
20  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
23  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  */

32
33 package websphinx;
34
35 import java.io.File JavaDoc;
36 import java.io.OutputStream JavaDoc;
37 import java.io.RandomAccessFile JavaDoc;
38 import java.io.IOException JavaDoc;
39 //#ifdef JDK1.1
40
import java.io.Writer JavaDoc;
41 import java.io.OutputStreamWriter JavaDoc;
42 //#endif JDK1.1
43
/*#ifdef JDK1.0
44 import java.io.PrintStream;
45 #endif JDK1.0*/

46
47 public class HTMLTransformer {
48
49 //#ifdef JDK1.1
50
private OutputStream JavaDoc stream; // output stream for binary content
51
private Writer JavaDoc writer; // output stream for HTML content
52
//#endif JDK1.1
53
/*#ifdef JDK1.0
54     private PrintStream stream; // output stream
55 #endif JDK1.0*/

56     private boolean openedStream = false;
57             // we opened the stream, so we'd better close it
58

59     private RandomAccessFile JavaDoc readwrite; // output file
60

61     private HTMLTransformer next; // next HTMLTransformer in the filter chain
62
private HTMLTransformer head; // head of filter chain
63
private HTMLTransformer tail; // tail of filter chain
64

65     // these fields are only valid on the tail element in the filter
66
// chain
67
private String JavaDoc content; // content of page being printed
68
private int emitStart, emitEnd; // start and end of pending region
69
// (the last region in the page which
70
// has been emit()ed but not actually
71
// written)
72
private int transformEnd; // end of region being transformed
73

74     /**
75      * Make an HTMLTransformer that writes pages to a
76      * stream.
77      * @param out Stream to receive HTML output
78      */

79     public HTMLTransformer (OutputStream JavaDoc out) {
80         head = tail = this;
81         next = null;
82         setOutput (out);
83     }
84
85     /**
86      * Make an HTMLTransformer that writes pages to a
87      * file.
88      * @param filename Name of file to receive HTML output
89      * @exception IOException if file cannot be opened
90      */

91     public HTMLTransformer (String JavaDoc filename) throws IOException JavaDoc {
92         head = tail = this;
93         next = null;
94         openFile (filename, false);
95     }
96
97
98     /**
99      * Make an HTMLTransformer that writes pages to a
100      * file.
101      * @param filename Name of file to receive HTML output
102      * @param seekable True if file should be opened for random access
103      */

104     public HTMLTransformer (String JavaDoc filename, boolean seekable) throws IOException JavaDoc {
105         head = tail = this;
106         next = null;
107         openFile (filename, seekable);
108     }
109
110
111     /**
112      * Make an HTMLTransformer that writes pages to a
113      * downstream HTMLTransformer. Use this constructor
114      * to chain together several HTMLTransformers.
115      * @param next HTMLTransformer to receive HTML output
116      */

117     public HTMLTransformer (HTMLTransformer next) {
118         this.next = next;
119         tail = next != null ? next.tail : this;
120         for (HTMLTransformer u = this; u != null; u = u.next)
121             u.head = this;
122     }
123
124     private void openFile (String JavaDoc filename, boolean seekable) throws IOException JavaDoc {
125         File JavaDoc file = new File JavaDoc (filename);
126
127         // open a stream first, to truncate the file to 0
128
OutputStream JavaDoc out = Access.getAccess ().writeFile (file, false);
129         
130         if (!seekable)
131             setOutput (out);
132         else {
133             out.close ();
134             RandomAccessFile JavaDoc raf = Access.getAccess ().readWriteFile (file);
135             setRandomAccessFile (raf);
136         }
137             
138         openedStream = true;
139     }
140
141 //#ifdef JDK1.1
142
public void setOutput (OutputStream JavaDoc out) {
143         if (next == null) {
144             stream = out;
145             writer = new OutputStreamWriter JavaDoc (out);
146         } else
147             next.setOutput (out);
148     }
149
150 // public void setOutput (Writer out) {
151
// if (next == null)
152
// stream = out;
153
// else
154
// next.setOutput (out);
155
// }
156

157     public OutputStream JavaDoc getOutputStream () {
158         return tail.stream;
159     }
160
161     public Writer JavaDoc getOutputWriter () {
162         return tail.writer;
163     }
164 //#endif JDK1.1
165

166 /*#ifdef JDK1.0
167     public void setOutput (OutputStream out) {
168         if (next == null)
169             stream = new PrintStream (out);
170         else
171             next.setOutput (out);
172     }
173
174     public OutputStream getOutput () {
175         return tail.stream;
176     }
177 #endif JDK1.0*/

178
179     public void setRandomAccessFile (RandomAccessFile JavaDoc raf) {
180         if (next == null)
181             readwrite = raf;
182         else
183             next.setRandomAccessFile (raf);
184     }
185     
186     public RandomAccessFile JavaDoc getRandomAccessFile () {
187         return tail.readwrite;
188     }
189
190     /**
191      * Writes a literal string through the HTML transformer
192      * (without parsing it or transforming it).
193      * @param string String to write
194      */

195     public synchronized void write (String JavaDoc string) throws IOException JavaDoc {
196         if (next == null)
197             emit (string);
198         else
199             next.write (string);
200     }
201
202     /**
203      * Writes a chunk of HTML through the HTML transformer.
204      * @param region Region to write
205      */

206     public synchronized void write (Region region) throws IOException JavaDoc {
207         if (next == null) {
208             emitPendingRegion ();
209             
210             String JavaDoc oldContent = content;
211             int oldEmitStart = emitStart;
212             int oldEmitEnd = emitEnd;
213             int oldTransformEnd = transformEnd;
214             
215             content = region.getSource().getContent ();
216             emitStart = emitEnd = region.getStart ();
217             transformEnd = region.getEnd ();
218
219             processElementsInRegion (region.getRootElement(),
220                                      region.getStart(),
221                                      region.getEnd());
222
223             emitPendingRegion ();
224
225             content = oldContent;
226             emitStart = oldEmitStart;
227             emitEnd = oldEmitEnd;
228             transformEnd = oldTransformEnd;
229         }
230         else
231             next.write (region);
232     }
233
234     /**
235      * Writes a page through the HTML transformer.
236      * @param page Page to write
237      */

238     public synchronized void writePage (Page page) throws IOException JavaDoc {
239         if (next == null) {
240             if (page.isHTML ())
241                 write (page);
242             else {
243                 System.err.println ("binary write of " + page.getURL ());
244                 writeStream (page.getContentBytes (),
245                              0, page.getLength ());
246             }
247         }
248         else
249             next.writePage (page);
250     }
251
252     /**
253      * Flushes transformer to its destination stream.
254      * Empties any buffers in the transformer chain.
255      */

256     public synchronized void flush () throws IOException JavaDoc {
257         if (next == null) {
258             emitPendingRegion ();
259             if (stream != null)
260                 stream.flush ();
261             if (writer != null)
262                 writer.flush ();
263         }
264         else
265             next.flush ();
266     }
267
268     /**
269      * Close the transformer. Flushes all buffered data
270      * to disk by calling flush(). This call may be
271      * time-consuming! Don't use the transformer again after
272      * closing it.
273      * @exception IOException if an I/O error occurs
274      */

275     public synchronized void close () throws IOException JavaDoc {
276         flush ();
277         if (next == null) {
278             if (openedStream) {
279                 if (stream != null)
280                     stream.close ();
281                 if (readwrite != null)
282                     readwrite.close ();
283             }
284         }
285         else
286             next.close ();
287     }
288     
289     /**
290      * Finalizes the transformer (calling close()).
291      */

292     protected void finalize() throws Throwable JavaDoc {
293         close ();
294     }
295
296     /**
297      * Get the file pointer.
298      * @return current file pointer
299      * @exception IOException if this transformer not opened for random access
300      */

301     public long getFilePointer () throws IOException JavaDoc {
302         if (readwrite == null)
303             throw new IOException JavaDoc ("HTMLTransformer not opened for random access");
304         return readwrite.getFilePointer ();
305     }
306
307     /**
308      * Seek to a file position.
309      * @param pos file position to seek
310      * @exception IOException if this transformer not opened for random access
311      */

312     public void seek (long pos) throws IOException JavaDoc {
313         if (readwrite == null)
314             throw new IOException JavaDoc ("HTMLTransformer not opened for random access");
315         readwrite.seek (pos);
316     }
317
318     /**
319      * Transform an element by passing it through the entire
320      * filter chain.
321      * @param elem Element to be transformed
322      */

323     protected void transformElement (Element elem) throws IOException JavaDoc {
324         head.handleElement (elem);
325     }
326
327     /**
328      * Transform the contents of an element. Passes
329      * the child elements through the filter chain
330      * and emits the text between them.
331      * @param elem Element whose contents should be transformed
332      */

333     protected void transformContents (Element elem) throws IOException JavaDoc {
334         Tag startTag = elem.getStartTag ();
335         Tag endTag = elem.getEndTag ();
336
337         tail.processElementsInRegion (elem.getChild(),
338                            startTag.getEnd(),
339                            endTag != null ? endTag.getStart() : elem.getEnd());
340     }
341
342     /**
343      * Handle the transformation of an HTML element.
344      * Override this method to modify the HTML as it is
345      * written.
346      * @param elem Element to transform
347      */

348     protected void handleElement (Element elem) throws IOException JavaDoc {
349         if (next == null) {
350             Tag startTag = elem.getStartTag ();
351             Tag endTag = elem.getEndTag ();
352             
353             emit (startTag);
354             transformContents (elem);
355             if (endTag != null)
356                 emit (endTag);
357         }
358         else
359             next.handleElement (elem);
360     }
361
362     /**
363      * Emit a region on the transformer chain's final output.
364      * (The region isn't passed through the chain.)
365      * @param r Region to emit
366      */

367     protected void emit (Region r) throws IOException JavaDoc {
368         tail.emitInternal (r.getSource().getContent(), r.getStart(), r.getEnd ());
369     }
370
371     /**
372      * Emit a string on the transformer chain's final output.
373      * @param string String to emit
374      */

375     protected void emit (String JavaDoc string) throws IOException JavaDoc {
376         tail.emitInternal (string, 0, string.length());
377     }
378
379     private void processElementsInRegion (Element elem, int start, int end) throws IOException JavaDoc {
380         if (this != tail)
381             throw new RuntimeException JavaDoc ("processElementsInRegion not called on tail");
382             
383         int p = start;
384         
385         if (elem != null && elem.getSource().getContent() == content)
386             end = Math.min (end, transformEnd);
387
388         while (elem != null && elem.getStartTag().getEnd() <= end) {
389             emitInternal (content, p, elem.getStart());
390             transformElement (elem);
391             p = elem.getEnd ();
392             elem = elem.getNext ();
393         }
394         emitInternal (content, Math.min (p, end), end);
395     }
396
397     private void emitInternal (String JavaDoc str, int start, int end) throws IOException JavaDoc {
398         if (this != tail)
399             throw new RuntimeException JavaDoc ("emitInternal not called on tail");
400             
401         if (str == content) {
402             start = Math.min (start, transformEnd);
403             end = Math.min (end, transformEnd);
404                 
405             if (start == emitEnd)
406                 emitEnd = end; // just extend the pending emit region
407
else {
408                 emitPendingRegion ();
409                 emitStart = start;
410                 emitEnd = end;
411             }
412         }
413         else {
414             emitPendingRegion ();
415             writeStream (str.substring (start, end));
416         }
417     }
418
419     private void emitPendingRegion () throws IOException JavaDoc {
420         if (this != tail)
421             throw new RuntimeException JavaDoc ("emitPendingRegion not called on tail");
422             
423         if (emitStart != emitEnd) {
424             writeStream (content.substring (emitStart, emitEnd));
425             emitStart = emitEnd;
426         }
427     }
428     
429     private void writeStream (String JavaDoc s) throws IOException JavaDoc {
430         if (writer != null) {
431             //#ifdef JDK1.1
432
writer.write (s);
433             //#endif JDK1.1
434
/*#ifdef JDK1.0
435             stream.print (s);
436             #endif JDK1.0*/

437         }
438         else
439             readwrite.writeBytes (s);
440     }
441
442     private void writeStream (byte[] buf, int offset, int len) throws IOException JavaDoc {
443         if (stream != null) {
444             //#ifdef JDK1.1
445
stream.write (buf, offset, len);
446             //#endif JDK1.1
447
/*#ifdef JDK1.0
448             stream.write (buf, offset, len);
449             #endif JDK1.0*/

450         }
451         else
452             readwrite.write (buf, offset, len);
453     }
454
455     /*
456      * Testing
457      *
458     public static void main (String[] args) throws Exception {
459         Link link = new Link (args[0]);
460         Page page = new Page (link);
461
462         OutputStream out = (args.length >= 2)
463             ? (OutputStream)new java.io.FileOutputStream (args[1])
464             : (OutputStream)System.out;
465         HTMLTransformer unparser = new TestTransformer (out);
466
467         int len = page.getLength();
468         unparser.write (new Region (page, 0, 3*len/4));
469             
470         unparser.close ();
471     }
472      */

473 }
474
475     /*
476      * Testing
477      *
478 class TestTransformer extends HTMLTransformer {
479     public TestTransformer (OutputStream out) {
480         super (out);
481     }
482     
483     protected void handleElement (Element elem) throws IOException {
484         System.out.println ("handling <" + elem.getTagName() + ">");
485         super.handleElement (elem);
486     }
487 }
488     */

489
Popular Tags