KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > websphinx > Concatenator


1 /*
2  * WebSphinx web-crawling toolkit
3  *
4  * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
5  * reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  *
14  * 2. Redistributions in binary form must reproduce the above copyright
15  * notice, this list of conditions and the following disclaimer in
16  * the documentation and/or other materials provided with the
17  * distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
20  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
23  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  */

32
33 package websphinx;
34
35 import rcm.util.Str;
36 import java.io.*;
37 import java.net.URL JavaDoc;
38 import java.net.MalformedURLException JavaDoc;
39 import java.util.Hashtable JavaDoc;
40
41 /**
42  * Transformer that concatenates multiple pages
43  * into a single HTML page.
44  * <P>
45  * The entire set of pages is preceded by a "prolog"
46  * and followed by an "epilog", which are constant
47  * strings of HTML. Each page is preceded
48  * by a "header" and followed by a "footer". Adjacent pages
49  * are separated by a "divider".
50  * <P>
51  * Concatenator performs the following
52  * transformations on pages before appending them together:
53  * <UL>
54  * <LI> deletes elements that would conflict, including
55  * &lt;HEAD&gt;, &lt;TITLE&gt;, &lt;BODY&gt;, &lt;HTML&gt;,
56  * &lt;STYLE&gt;, and &lt;FRAMES&gt;.
57  * <LI> deletes &lt;BASE&gt; or replaces it with a user-specified
58  * &lt;BASE&gt;
59  * <LI> changes links among the written pages into
60  * in-page references, of the form "#pageN"
61  * <LI> changes links to other pages into absolute references
62  * </UL>
63  *
64  */

65  
66  // FIX: transform anchors
67
public class Concatenator extends RewritableLinkTransformer {
68     
69     boolean needRewrite = false;
70
71     public static String JavaDoc defaultProlog = "<HTML><HEAD><TITLE>Concatenation</TITLE></HEAD><BODY>\n";
72     public static String JavaDoc defaultHeader = "<TABLE WIDTH=\"100%\"><TR>\n"
73                     +"<TD ALIGN=left><A NAME=\"%a\">%t [%u]</A>\n"
74                     +"<TD ALIGN=right>Page %p</TABLE>\n";
75     public static String JavaDoc defaultFooter = "";
76     public static String JavaDoc defaultDivider = "\n<DIV STYLE=\"page-break-after: always;\"><HR></DIV>\n";
77     public static String JavaDoc defaultEpilog = "\n</BODY></HTML>\n";
78
79     String JavaDoc prolog = defaultProlog;
80     String JavaDoc header = defaultHeader;
81     String JavaDoc footer = defaultFooter;
82     String JavaDoc divider = defaultDivider;
83     String JavaDoc epilog = defaultEpilog;
84
85     int nPages = 0;
86
87     /**
88      * Make a new Concatenator that writes to a file.
89      * @param filename Filename to write concatenated pages to
90      * @exception IOException if file cannot be opened
91      */

92     public Concatenator (String JavaDoc filename) throws IOException {
93         super (makeDirs(filename));
94     }
95     
96     private static String JavaDoc makeDirs (String JavaDoc filename) throws IOException {
97         File file = new File (filename);
98         File parent = new File (file.getParent ());
99         if (parent != null)
100             Access.getAccess ().makeDir (parent);
101         return filename;
102     }
103
104
105     /**
106      * Set the prolog.
107      * @param prolog string of HTML that is emitted at the beginning
108      * of the concatenation. Default value is: <BR>
109      * <CODE>&lt;HTML&gt;&lt;HEAD&gt;&lt;TITLE&gt;Concatenation&lt;/TITLE&gt;&lt;/HEAD&gt;&lt;BODY&gt;\n</CODE>
110      */

111     public synchronized void setProlog (String JavaDoc prolog) {
112         this.prolog = prolog;
113     }
114     /**
115      * Get the prolog.
116      * @return string of HTML that is emitted at the beginning
117      * of the concatenation.
118      */

119     public String JavaDoc getProlog () {
120         return prolog;
121     }
122
123      /**
124      * Set the header. The header can contain macro codes which
125      * are replaced with attributes of the page about to be written:
126      * <DL>
127      * <DT>%t
128      * <DD>title of the page
129      * <DT>%u
130      * <DD>URL of page
131      * <DT>%a
132      * <DD>anchor name of the page ("pageN", where N is the page number)
133      * <DT>%p
134      * <DD>page number (starting from 1)
135      * </DL>
136      * @param header string of HTML that is emitted before
137      * each page. The default value is:<BR>
138      * <CODE> &lt;TABLE WIDTH="100%"&gt;&lt;TR&gt;\n <BR>
139      * &lt;TD ALIGN=left&gt;&lt;A NAME="%a"&gt;%t [%u]&lt;/A&gt;\n <BR>
140      * &lt;TD ALIGN=right&gt;Page %p&lt;/TABLE&gt;\n</CODE>
141      */

142     public synchronized void setPageHeader (String JavaDoc header) {
143         this.header = header;
144     }
145     /**
146      * Get the header.
147      * @return string of HTML that is emitted before
148      * each page.
149      */

150     public String JavaDoc getPageHeader () {
151         return header;
152     }
153
154     /**
155      * Set the footer. The footer can contain the same
156      * macros as the header (%t, %u, %a, %p); see setPageHeader
157      * for more details.
158      * @param footer string of HTML that is emitted after
159      * each page.
160      */

161     public synchronized void setPageFooter (String JavaDoc footer) {
162         this.footer = footer;
163     }
164     /**
165      * Get the footer.
166      * @return string of HTML that is emitted after
167      * each page.
168      */

169     public String JavaDoc getPageFooter () {
170         return footer;
171     }
172     /**
173      * Set the divider.
174      * @param divider string of HTML that is emitted between
175      * each pair of pages.
176      */

177     public synchronized void setDivider (String JavaDoc divider) {
178         this.divider = divider;
179     }
180     /**
181      * Get the divider.
182      * @return string of HTML that is emitted between
183      * each pair of pages.
184     */

185     public String JavaDoc getDivider () {
186         return divider;
187     }
188
189     /**
190      * Set the epilog.
191      * @param epilog string of HTML that is emitted after
192      * the entire concatenation.
193      */

194     public synchronized void setEpilog (String JavaDoc epilog) {
195         this.epilog = epilog;
196     }
197     /**
198      * Get the epilog.
199      * @return string of HTML that is emitted after
200      * the entire concatenation.
201      */

202     public String JavaDoc getEpilog () {
203         return epilog;
204     }
205
206     /**
207      * Get number of pages written to this mirror.
208      * @return number of calls to writePage() on this mirror
209      */

210     public synchronized int getPageCount () {
211         return nPages;
212     }
213
214     /**
215      * Rewrite the concatenation. Makes sure all the links
216      * among concatenated pages have been fixed up.
217      */

218     public synchronized void rewrite () throws IOException {
219         if (needRewrite) {
220             super.rewrite ();
221             needRewrite = false;
222         }
223     }
224
225     /**
226      * Close the concatenation. Makes sure all the links
227      * among concatenated pages have been fixed up and closes
228      * the file.
229      */

230     public synchronized void close () throws IOException {
231         if (nPages == 0)
232             write (prolog);
233         emit (epilog);
234         rewrite ();
235         super.close ();
236     }
237
238     /**
239      * Write a page to the concatenation.
240      * @param page Page to write
241      */

242     public synchronized void writePage (Page page) throws IOException {
243         ++nPages;
244
245         emit ((nPages == 1) ? prolog : divider);
246
247         String JavaDoc title = page.getTitle ();
248         URL JavaDoc url = page.getURL ();
249         String JavaDoc urlString = url.toExternalForm ();
250         String JavaDoc anchor = "page" + nPages;
251         map (url, "#" + anchor);
252
253         emitTemplate (header, title, urlString, anchor, nPages);
254         if (page.isImage () && page.getURL() != null)
255             super.write ("<IMG SRC='" + page.getURL() + "'>");
256         else if (page.isHTML())
257             // it's HTML, can write it normally
258
super.writePage (page);
259         else
260             super.write (page.toHTML());
261         emitTemplate (footer, title, urlString, anchor, nPages);
262         
263         needRewrite = nPages > 1;
264     }
265
266     private void emitTemplate (String JavaDoc template,
267                                String JavaDoc title, String JavaDoc url,
268                                String JavaDoc anchor, int pages) throws IOException {
269         if (template == null || template.length() == 0)
270             return;
271             
272         template = Str.replace (template, "%t", title != null ? title : "");
273         template = Str.replace (template, "%u", url != null ? url : "");
274         template = Str.replace (template, "%a", anchor != null ? anchor : "");
275         template = Str.replace (template, "%p", String.valueOf (pages));
276         emit (template);
277     }
278
279     /**
280      * Process an HTML element for concatenation. Deletes
281      * tags that would
282      * conflict with other pages (such as &lt;HEAD&gt;),
283      * changes the URLs in Link elements, and deletes
284      * or remaps the BASE element.
285      * @param elem HTML element to process
286      */

287     protected void handleElement (Element elem) throws IOException {
288         String JavaDoc name = elem.getTagName ();
289         if ( name == Tag.TITLE
290             || name == Tag.STYLE
291             || name == Tag.BASE
292             || name == Tag.ISINDEX
293             || name == Tag.FRAMESET
294             || name == Tag.FRAME) {
295             // skip the entire element
296
}
297         else if ( name == Tag.HTML
298                  || name == Tag.HEAD
299                  || name == Tag.BODY
300                  || name == Tag.NOFRAMES) {
301             // skip only the start and end tags; preserve the content
302
transformContents (elem);
303         }
304         else
305             super.handleElement (elem);
306     }
307     
308     /*
309      * Testing
310      *
311      *
312      *
313      */

314     public static void main (String JavaDoc[] args) throws Exception JavaDoc {
315         HTMLTransformer out = new Concatenator (args[args.length-1]);
316         for (int i=0; i<args.length-1; ++i) {
317             Link link = new Link (args[i]);
318             Page page = new Page (link);
319             out.writePage (page);
320         }
321         out.close ();
322     }
323 }
324     
325
Popular Tags