// KickJava   Java API By Example, From Geeks To Geeks.
//
// Java > Open Source Codes > websphinx > RecordTransformer


/*
 * WebSphinx web-crawling toolkit
 *
 * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */


package websphinx;

import java.io.*;
import java.net.URL;
import rcm.util.Str;

39 public class RecordTransformer extends RewritableLinkTransformer {
40
41     String JavaDoc prolog = "<HTML><HEAD><TITLE>Extracted Records</TITLE></HEAD><BODY><TABLE>\n";
42     String JavaDoc epilog = "</TABLE></BODY></HTML>\n";
43
44     String JavaDoc recordStart = "<TR>\n<TD><A HREF=\"%u\">%n.</A>\n";
45     String JavaDoc recordEnd = "\n";
46     String JavaDoc recordDivider = "";
47     
48     String JavaDoc fieldStart = " <TD>";
49     String JavaDoc fieldEnd = "\n";
50     String JavaDoc fieldDivider = "";
51
52     int nRecords = 0;
53
54     public RecordTransformer (String JavaDoc filename) throws IOException {
55         super (filename);
56     }
57
58     public synchronized void setProlog (String JavaDoc prolog) {
59         this.prolog = prolog;
60     }
61     public synchronized String JavaDoc getProlog () {
62         return prolog;
63     }
64
65     public synchronized void setEpilog (String JavaDoc epilog) {
66         this.epilog = epilog;
67     }
68     public synchronized String JavaDoc getEpilog () {
69         return epilog;
70     }
71
72     public synchronized void setRecordStart (String JavaDoc recordStart) {
73         this.recordStart = recordStart;
74     }
75     public synchronized String JavaDoc getRecordStart () {
76         return recordStart;
77     }
78
79     public synchronized void setRecordEnd (String JavaDoc recordEnd) {
80         this.recordEnd = recordEnd;
81     }
82     public synchronized String JavaDoc getRecordEnd () {
83         return recordEnd;
84     }
85
86     public synchronized void setRecordDivider (String JavaDoc recordDivider) {
87         this.recordDivider = recordDivider;
88     }
89     public synchronized String JavaDoc getRecordDivider () {
90         return recordDivider;
91     }
92
93     public synchronized void setFieldStart (String JavaDoc fieldStart) {
94         this.fieldStart = fieldStart;
95     }
96     public synchronized String JavaDoc getFieldStart () {
97         return fieldStart;
98     }
99
100     public synchronized void setFieldEnd (String JavaDoc fieldEnd) {
101         this.fieldEnd = fieldEnd;
102     }
103     public synchronized String JavaDoc getFieldEnd () {
104         return fieldEnd;
105     }
106
107     public synchronized void setFieldDivider (String JavaDoc fieldDivider) {
108         this.fieldDivider = fieldDivider;
109     }
110     public synchronized String JavaDoc getFieldDivider () {
111         return fieldDivider;
112     }
113
114     /**
115      * Flush the record page to disk. Temporarily writes the epilog.
116      */

117     public synchronized void flush () throws IOException {
118         long p = getFilePointer ();
119         if (nRecords == 0)
120             emit (prolog);
121         emit (epilog);
122         seek (p);
123         super.flush ();
124     }
125         
126
127     public synchronized int getRecordCount () {
128         return nRecords;
129     }
130
131     public synchronized void writeRecord (Object JavaDoc[] fields, boolean asText) throws IOException {
132         ++nRecords;
133
134         emit ((nRecords == 1) ? prolog : recordDivider);
135         
136         URL JavaDoc url = urlOfFirstRegion (fields);
137         
138         emitTemplate (recordStart, url, nRecords);
139         for (int i=0; i<fields.length; ++i) {
140             if (i > 0)
141                 emit (fieldDivider);
142             emit (fieldStart);
143             
144             Object JavaDoc f = fields[i];
145             if (f instanceof Region) {
146                 Region r = (Region)fields[i];
147                 if (asText)
148                     write (r.toText());
149                 else
150                     write (r);
151             }
152             else
153                 write (f.toString ());
154                 
155             emit (fieldEnd);
156         }
157         emitTemplate (recordEnd, url, nRecords);
158     }
159     
160     private URL JavaDoc urlOfFirstRegion (Object JavaDoc[] fields) {
161         for (int i=0; i<fields.length; ++i)
162             if (fields[i] instanceof Region) {
163                 Region r = (Region)fields[i];
164                 return r.getSource().getURL();
165             }
166         return null;
167     }
168
169     private void emitTemplate (String JavaDoc template, URL JavaDoc url, int record) throws IOException {
170         if (template == null || template.length() == 0)
171             return;
172             
173         template = Str.replace (template, "%n", String.valueOf (record));
174         template = Str.replace (template, "%u", url != null ? url.toString () : "");
175         emit (template);
176     }
177
178     /*
179      * Testing
180      *
181     public static void main (String[] args) throws Exception {
182         Pattern p = new Tagexp (args[0].replace ('_', ' ') );
183         RecordTransformer records = new RecordTransformer (args[1]);
184         for (int i=2; i<args.length; ++i) {
185             Page page = new Page (new Link (args[i]));
186             PatternMatcher m = p.match (page);
187             for (Region r = m.nextMatch(); r != null; r = m.nextMatch())
188                 records.writeRecord (r.getFields (Pattern.groups), false);
189         }
190         records.close ();
191     }
192      */

193
194 }

// Popular Tags