KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > websphinx > workbench > ExtractAction


1 /*
2  * WebSphinx web-crawling toolkit
3  *
4  * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
5  * reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  *
14  * 2. Redistributions in binary form must reproduce the above copyright
15  * notice, this list of conditions and the following disclaimer in
16  * the documentation and/or other materials provided with the
17  * distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
20  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
23  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  */

32
33 package websphinx.workbench;
34
import websphinx.*;
import java.io.File;
import java.io.IOException;
38
39 public class ExtractAction implements Action, CrawlListener {
40     Pattern pattern;
41     String JavaDoc filename;
42     boolean useBrowser;
43     boolean textOnly;
44     
45     transient File JavaDoc file;
46     transient RecordTransformer records;
47     transient boolean noFields;
48     
49     public ExtractAction (Pattern pattern, boolean useBrowser, String JavaDoc filename, boolean textOnly) {
50         this.pattern = pattern;
51         this.filename = filename;
52         this.useBrowser = useBrowser;
53         this.textOnly = textOnly;
54     }
55     
56     public boolean equals (Object JavaDoc object) {
57         if (! (object instanceof ExtractAction))
58             return false;
59         ExtractAction a = (ExtractAction)object;
60         return same (a.filename, filename)
61             && a.useBrowser == useBrowser
62             && a.pattern.equals (pattern)
63             && a.textOnly == textOnly;
64     }
65
66     private boolean same (String JavaDoc s1, String JavaDoc s2) {
67         if (s1 == null || s2 == null)
68             return s1 == s2;
69         else
70             return s1.equals (s2);
71     }
72
73     public Pattern getPattern () {
74         return pattern;
75     }
76     public boolean getUseBrowser () {
77         return useBrowser;
78     }
79     public String JavaDoc getFilename () {
80         return filename;
81     }
82     public boolean getTextOnly () {
83         return textOnly;
84     }
85     
86     public void connected (Crawler crawler) {
87         crawler.addCrawlListener (this);
88     }
89
90     public void disconnected (Crawler crawler) {
91         crawler.removeCrawlListener (this);
92     }
93
94     private void showit () {
95       Browser browser = Context.getBrowser();
96       if (browser != null)
97         browser.show (file);
98     }
99
100     public synchronized void visit (Page page) {
101         try {
102             int n = 0;
103
104             PatternMatcher m = pattern.match (page);
105             for (Region r = m.nextMatch(); r != null; r = m.nextMatch()) {
106                 Object JavaDoc[] fields;
107                 if (noFields) {
108                     fields = new Object JavaDoc[1];
109                     fields[0] = r;
110                 }
111                 else
112                     fields = (Object JavaDoc[])r.getFields (Pattern.groups);
113                     
114                 records.writeRecord (fields, textOnly);
115                 ++n;
116             }
117             
118             if (n > 0)
119               records.flush ();
120         } catch (IOException JavaDoc e) {
121             throw new RuntimeException JavaDoc (e.toString());
122         }
123     }
124
125     /**
126      * Notify that the crawler started.
127      */

128     public synchronized void started (CrawlEvent event){
129         if (records == null) {
130             try {
131                 file = (filename != null)
132                   ? new File JavaDoc (filename)
133                   : Access.getAccess ().makeTemporaryFile ("extract", ".html");
134                 
135                 records = new RecordTransformer (file.toString());
136                 
137                 String JavaDoc[] fieldNames = pattern.getFieldNames ();
138                 noFields = (fieldNames.length == 0);
139                 records.setProlog (records.getProlog ()
140                                    + makeTableHeader (fieldNames));
141             } catch (IOException JavaDoc e) {
142                 System.err.println (e); // FIX: use GUI when available
143
}
144         }
145     }
146
147     private String JavaDoc makeTableHeader (String JavaDoc[] fieldNames) {
148         String JavaDoc result = "<TR>\n<TH>\n";
149         if (fieldNames.length == 0)
150             result += "<TH>\n";
151         else
152             for (int i=0; i<fieldNames.length; ++i)
153                 result += "<TH>" + fieldNames[i] + "\n";
154         return result;
155     }
156     
157     /**
158      * Notify that the crawler ran out of links to crawl
159      */

160     public synchronized void stopped (CrawlEvent event){
161         try {
162             if (records != null) {
163                 records.close ();
164                 records = null;
165                 if (useBrowser)
166                   showit ();
167             }
168         } catch (IOException JavaDoc e) {
169             System.err.println (e); // FIX: use GUI when available
170
}
171     }
172
173     /**
174      * Notify that the crawler's state was cleared.
175      */

176     public synchronized void cleared (CrawlEvent event){
177         try {
178             if (records != null) {
179                 records.close ();
180                 records = null;
181                 if (useBrowser)
182                   showit ();
183             }
184         } catch (IOException JavaDoc e) {
185             System.err.println (e); // FIX: use GUI when available
186
}
187     }
188
189     /**
190      * Notify that the crawler timed out.
191      */

192     public synchronized void timedOut (CrawlEvent event){
193         try {
194             records.close ();
195             records = null;
196             if (useBrowser)
197               showit ();
198         } catch (IOException JavaDoc e) {
199             System.err.println (e); // FIX: use GUI when available
200
}
201     }
202
203     /**
204      * Notify that the crawler is paused.
205      */

206     public synchronized void paused (CrawlEvent event){
207         try {
208             records.flush ();
209             if (useBrowser)
210               showit ();
211         } catch (IOException JavaDoc e) {
212             System.err.println (e); // FIX: use GUI when available
213
}
214     }
215 }
216
Popular Tags