KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > websphinx > searchengine > NewsBot


/*
 * WebSphinx web-crawling toolkit
 *
 * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */


package websphinx.searchengine;

import websphinx.*;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;

40 /**
41  * <A HREF="http://www.newbot.com/">NewsBot</a> search engine.
42  */

43 public class NewsBot implements SearchEngine {
44
45     static Pattern patTitle = new Regexp ("^");
46
47     static Pattern patCount = new Regexp (
48         "Returned <B>(\\d+)</b> results"
49     );
50     static Pattern patNoHits = new Regexp (
51         "Sorry -- your search yielded no results"
52     );
53
54     // FIX: works only for Netscape
55
static Pattern patResult = new Tagexp (
56             "<font>"
57            +"(?{link}<A>(?{title})</A>)"
58            +"</font>"
59            +"<br>"
60            +"<font></font>(?{description})<br>"
61            +"<font><b></b></font><p>"
62     );
63
64     static Pattern patMoreLink = new Tagexp (
65         "<input type=image name=act.next>"
66     );
67
68     /**
69      * Classify a page. Sets the following labels:
70      * <TABLE>
71      * <TR><TH>Name <TH>Type <TH>Meaning
72      * <TR><TD>searchengine.source <TD>Page label <TD>NewsBot object that labeled this page
73      * <TR><TD>searchengine.count <TD>Page field <TD>Number of results on page
74      * <TR><TD>searchengine.results <TD>Page fields <TD>Array of results. Each result region
75      * contains subfields: rank, title, description, and link.
76      * <TR><TD>searchengine.more-results <TD>Link label <TD>Link to a page containing more results.
77      * </TABLE>
78      */

79     public void classify (Page page) {
80         String JavaDoc title = page.getTitle ();
81         if (title != null && title.startsWith ("HotBot results:")) {
82             page.setObjectLabel ("searchengine.source", this);
83
84             Region count = patCount.oneMatch (page);
85             if (count != null)
86                 page.setField ("searchengine.count", count.getField ("0"));
87             
88             Region[] results = patResult.allMatches (page);
89             SearchEngineResult[] ser = new SearchEngineResult[results.length];
90             for (int i=0; i<results.length; ++i) {
91                 ser[i] = new SearchEngineResult (results[i]);
92                 //System.out.println (ser[i]);
93
}
94             page.setFields ("searchengine.results", ser);
95
96             PatternMatcher m = patMoreLink.match (page);
97             while (m.hasMoreElements ()) {
98                 Link link = (Link)m.nextMatch ();
99                 link.setLabel ("searchengine.more-results");
100                 link.setLabel ("hyperlink");
101             }
102         }
103         else System.err.println ("not a NewsBot page");
104
105     }
106
107     /**
108      * Priority of this classifier.
109      */

110     public static final float priority = 0.0F;
111     
112     /**
113      * Get priority of this classifier.
114      * @return priority.
115      */

116     public float getPriority () {
117         return priority;
118     }
119
120     /**
121      * Make a query URL for NewsBot.
122      * @param keywords list of keywords, separated by spaces
123      * @return URL that submits the keywords to NewsBot.
124      */

125     public URL JavaDoc makeQuery (String JavaDoc keywords) {
126         try {
127             java.util.StringTokenizer JavaDoc tok = new java.util.StringTokenizer JavaDoc (keywords);
128             StringBuffer JavaDoc output = new StringBuffer JavaDoc ();
129             while (tok.hasMoreElements ()) {
130                 String JavaDoc kw = tok.nextToken ();
131                 if (output.length () > 0)
132                     output.append (" or ");
133                 output.append (kw);
134             }
135
136             return new URL JavaDoc(
137 "http://engine.newbot.com/newbot/server/query.fpl?client_id=0sQaJNoAahXc&output=hotbot4&logad=1&client_sw=html&client_vr=0.9&client_last_updated=ignore&T0=hotbot&S0=date&P0=&F0=24&Q0="
138                            + URLEncoder.encode(output.toString())
139 + "&max_results=50&S0=rank&Search.x=55&Search.y=4"
140 );
141         } catch (MalformedURLException JavaDoc e) {
142             throw new RuntimeException JavaDoc ("internal error");
143         }
144     }
145
146     /**
147      * Get number of results per page for this search engine.
148      * @return typical number of results per page
149      */

150     public int getResultsPerPage () {
151         return 10;
152     }
153
154     /**
155      * Search NewsBot.
156      * @param keywords list of keywords, separated by spaces
157      * @return enumeration of SearchEngineResults returned by a NewsBot query constructed from the keywords.
158      */

159     public static Search search (String JavaDoc keywords) {
160         return new Search (new NewsBot(), keywords);
161     }
162
163     /**
164      * Search NewsBot.
165      * @param keywords list of keywords, separated by spaces
166      * @param maxResults maximum number of results to return
167      * @return enumeration of SearchEngineResults returned by an NewsBot query constructed from the keywords.
168      * The enumeration yields at most maxResults objects.
169      */

170     public static Search search (String JavaDoc keywords, int maxResults) {
171         return new Search (new NewsBot(), keywords, maxResults);
172     }
173 }
174
Popular Tags