KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > websphinx > searchengine > NewsIndex


1 /*
2  * WebSphinx web-crawling toolkit
3  *
4  * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
5  * reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  *
14  * 2. Redistributions in binary form must reproduce the above copyright
15  * notice, this list of conditions and the following disclaimer in
16  * the documentation and/or other materials provided with the
17  * distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
20  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
23  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  */

32
33 package websphinx.searchengine;
34
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import websphinx.*;
39
40 /**
41  * <A HREF="http://www.newsindex.com/">NewsIndex</a> search engine.
42  */

43 public class NewsIndex implements SearchEngine {
44
45     static Pattern patCount = new Regexp (
46         "<center>Headlines\\s+\\d+\\s+to\\s+\\d+\\s+of\\s+(\\d+)</center>"
47     );
48     static Pattern patNoHits = new Regexp (
49         "No articles were found matching your search criteria"
50     );
51
52     static Pattern patResult = new Tagexp (
53       "<dd>(?{link}(?{title}<a>.*?</a>))" // title and link
54
+ "<blockquote><b></b>" // news source
55
+ "(?{description})</blockquote>" // description and index date
56
);
57
58     /**
59      * Classify a page. Sets the following labels:
60      * <TABLE>
61      * <TR><TH>Name <TH>Type <TH>Meaning
62      * <TR><TD>searchengine.source <TD>Page label <TD>NewsIndex object that labeled the page
63      * <TR><TD>searchengine.count <TD>Page field <TD>Number of results on page
64      * <TR><TD>searchengine.results <TD>Page fields <TD>Array of results. Each result region
65      * contains subfields: title, description, and link.
66      * <TR><TD>searchengine.more-results <TD>Link label <TD>Link to a page containing more results.
67      * </TABLE>
68      */

69     public void classify (Page page) {
70         String JavaDoc title = page.getTitle ();
71         if (title != null && title.equals ("News Index - Results")) {
72             page.setObjectLabel ("searchengine.source", this);
73
74             Region count = patCount.oneMatch (page);
75             if (count != null)
76                 page.setField ("searchengine.count", count.getField ("0"));
77             
78             Region[] results = patResult.allMatches (page);
79             SearchEngineResult[] ser = new SearchEngineResult[results.length];
80             for (int i=0; i<results.length; ++i)
81                 ser[i] = new SearchEngineResult (results[i]);
82             page.setFields ("searchengine.results", ser);
83
84             // find "more" link
85
Link[] links = page.getLinks ();
86             for (int i=0; i<links.length; ++i) {
87                 if (links[i].toText().equals ("Next 10 Headlines")) {
88                     links[i].setLabel ("searchengine.more-results");
89                     links[i].setLabel ("hyperlink");
90                     break;
91                 }
92             }
93         }
94     }
95
96     /**
97      * Priority of this classifier.
98      */

99     public static final float priority = 0.0F;
100     
101     /**
102      * Get priority of this classifier.
103      * @return priority.
104      */

105     public float getPriority () {
106         return priority;
107     }
108
109     /**
110      * Make a query URL for NewsIndex.
111      * @param keywords list of keywords, separated by spaces
112      * @return URL that submits the keywords to NewsIndex.
113      */

114     public URL JavaDoc makeQuery (String JavaDoc keywords) {
115         try {
116             return new URL JavaDoc("http://www.newsindex.com/cgi-bin/process.cgi?mode=any&query="
117                          + URLEncoder.encode(keywords));
118         } catch (MalformedURLException JavaDoc e) {
119             throw new RuntimeException JavaDoc ("internal error");
120         }
121     }
122
123     /**
124      * Get number of results per page for this search engine.
125      * @return typical number of results per page
126      */

127     public int getResultsPerPage () {
128         return 10;
129     }
130
131     /**
132      * Search NewsIndex.
133      * @param keywords list of keywords, separated by spaces
134      * @return enumeration of SearchEngineResults returned by an NewsIndex query constructed from the keywords.
135      */

136     public static Search search (String JavaDoc keywords) {
137         return new Search (new NewsIndex(), keywords);
138     }
139
140     /**
141      * Search NewsIndex.
142      * @param keywords list of keywords, separated by spaces
143      * @param maxResults maximum number of results to return
144      * @return enumeration of SearchEngineResults returned by an NewsIndex query constructed from the keywords.
145      * The enumeration yields at most maxResults objects.
146      */

147     public static Search search (String JavaDoc keywords, int maxResults) {
148         return new Search (new NewsIndex(), keywords, maxResults);
149     }
150 }
151
Popular Tags