KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > websphinx > searchengine > MetaCrawler


/*
 * WebSphinx web-crawling toolkit
 *
 * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

package websphinx.searchengine;

import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;

import websphinx.*;

40 /**
41  * <A HREF="http://www.metacrawler.com/">MetaCrawler</a> search engine.
42  */

43 public class MetaCrawler implements SearchEngine {
44
45     static Pattern patCount = new Regexp (
46         "Collated Results: 1 to \\d+ of (\\d+) references"
47     );
48     static Pattern patNoHits = new Regexp (
49         "Your search did not produce any results"
50     );
51
52     static Pattern patResult = new Tagexp (
53         "<dt><font color=#000000><b>(?{relevance})</b></font>" // relevance rating
54
+ "(?{link}(?{title}<a>.*?</a>))" // title and main link
55
+ "(?{description}<dt>.*?<font>)" // description
56
);
57
58     //static Pattern patMoreLink = new Regexp (
59
// "<a HREF=\"http://\\w+.metacrawler.com/crawler\\?general.*?\">\\d+</a>"
60
//);
61
static Pattern patMoreLink = new Tagexp (
62          "<a HREF=http://*.metacrawler.com/crawler\\?general*></a>"
63     );
64
65     /**
66      * Classify a page. Sets the following labels:
67      * <TABLE>
68      * <TR><TH>Name <TH>Type <TH>Meaning
69      * <TR><TD>searchengine.source <TD>Page label <TD>MetaCrawler object that labeled the page
70      * <TR><TD>searchengine.count <TD>Page field <TD>Number of results on page
71      * <TR><TD>searchengine.results <TD>Page fields <TD>Array of results. Each result region
72      * contains subfields: rank, title, description, and link.
73      * <TR><TD>searchengine.more-results <TD>Link label <TD>Link to a page containing more results.
74      * </TABLE>
75      */

76     public void classify (Page page) {
77         String JavaDoc title = page.getTitle ();
78         if (title != null && title.startsWith ("Metacrawler query:")) {
79             page.setObjectLabel ("searchengine.source", this);
80
81             Region count = patCount.oneMatch (page);
82             if (count != null)
83                 page.setField ("searchengine.count", count.getField ("0"));
84             
85             Region[] results = patResult.allMatches (page);
86             SearchEngineResult[] ser = new SearchEngineResult[results.length];
87             for (int i=0; i<results.length; ++i)
88                 ser[i] = new SearchEngineResult (results[i]);
89             page.setFields ("searchengine.results", ser);
90
91             PatternMatcher m = patMoreLink.match (page);
92             while (m.hasMoreElements ()) {
93                 Link link = (Link)m.nextMatch ();
94                 link.setLabel ("searchengine.more-results");
95                 link.setLabel ("hyperlink");
96             }
97         }
98     }
99
100     /**
101      * Priority of this classifier.
102      */

103     public static final float priority = 0.0F;
104     
105     /**
106      * Get priority of this classifier.
107      * @return priority.
108      */

109     public float getPriority () {
110         return priority;
111     }
112
113     /**
114      * Make a query URL for MetaCrawler.
115      * @param keywords list of keywords, separated by spaces
116      * @return URL that submits the keywords to MetaCrawler.
117      */

118     public URL JavaDoc makeQuery (String JavaDoc keywords) {
119         try {
120             return new URL JavaDoc("http://www.metacrawler.com/crawler?general="
121                          + URLEncoder.encode(keywords)
122                          + "&method=1&format=1&region=&rpp=20&timeout=15&hpe=10");
123         } catch (MalformedURLException JavaDoc e) {
124             throw new RuntimeException JavaDoc ("internal error");
125         }
126     }
127
128     /**
129      * Get number of results per page for this search engine.
130      * @return typical number of results per page
131      */

132     public int getResultsPerPage () {
133         return 20;
134     }
135
136     /**
137      * Search MetaCrawler.
138      * @param keywords list of keywords, separated by spaces
139      * @return enumeration of SearchEngineResults returned by an MetaCrawler query constructed from the keywords.
140      */

141     public static Search search (String JavaDoc keywords) {
142         return new Search (new MetaCrawler(), keywords);
143     }
144
145     /**
146      * Search MetaCrawler.
147      * @param keywords list of keywords, separated by spaces
148      * @param maxResults maximum number of results to return
149      * @return enumeration of SearchEngineResults returned by an MetaCrawler query constructed from the keywords.
150      * The enumeration yields at most maxResults objects.
151      */

152     public static Search search (String JavaDoc keywords, int maxResults) {
153         return new Search (new MetaCrawler(), keywords, maxResults);
154     }
155 }
156
Popular Tags