KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > websphinx > searchengine > AltaVista


/*
 * WebSphinx web-crawling toolkit
 *
 * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

32
package websphinx.searchengine;

import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;

import websphinx.*;
38
39 /**
40  * <A HREF="http://altavista.digital.com/">AltaVista</a> search engine.
41  */

42 public class AltaVista implements SearchEngine {
43
44     static Pattern patCount = new Regexp (
45         "<font size=-1 face=\"arial, helvetica\">(?:About )?<b>(\\d+)</b> documents? match your query.");
46     static Pattern patNoHits = new Regexp (
47         "No documents match the query."
48     );
49
50     static Pattern patResult = new Tagexp (
51         "<dt><b>(?{rank})</b>" // rank
52
+ "(?{link}<a><b>(?{title})</b></a>)" // title and main link
53
+ "<dd>(?{description})<br>" // description
54
+ "(?:<i>(?:<a></a>)?</i><br>)+" // URL(s)
55
+ "<p>" // terminator
56
);
57
58     static Pattern patMoreLink = new Tagexp (
59         "<input type=image name=navig* value=nav.gif>"
60     );
61
62     /**
63      * Classify a page. Sets the following labels:
64      * <TABLE>
65      * <TR><TH>Name <TH>Type <TH>Meaning
66      * <TR><TD>searchengine.source <TD>Page label <TD>AltaVista object that labeled the page
67      * <TR><TD>searchengine.count <TD>Page field <TD>Number of results on page
68      * <TR><TD>searchengine.results <TD>Page fields <TD>Array of results. Each result region
69      * contains subfields: rank, title, description, and link.
70      * <TR><TD>searchengine.more-results <TD>Link label <TD>Link to a page containing more results.
71      * </TABLE>
72      */

73     public void classify (Page page) {
74         String JavaDoc title = page.getTitle ();
75         if (title != null &&
76             (title.startsWith ("AltaVista: Simple Query")
77              || title.startsWith ("AltaVista: Advanced Query"))) {
78             page.setObjectLabel ("searchengine.source", this);
79
80             Region count = patCount.oneMatch (page);
81             if (count != null)
82                 page.setField ("searchengine.count", count.getField ("0"));
83             
84             Region[] results = patResult.allMatches (page);
85             SearchEngineResult[] ser = new SearchEngineResult[results.length];
86             for (int i=0; i<results.length; ++i)
87                 ser[i] = new SearchEngineResult (results[i]);
88             page.setFields ("searchengine.results", ser);
89
90             PatternMatcher m = patMoreLink.match (page);
91             while (m.hasMoreElements ()) {
92                 Link link = (Link)m.nextMatch ();
93                 link.setLabel ("searchengine.more-results");
94                 link.setLabel ("hyperlink");
95             }
96         }
97     }
98
99     /**
100      * Priority of this classifier.
101      */

102     public static final float priority = 0.0F;
103     
104     /**
105      * Get priority of this classifier.
106      * @return priority.
107      */

108     public float getPriority () {
109         return priority;
110     }
111
112     /**
113      * Make a query URL for AltaVista.
114      * @param keywords list of keywords, separated by spaces
115      * @return URL that submits the keywords to AltaVista.
116      */

117     public URL JavaDoc makeQuery (String JavaDoc keywords) {
118         try {
119             return new URL JavaDoc("http://altavista.digital.com/cgi-bin/query?pg=q&what=web&kl=XX&q="
120                          + URLEncoder.encode(keywords));
121         } catch (MalformedURLException JavaDoc e) {
122             throw new RuntimeException JavaDoc ("internal error");
123         }
124     }
125
126     /**
127      * Get number of results per page for this search engine.
128      * @return typical number of results per page
129      */

130     public int getResultsPerPage () {
131         return 10;
132     }
133
134     /**
135      * Search AltaVista.
136      * @param keywords list of keywords, separated by spaces
137      * @return enumeration of SearchEngineResults returned by an AltaVista query constructed from the keywords.
138      */

139     public static Search search (String JavaDoc keywords) {
140         return new Search (new AltaVista(), keywords);
141     }
142
143     /**
144      * Search AltaVista.
145      * @param keywords list of keywords, separated by spaces
146      * @param maxResults maximum number of results to return
147      * @return enumeration of SearchEngineResults returned by an AltaVista query constructed from the keywords.
148      * The enumeration yields at most maxResults objects.
149      */

150     public static Search search (String JavaDoc keywords, int maxResults) {
151         return new Search (new AltaVista(), keywords, maxResults);
152     }
153 }
154
Popular Tags