KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > websphinx > StandardClassifier


1 /*
2  * WebSphinx web-crawling toolkit
3  *
4  * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
5  * reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  *
14  * 2. Redistributions in binary form must reproduce the above copyright
15  * notice, this list of conditions and the following disclaimer in
16  * the documentation and/or other materials provided with the
17  * distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
20  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
23  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  */

32
33 package websphinx;
34
import java.net.URL;
36
37 /**
38  * Standard classifier, installed in every crawler by default.
39  * <P>On the entire page, this classifier sets the following labels:
40  * <UL>
41  * <LI><B>root</B>: page is the root page of a Web site. For instance,
42  * "http://www.digital.com/" and "http://www.digital.com/index.html" are both
43  * marked as root, but "http://www.digital.com/about" is not.
44  * </UL>
45  * <P>Also sets one or more of the following labels on every link:
46  * <UL>
47  * <LI><B>hyperlink</B>: link is a hyperlink (A, AREA, or FRAME tags) to another page on the Web (using http, file, ftp, or gopher protocols)
48  * <LI><B>image</B>: link is an inline image (IMG).
49  * <LI><B>form</B>: link is a form (FORM tag). A form generally requires some parameters to use.
50  * <LI><B>code</B>: link points to code (APPLET, EMBED, or SCRIPT).
51  * <LI><B>remote</B>: link points to a different Web server.
52  * <LI><B>local</B>: link points to the same Web server.
53  * <LI><B>same-page</B>: link points to the same page (e.g., by an anchor reference like "#top")
54  * <LI><B>sibling</B>: a local link that points to a page in the same directory (e.g. "sibling.html")
55  * <LI><B>descendent</B>: a local link that points downwards in the directory structure (e.g., "deep/deeper/deepest.html")
56  * <LI><B>ancestor</B>: a link that points upwards in the directory structure (e.g., "../..")
57  * </UL>
58  */

59 public class StandardClassifier implements Classifier {
60
61     /**
62      * Make a StandardClassifier.
63      */

64     public StandardClassifier () {
65     }
66
67     /**
68      * Classify a page.
69      * @param page Page to classify
70      */

71     // FIX: use regular expressions throughout this method
72
public void classify (Page page) {
73         Link origin = page.getOrigin ();
74         String JavaDoc pageHost = origin.getHost ();
75         int pagePort = origin.getPort ();
76         String JavaDoc pagePath = origin.getFile();
77         String JavaDoc pageFilename = origin.getFilename();
78
79         URL JavaDoc base = page.getBase ();
80         String JavaDoc baseHost = base.getHost ();
81         int basePort = base.getPort ();
82         String JavaDoc basePath = base.getFile ();
83
84         if (pageFilename.equals ("") || pageFilename.startsWith ("index.htm"))
85             page.setLabel ("root");
86
87         // FIX: Link needs to resolve "foo/bar/.." and "foo/." to "foo" in order for this
88
// stuff to work properly
89
Link[] links = page.getLinks ();
90         if (links != null) {
91             for (int i=0; i<links.length; ++i) {
92                 Link link = links[i];
93                 
94                 if ((link.getHost().equals (pageHost)
95                      && link.getPort() == pagePort)
96                     || (link.getHost().equals (baseHost)
97                         && link.getPort() == basePort)) {
98                     link.setLabel ("local");
99                     
100                     String JavaDoc linkPath = link.getFile ();
101                     
102                     if (linkPath.equals (pagePath)
103                         || linkPath.equals (basePath))
104                         link.setLabel ("same-page");
105                     else if (link.getDirectory ().equals (origin.getDirectory ()))
106                         link.setLabel ("sibling");
107                     else if (descendsFrom (linkPath, pagePath)
108                              || descendsFrom (linkPath, basePath))
109                         link.setLabel ("descendent");
110                     else if (descendsFrom (pagePath, linkPath)
111                              || descendsFrom (basePath, linkPath))
112                         link.setLabel ("ancestor");
113                     // NIY: child, parent
114
}
115                 else
116                     link.setLabel ("remote");
117
118                 // Link tag kinds: resource, form, hyperlink
119
String JavaDoc tagName = link.getTagName();
120                 
121                 if (tagName == Tag.IMG)
122                     link.setLabel ("image");
123                 else if (tagName == Tag.APPLET || tagName == Tag.EMBED || tagName == Tag.SCRIPT)
124                     link.setLabel ("code");
125                 else if (tagName == Tag.FORM)
126                     link.setLabel ("form");
127                 else if (tagName == Tag.A || tagName == Tag.AREA || tagName == Tag.FRAME) {
128                     String JavaDoc protocol = link.getProtocol ();
129                     
130                     if ((protocol.equals ("http")
131                          || protocol.equals ("ftp")
132                          || protocol.equals ("file")
133                          || protocol.equals ("gopher"))
134                         && link.getMethod() == Link.GET)
135                         link.setLabel ("hyperlink");
136                 }
137             }
138         }
139     }
140
141     private boolean descendsFrom (String JavaDoc path1, String JavaDoc path2) {
142         return path1.startsWith (path2.endsWith ("/")
143                                  ? path2
144                                  : path2 + "/");
145     }
146
147     /**
148      * Priority of this classifier.
149      */

150     public static final float priority = 0.0F;
151     
152     /**
153      * Get priority of this classifier.
154      * @return priority.
155      */

156     public float getPriority () {
157         return priority;
158     }
159 }
160
Popular Tags