KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > nu > xom > samples > PoliteSpider


1 /* Copyright 2002-2004 Elliotte Rusty Harold
2    
3    This library is free software; you can redistribute it and/or modify
4    it under the terms of version 2.1 of the GNU Lesser General Public
5    License as published by the Free Software Foundation.
6    
7    This library is distributed in the hope that it will be useful,
8    but WITHOUT ANY WARRANTY; without even the implied warranty of
9    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10    GNU Lesser General Public License for more details.
11    
12    You should have received a copy of the GNU Lesser General Public
13    License along with this library; if not, write to the
14    Free Software Foundation, Inc., 59 Temple Place, Suite 330,
15    Boston, MA 02111-1307 USA
16    
17    You can contact Elliotte Rusty Harold by sending e-mail to
18    elharo@metalab.unc.edu. Please include the word "XOM" in the
19    subject line. The XOM home page is located at http://www.xom.nu/
20 */

21
22 package nu.xom.samples;
23
24 import java.net.MalformedURLException JavaDoc;
25 import java.net.URL JavaDoc;
26 import java.util.HashSet JavaDoc;
27 import java.util.LinkedList JavaDoc;
28 import java.util.List JavaDoc;
29 import java.util.Set JavaDoc;
30
31 import nu.xom.Attribute;
32 import nu.xom.Builder;
33 import nu.xom.Document;
34 import nu.xom.Element;
35 import nu.xom.Elements;
36 import nu.xom.Node;
37 import nu.xom.ProcessingInstruction;
38
39 /**
40  *
41  * <p>
42  * Demonstrates the reading of attributes in namespaces,
43  * searching for particular processing instructions in the
44  * document prolog, and maintaining a stack of hierarchy-based
45  * state during document traversal.
46  * </p>
47  *
48  * @author Elliotte Rusty Harold
49  * @version 1.0
50  *
51  */

52 public class PoliteSpider {
53
54     private Set JavaDoc spidered = new HashSet JavaDoc();
55     private Builder parser = new Builder();
56     private List JavaDoc queue = new LinkedList JavaDoc();
57     
58     public static final String JavaDoc XLINK_NS
59      = "http://www.w3.org/1999/xlink";
60     public static final String JavaDoc XML_NS
61      = "http://www.w3.org/XML/1998/namespace";
62     
63     public void search(URL JavaDoc url) {
64         
65         try {
66             String JavaDoc systemID = url.toExternalForm();
67             Document doc = parser.build(systemID);
68             
69             boolean follow = true;
70             boolean index = true;
71             for (int i = 0; i < doc.getChildCount(); i++) {
72                 Node child = doc.getChild(i);
73                 if (child instanceof Element) break;
74                 if (child instanceof ProcessingInstruction){
75                     ProcessingInstruction instruction
76                       = (ProcessingInstruction) child;
77                     if (instruction.getTarget().equals("robots")) {
78                         Element data
79                           = PseudoAttributes.getAttributes(instruction);
80                         Attribute indexAtt = data.getAttribute("index");
81                         if (indexAtt != null) {
82                             String JavaDoc value = indexAtt.getValue().trim();
83                             if (value.equals("no")) index = false;
84                         }
85                         Attribute followAtt = data.getAttribute("follow");
86                         if (followAtt != null) {
87                             String JavaDoc value = followAtt.getValue().trim();
88                             if (value.equals("no")) follow = false;
89                         }
90                     }
91                 }
92             }
93             
94             if (index) System.out.println(url);
95             if (follow) search(doc.getRootElement(), url);
96         }
97         catch (Exception JavaDoc ex) {
98             // just skip this document
99
}
100         
101         if (queue.isEmpty()) return;
102         
103         URL JavaDoc discovered = (URL JavaDoc) queue.remove(0);
104         spidered.add(discovered);
105         search(discovered);
106         
107     }
108
109     private void search(Element element, URL JavaDoc base) {
110
111         Attribute href = element.getAttribute("href", XLINK_NS);
112         Attribute xmlbase = element.getAttribute("base", XML_NS);
113         try {
114             if (xmlbase != null) base = new URL JavaDoc(base, xmlbase.getValue());
115         }
116         catch (MalformedURLException JavaDoc ex) {
117             //Java can't handle the kind of URLs used inside this element
118
return;
119         }
120         if (href != null) {
121             String JavaDoc uri = href.getValue();
122             // absolutize URL
123
try {
124                 URL JavaDoc discovered = new URL JavaDoc(base, uri);
125                 // strip fragment identifier if any
126
discovered = new URL JavaDoc(
127                   discovered.getProtocol(),
128                   discovered.getHost(),
129                   discovered.getFile()
130                 );
131                 
132                 if (!spidered.contains(discovered)
133                   && !queue.contains(discovered)) {
134                     queue.add(discovered);
135                 }
136             }
137             catch (MalformedURLException JavaDoc ex) {
138                 // skip this one
139
}
140         }
141         Elements children = element.getChildElements();
142         for (int i = 0; i < children.size(); i++) {
143             search(children.get(i), base);
144         }
145         
146     }
147
148     public static void main(String JavaDoc[] args) {
149       
150         XLinkSpider spider = new XLinkSpider();
151         for (int i = 0; i < args.length; i++) {
152             try {
153                 spider.search(new URL JavaDoc(args[i]));
154             }
155             catch (MalformedURLException JavaDoc ex) {
156                 System.err.println(ex);
157             }
158         }
159       
160     } // end main()
161

162 }
Popular Tags