package nu.xom.samples;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import nu.xom.Attribute;
import nu.xom.Builder;
import nu.xom.Document;
import nu.xom.Element;
import nu.xom.Elements;
import nu.xom.Node;
import nu.xom.ProcessingInstruction;

/**
 * A spider that follows XLink <code>href</code> attributes from document
 * to document and prints the URL of each document it indexes.
 *
 * <p>Before a document is indexed or its links are followed, the
 * processing instructions that precede the root element are checked for
 * a <code>robots</code> instruction. A pseudo-attribute
 * <code>index="no"</code> suppresses printing the document's URL, and
 * <code>follow="no"</code> suppresses following its links.</p>
 */
public class PoliteSpider {

    /**
     * URLs that have been visited or handed to {@link #search(URL)}.
     * Note: <code>java.net.URL</code> equality may resolve host names,
     * so lookups in this set (and in the queue) can block on DNS.
     */
    private final Set spidered = new HashSet();

    /** Parser used to load every visited document. */
    private final Builder parser = new Builder();

    /** URLs discovered but not yet visited, in discovery order. */
    private final List queue = new LinkedList();

    /** Namespace URI of the XLink attributes. */
    public static final String XLINK_NS
      = "http://www.w3.org/1999/xlink";

    /** Namespace URI bound to the <code>xml</code> prefix. */
    public static final String XML_NS
      = "http://www.w3.org/XML/1998/namespace";

    /**
     * Visits <code>url</code> and then every document reachable from it
     * through XLink <code>href</code> attributes, in the order the links
     * were discovered. Documents that cannot be loaded are reported on
     * <code>System.err</code> and skipped.
     *
     * @param url the document at which to start spidering
     */
    public void search(URL url) {

        // Record the starting point so that a link back to it does not
        // queue it for a second visit.
        if (url != null) {
            spidered.add(url);
        }

        // Drain the queue iteratively; recursing once per document would
        // grow the call stack with the size of the crawl.
        URL current = url;
        while (true) {
            visit(current);
            if (queue.isEmpty()) {
                return;
            }
            current = (URL) queue.remove(0);
            spidered.add(current);
        }

    }

    /**
     * Loads a single document, applies any <code>robots</code>
     * processing instruction found before the root element, prints the
     * URL if indexing is allowed, and queues the document's links if
     * following is allowed.
     *
     * @param url the document to load
     */
    private void visit(URL url) {

        try {
            String systemID = url.toExternalForm();
            Document doc = parser.build(systemID);

            boolean follow = true;
            boolean index = true;
            for (int i = 0; i < doc.getChildCount(); i++) {
                Node child = doc.getChild(i);
                // Only instructions ahead of the root element count.
                if (child instanceof Element) {
                    break;
                }
                if (child instanceof ProcessingInstruction) {
                    ProcessingInstruction instruction
                      = (ProcessingInstruction) child;
                    if (instruction.getTarget().equals("robots")) {
                        Element data
                          = PseudoAttributes.getAttributes(instruction);
                        Attribute indexAtt = data.getAttribute("index");
                        if (indexAtt != null) {
                            String value = indexAtt.getValue().trim();
                            if (value.equals("no")) {
                                index = false;
                            }
                        }
                        Attribute followAtt = data.getAttribute("follow");
                        if (followAtt != null) {
                            String value = followAtt.getValue().trim();
                            if (value.equals("no")) {
                                follow = false;
                            }
                        }
                    }
                }
            }

            if (index) {
                System.out.println(url);
            }
            if (follow) {
                search(doc.getRootElement(), url);
            }
        }
        catch (Exception ex) {
            // Best effort: one unreadable or malformed document must not
            // stop the crawl, but the failure should not be invisible.
            System.err.println("Skipping " + url + ": " + ex);
        }

    }

    /**
     * Queues the target of the element's XLink <code>href</code>
     * attribute, if it has one and the target has not been seen before,
     * then does the same for all descendant elements.
     *
     * @param element the element to inspect
     * @param base    the base URL against which relative links in this
     *                element are resolved; overridden for this element
     *                and its descendants by an <code>xml:base</code>
     *                attribute
     */
    private void search(Element element, URL base) {

        Attribute href = element.getAttribute("href", XLINK_NS);
        Attribute xmlbase = element.getAttribute("base", XML_NS);
        try {
            if (xmlbase != null) {
                base = new URL(base, xmlbase.getValue());
            }
        }
        catch (MalformedURLException ex) {
            // Without a usable base, links in this subtree cannot be
            // resolved, so the whole subtree is skipped.
            return;
        }
        if (href != null) {
            String uri = href.getValue();
            try {
                URL discovered = new URL(base, uri);
                // Rebuild the URL without its fragment identifier so the
                // same document is not queued once per anchor. The port
                // must be carried over explicitly; getPort() returns -1
                // when none was given, which the constructor accepts as
                // "use the protocol default".
                discovered = new URL(
                  discovered.getProtocol(),
                  discovered.getHost(),
                  discovered.getPort(),
                  discovered.getFile()
                );

                if (!spidered.contains(discovered)
                  && !queue.contains(discovered)) {
                    queue.add(discovered);
                }
            }
            catch (MalformedURLException ex) {
                // An unresolvable link is simply not followed.
            }
        }
        Elements children = element.getChildElements();
        for (int i = 0; i < children.size(); i++) {
            search(children.get(i), base);
        }

    }

    /**
     * Spiders each URL given on the command line in turn.
     *
     * @param args the URLs at which to start spidering
     */
    public static void main(String[] args) {

        PoliteSpider spider = new PoliteSpider();
        for (int i = 0; i < args.length; i++) {
            try {
                spider.search(new URL(args[i]));
            }
            catch (MalformedURLException ex) {
                System.err.println(ex);
            }
        }

    }

}