1 21 22 package nu.xom.samples; 23 24 import java.net.MalformedURLException ; 25 import java.net.URL ; 26 import java.util.HashSet ; 27 import java.util.LinkedList ; 28 import java.util.List ; 29 import java.util.Set ; 30 31 import nu.xom.Attribute; 32 import nu.xom.Builder; 33 import nu.xom.Document; 34 import nu.xom.Element; 35 import nu.xom.Elements; 36 37 38 50 public class XLinkSpider { 51 52 private Set spidered = new HashSet (); 53 private Builder parser = new Builder(); 54 private List queue = new LinkedList (); 55 56 public static final String XLINK_NS 57 = "http://www.w3.org/1999/xlink"; 58 public static final String XML_NS 59 = "http://www.w3.org/XML/1998/namespace"; 60 61 public void search(URL url) { 62 63 try { 64 String systemID = url.toExternalForm(); 65 Document doc = parser.build(systemID); 66 System.out.println(url); 67 search(doc.getRootElement(), url); 68 } 69 catch (Exception ex) { 70 } 72 73 if (queue.isEmpty()) return; 74 75 URL discovered = (URL ) queue.remove(0); 76 spidered.add(discovered); 77 search(discovered); 78 79 } 80 81 private void search(Element element, URL base) { 82 83 Attribute href = element.getAttribute("href", XLINK_NS); 84 Attribute xmlbase = element.getAttribute("base", XML_NS); 85 try { 86 if (xmlbase != null) { 87 base = new URL (base, xmlbase.getValue()); 88 } 89 } 90 catch (MalformedURLException ex) { 91 return; 94 } 95 if (href != null) { 96 String uri = href.getValue(); 97 try { 99 URL discovered = new URL (base, uri); 100 discovered = new URL ( 102 discovered.getProtocol(), 103 discovered.getHost(), 104 discovered.getFile() 105 ); 106 107 if (!spidered.contains(discovered) 108 && !queue.contains(discovered)) { 109 queue.add(discovered); 110 } 111 } 112 catch (MalformedURLException ex) { 113 } 115 } 116 Elements children = element.getChildElements(); 117 for (int i = 0; i < children.size(); i++) { 118 search(children.get(i), base); 119 } 120 121 } 122 123 public static void main(String [] args) { 124 125 XLinkSpider spider = new XLinkSpider(); 126 for (int i = 0; i < args.length; i++) { 127 try { 128 spider.search(new URL (args[i])); 129 } 130 catch (MalformedURLException ex) { 131 System.err.println(ex); 132 } 133 } 134 135 } 137 } | Popular Tags |