|                                                                                                              1
 21
 22  package nu.xom.samples;
 23
 24  import java.net.MalformedURLException
  ; 25  import java.net.URL
  ; 26  import java.util.HashSet
  ; 27  import java.util.LinkedList
  ; 28  import java.util.List
  ; 29  import java.util.Set
  ; 30
 31  import nu.xom.Attribute;
 32  import nu.xom.Builder;
 33  import nu.xom.Document;
 34  import nu.xom.Element;
 35  import nu.xom.Elements;
 36  import nu.xom.Node;
 37  import nu.xom.ProcessingInstruction;
 38
 39
 52  public class PoliteSpider {
 53
 54      private Set
  spidered = new HashSet  (); 55      private Builder parser = new Builder();
 56      private List
  queue = new LinkedList  (); 57
 58      public static final String
  XLINK_NS 59       = "http://www.w3.org/1999/xlink";
 60      public static final String
  XML_NS 61       = "http://www.w3.org/XML/1998/namespace";
 62
 63      public void search(URL
  url) { 64
 65          try {
 66              String
  systemID = url.toExternalForm(); 67              Document doc = parser.build(systemID);
 68
 69              boolean follow = true;
 70              boolean index = true;
 71              for (int i = 0; i < doc.getChildCount(); i++) {
 72                  Node child = doc.getChild(i);
 73                  if (child instanceof Element) break;
 74                  if (child instanceof ProcessingInstruction){
 75                      ProcessingInstruction instruction
 76                        = (ProcessingInstruction) child;
 77                      if (instruction.getTarget().equals("robots")) {
 78                          Element data
 79                            = PseudoAttributes.getAttributes(instruction);
 80                          Attribute indexAtt = data.getAttribute("index");
 81                          if (indexAtt != null) {
 82                              String
  value = indexAtt.getValue().trim(); 83                              if (value.equals("no")) index = false;
 84                          }
 85                          Attribute followAtt = data.getAttribute("follow");
 86                          if (followAtt != null) {
 87                              String
  value = followAtt.getValue().trim(); 88                              if (value.equals("no")) follow = false;
 89                          }
 90                      }
 91                  }
 92              }
 93
 94              if (index) System.out.println(url);
 95              if (follow) search(doc.getRootElement(), url);
 96          }
 97          catch (Exception
  ex) { 98                      }
 100
 101         if (queue.isEmpty()) return;
 102
 103         URL
  discovered = (URL  ) queue.remove(0); 104         spidered.add(discovered);
 105         search(discovered);
 106
 107     }
 108
 109     private void search(Element element, URL
  base) { 110
 111         Attribute href = element.getAttribute("href", XLINK_NS);
 112         Attribute xmlbase = element.getAttribute("base", XML_NS);
 113         try {
 114             if (xmlbase != null) base = new URL
  (base, xmlbase.getValue()); 115         }
 116         catch (MalformedURLException
  ex) { 117                         return;
 119         }
 120         if (href != null) {
 121             String
  uri = href.getValue(); 122                         try {
 124                 URL
  discovered = new URL  (base, uri); 125                                 discovered = new URL
  ( 127                   discovered.getProtocol(),
 128                   discovered.getHost(),
 129                   discovered.getFile()
 130                 );
 131
 132                 if (!spidered.contains(discovered)
 133                   && !queue.contains(discovered)) {
 134                     queue.add(discovered);
 135                 }
 136             }
 137             catch (MalformedURLException
  ex) { 138                             }
 140         }
 141         Elements children = element.getChildElements();
 142         for (int i = 0; i < children.size(); i++) {
 143             search(children.get(i), base);
 144         }
 145
 146     }
 147
 148     public static void main(String
  [] args) { 149
 150         XLinkSpider spider = new XLinkSpider();
 151         for (int i = 0; i < args.length; i++) {
 152             try {
 153                 spider.search(new URL
  (args[i])); 154             }
 155             catch (MalformedURLException
  ex) { 156                 System.err.println(ex);
 157             }
 158         }
 159
 160     }
 162 }
                                                                                                                                                                                                             |                                                                       
 
 
 
 
 
                                                                                   Popular Tags                                                                                                                                                                                              |