1 package bplatt.spider; 2 3 21 22 import java.io.*; 23 import java.net.*; 24 import java.util.*; 25 import javax.swing.text.*; 26 import javax.swing.text.html.*; 27 28 public class PageInfo { 29 private URL url; 30 private URL parentUrl; 31 private String title; 32 private URL[] links; 33 private URL[] images; 34 private boolean valid; 35 private int responseCode; 36 private String contentType; 37 private int contentLength; 38 private final static URL[] dummy = new URL[1]; 39 private final static String HTML = "text/html"; 40 41 42 public PageInfo(URL url, URL parentUrl, String contentType, int contentLength, int responseCode) { 43 this.url = url; 44 this.parentUrl = parentUrl; 45 this.contentType = contentType; 46 this.contentLength = contentLength; 47 this.responseCode = responseCode; 48 valid = false; 49 } 50 51 public URL getUrl() { return(url); } 53 public URL getParentUrl() { return(parentUrl); } 54 public String getTitle() { return(title); } 55 public URL[] getLinks() { return(links); } 56 public URL[] getImages() { return(images); } 57 public String getContentType() { return(contentType); } 58 public boolean isValid() { return(valid); } 59 public int getResponseCode() { return responseCode; } 60 61 62 public void extract(Reader reader) throws IOException 63 { 64 if (reader == null || url == null || 66 responseCode != HttpURLConnection.HTTP_OK || 67 contentLength == 0 || contentType.equalsIgnoreCase(HTML) == false) { 68 valid = false; 69 return; 70 } 71 WebPageXtractor x = new WebPageXtractor(); 72 try { x.parse(reader); } 73 catch(EOFException e) { 74 valid = false; 75 return; 76 } 77 catch(SocketTimeoutException e) { 78 valid = false; 79 throw(e); 80 } 81 catch(IOException e) { 82 valid = false; 83 return; 84 } 85 ArrayList rawlinks = x.getLinks(); 86 ArrayList rawimages = x.getImages(); 87 88 ArrayList rawtitle = x.getTitle(); 90 if (rawtitle.isEmpty()) title = null; 91 else title = new String ((String )rawtitle.get(0)); 92 93 int numelem = rawlinks.size(); 95 if (numelem == 0) links = null; 96 else { 97 ArrayList t = new ArrayList(); 98 for (int i=0; i<numelem; ++i) { 99 String slink = (String )rawlinks.get(i); 100 try { 101 URL link = new URL(url,slink); 102 t.add(link); 103 } 104 catch(MalformedURLException e) { } 105 } 106 if (t.isEmpty()) links = null; 107 else links = (URL[])t.toArray(dummy); 108 } 109 110 numelem = rawimages.size(); 112 if (numelem == 0) images = null; 113 else { 114 ArrayList t = new ArrayList(); 115 for (int i=0; i<numelem; ++i) { 116 String simage = (String )rawimages.get(i); 117 try { 118 URL image = new URL(url,simage); 119 t.add(image); 120 } 121 catch(MalformedURLException e) { } 122 } 123 if (t.isEmpty()) images = null; 124 else images = (URL[])t.toArray(dummy); 125 } 126 127 valid = true; 129 } 130 131 132 public void dump() { 133 System.out.println("URL: "+url); 134 System.out.println("Parent URL: "+parentUrl); 135 System.out.println("Title: "+title); 136 if (links != null) { 137 System.out.print("Links: ["); 138 for (int i=0; i<links.length; ++i) { 139 System.out.print(links[i]); 140 if (i<(links.length-1)) System.out.print(", "); 141 } 142 System.out.println("]"); 143 } 144 if (images != null) { 145 System.out.print("Images: ["); 146 for (int i=0; i<images.length; ++i) { 147 System.out.print(images[i]); 148 if (i<(images.length-1)) System.out.print(", "); 149 } 150 System.out.println("]"); 151 } 152 System.out.println("Valid: "+valid); 153 System.out.println("Response Code: "+responseCode); 154 System.out.println("Content Type: "+contentType); 155 System.out.println("Content Length: "+contentLength); 156 } 157 } 158 | Popular Tags |