1 24 25 27 package weblech.spider; 28 29 import org.apache.log4j.Category; 30 31 import java.io.*; 32 import java.net.URL ; 33 import java.net.URLEncoder ; 34 35 import weblech.util.Log4j; 36 37 public class URLObject 38 { 39 private final static Category _logClass = Category.getInstance(URLObject.class); 40 41 static 42 { 43 Log4j.init(); 44 } 45 46 private final URL sourceURL; 47 private final String contentType; 48 private final byte[] content; 49 50 private final SpiderConfig config; 51 52 public URLObject(URL sourceURL, String contentType, byte[] content, SpiderConfig config) 53 { 54 this.sourceURL = sourceURL; 55 this.contentType = contentType; 56 this.content = content; 57 this.config = config; 58 } 59 60 public URLObject(URL sourceURL, SpiderConfig config) 61 { 62 this.sourceURL = sourceURL; 63 this.config = config; 64 65 String s = sourceURL.toExternalForm().toLowerCase(); 66 if(s.indexOf(".jpg") != -1) 67 { 68 contentType = "image/jpeg"; 69 } 70 else if(s.indexOf(".gif") != -1) 71 { 72 contentType = "image/gif"; 73 } 74 else 75 { 76 contentType = "text/html"; 77 } 78 79 if(existsOnDisk()) 80 { 81 82 File f = new File(convertToFileName()); 83 if(f.isDirectory()) 84 { 85 f = new File(f, "index.html"); 86 } 87 content = new byte[(int) f.length()]; 88 try 89 { 90 FileInputStream in = new FileInputStream(f); 91 in.read(content); 92 in.close(); 93 } 94 catch(IOException ioe) 95 { 96 _logClass.warn("IO Exception reading disk version of URL " + sourceURL, ioe); 97 } 98 } 99 else 100 { 101 content = new byte[0]; 102 } 103 } 104 105 public String getContentType() 106 { 107 return contentType; 108 } 109 110 public boolean isHTML() 111 { 112 return contentType.toLowerCase().startsWith("text/html"); 113 } 114 115 public boolean isXML() 116 { 117 return contentType.toLowerCase().startsWith("text/xml"); 118 } 119 120 public boolean isImage() 121 { 122 return contentType.startsWith("image/"); 123 } 124 125 public String getStringContent() 126 { 127 return new String (content); 128 } 129 130 private String convertToFileName() 131 { 132 String url = sourceURL.toExternalForm(); 133 int httpIdx = url.indexOf("http://"); 134 if(httpIdx == 0) 135 { 136 url = url.substring(7); 137 } 138 if(url.indexOf("/") < 0) 140 { 141 url = url + "/"; 142 } 143 if(url.endsWith("/")) 145 { 146 url = url + "index.html"; 147 } 148 url = textReplace("?", URLEncoder.encode("?"), url); 149 url = textReplace("&", URLEncoder.encode("&"), url); 150 return config.getSaveRootDirectory().getPath() + "/" + url; 151 } 152 153 public boolean existsOnDisk() 154 { 155 File f = new File(convertToFileName()); 156 return (f.exists() && !f.isDirectory()); 157 } 158 159 public void writeToFile() 160 { 161 writeToFile(convertToFileName()); 162 } 163 164 public void writeToFile(String fileName) 165 { 166 _logClass.debug("writeToFile(" + fileName + ")"); 167 try 168 { 169 File f = new File(fileName); 170 f.getParentFile().mkdirs(); 171 FileOutputStream out = new FileOutputStream(fileName); 172 out.write(content); 173 out.flush(); 174 out.close(); 175 } 176 catch(IOException ioe) 177 { 178 _logClass.warn("IO Exception writing to " + fileName, ioe); 179 } 180 } 181 182 public String toString() 183 { 184 StringBuffer sb = new StringBuffer (); 185 sb.append("URLObject: "); 186 sb.append(contentType); 187 if(false) { 189 sb.append("\n"); 190 sb.append(getStringContent()); 191 } 192 return sb.toString(); 193 } 194 195 private String textReplace(String find, String replace, String input) 196 { 197 int startPos = 0; 198 while(true) 199 { 200 int textPos = input.indexOf(find, startPos); 201 if(textPos < 0) 202 { 203 break; 204 } 205 input = input.substring(0, textPos) + replace + input.substring(textPos + find.length()); 206 startPos = textPos + replace.length(); 207 } 208 return input; 209 } 210 } 211 | Popular Tags |