1 24 25 27 package weblech.spider; 28 29 import weblech.util.Logger; 30 31 import java.io.File ; 32 import java.io.Serializable ; 33 import java.util.*; 34 import java.net.URL ; 35 import java.net.MalformedURLException ; 36 37 public class SpiderConfig extends Logger implements Serializable 38 { 39 private File saveRootDirectory; 40 private File mailtoLogFile; 41 42 private boolean refreshHTMLs; 43 private boolean refreshImages; 44 private boolean refreshOthers; 45 46 private Set htmlExtensions; 47 private Set imageExtensions; 48 49 private URL startLocation; 50 private String urlMatch; 51 52 private List interestingURLSubstrings; 53 private List boringURLSubstrings; 54 55 private boolean depthFirst; 56 private int maxDepth; 57 58 private String userAgent; 59 60 private String basicAuthUser; 61 private String basicAuthPassword; 62 63 private int spiderThreads; 64 65 private long checkpointInterval; 66 67 70 public SpiderConfig() 71 { 72 _logClass.debug("SpiderConfig()"); 73 74 saveRootDirectory = new File ("."); 75 mailtoLogFile = new File ("mailto.txt"); 76 77 refreshHTMLs = true; 78 refreshImages = false; 79 refreshOthers = false; 80 81 htmlExtensions = new HashSet(); 82 htmlExtensions.add("htm"); 83 htmlExtensions.add("html"); 84 htmlExtensions.add("shtml"); 85 86 imageExtensions = new HashSet(); 87 imageExtensions.add("jpg"); 88 imageExtensions.add("gif"); 89 imageExtensions.add("png"); 90 91 urlMatch = null; 92 interestingURLSubstrings = new ArrayList(); 93 boringURLSubstrings = new ArrayList(); 94 depthFirst = false; 95 maxDepth = 0; 96 97 userAgent = "WebLech Spider 0.01alpha"; 98 basicAuthUser = ""; 99 basicAuthPassword = ""; 100 101 spiderThreads = 1; 102 103 checkpointInterval = 0; 104 } 105 106 109 public SpiderConfig(Properties props) 110 { 111 _logClass.debug("SpiderConfig(props)"); 112 113 saveRootDirectory = new File (props.getProperty("saveRootDirectory", ".")); 114 if(!saveRootDirectory.exists()) 115 { 116 if(!saveRootDirectory.mkdirs()) 117 { 118 _logClass.error("Couldn't create root directory: " + saveRootDirectory); 119 _logClass.info("Defaulting to . instead"); 120 saveRootDirectory = new File ("."); 121 } 122 } 123 else if(!saveRootDirectory.isDirectory()) 124 { 125 _logClass.error("Save root is not a directory: " + saveRootDirectory); 126 _logClass.info("Defaulting to . instead"); 127 saveRootDirectory = new File ("."); 128 } 129 130 String mailtoFileStr = props.getProperty("mailtoLogFile", "mailto.txt"); 131 if(mailtoFileStr.indexOf(":") != -1 || mailtoFileStr.startsWith("/") || mailtoFileStr.startsWith("\\")) 133 { 134 _logClass.debug("Using absolute file name " + mailtoFileStr); 135 mailtoLogFile = new File (mailtoFileStr); 136 } 137 else 138 { 139 _logClass.debug("Constructing relative file name " + saveRootDirectory.getPath() + "/" + mailtoFileStr); 140 mailtoLogFile = new File (saveRootDirectory.getPath() + "/" + mailtoFileStr); 141 } 142 143 refreshHTMLs = Boolean.valueOf(props.getProperty("refreshHTMLs", "true")).booleanValue(); 144 refreshImages = Boolean.valueOf(props.getProperty("refreshImages", "false")).booleanValue(); 145 refreshOthers = Boolean.valueOf(props.getProperty("refreshOthers", "false")).booleanValue(); 146 147 htmlExtensions = parseSet(props.getProperty("htmlExtensions", "htm,html,shtml")); 148 imageExtensions = parseSet(props.getProperty("imageExtensions", "jpg,gif,png")); 149 150 String startLocStr = props.getProperty("startLocation"); 151 if(startLocStr != null) 152 { 153 try 154 { 155 startLocation = new URL (startLocStr); 156 } 157 catch(MalformedURLException murle) 158 { 159 _logClass.error("Caught MalformedURLException parsing start URL '" + startLocStr + "' : " + murle.getMessage(), murle); 160 } 161 } 162 else 163 { 164 _logClass.warn("startLocation not found in properties"); 165 } 166 167 urlMatch = props.getProperty("urlMatch"); 168 169 interestingURLSubstrings = parsePropCommaSeparated(props.getProperty("interestingURLs")); 170 boringURLSubstrings = parsePropCommaSeparated(props.getProperty("boringURLs")); 171 172 depthFirst = Boolean.valueOf(props.getProperty("depthFirst", "false")).booleanValue(); 173 try 174 { 175 String maxDepthStr = props.getProperty("maxDepth", "0"); 176 maxDepth = Integer.parseInt(maxDepthStr); 177 } 178 catch(NumberFormatException nfe) 179 { 180 _logClass.error("Caught number format exception parsing max depth, defaulting to 1", nfe); 181 maxDepth = 1; 182 } 183 184 userAgent = props.getProperty("userAgent", "WebLech Spider 0.01alpha"); 185 basicAuthUser = props.getProperty("basicAuthUser", ""); 186 basicAuthPassword = props.getProperty("basicAuthPassword", ""); 187 188 try 189 { 190 String threadsStr = props.getProperty("spiderThreads", "1"); 191 spiderThreads = Integer.parseInt(threadsStr); 192 } 193 catch(NumberFormatException nfe) 194 { 195 _logClass.error("Caught number format exception parsing number of threads, defaulting to 1", nfe); 196 spiderThreads = 1; 197 } 198 199 try 200 { 201 String intervalStr = props.getProperty("checkpointInterval", "0"); 202 checkpointInterval = Long.parseLong(intervalStr); 203 } 204 catch(NumberFormatException nfe) 205 { 206 _logClass.error("Caught number format exception parsing checkpoint interval, defaulting to 0", nfe); 207 spiderThreads = 1; 208 } 209 } 210 211 private List parsePropCommaSeparated(String str) 212 { 213 ArrayList result = new ArrayList(); 214 if(str != null && str.length() > 0) 215 { 216 StringTokenizer tok = new StringTokenizer(str, ","); 217 while(tok.hasMoreTokens()) 218 { 219 result.add(tok.nextToken()); 220 } 221 } 222 return result; 223 } 224 225 226 public void setRefreshHTMLs(boolean refreshHTMLs) 227 { 228 this.refreshHTMLs = refreshHTMLs; 229 } 230 231 public boolean refreshHTMLs() 232 { 233 return refreshHTMLs; 234 } 235 236 public void setRefreshImages(boolean refreshImages) 237 { 238 this.refreshImages = refreshImages; 239 } 240 241 public boolean refreshImages() 242 { 243 return refreshImages; 244 } 245 246 public void setRefreshOthers(boolean refreshOthers) 247 { 248 this.refreshOthers = refreshOthers; 249 } 250 251 public boolean refreshOthers() 252 { 253 return refreshOthers; 254 } 255 256 public void setSaveRootDirectory(File saveRootDirectory) 257 { 258 this.saveRootDirectory = saveRootDirectory; 259 } 260 261 public File getSaveRootDirectory() 262 { 263 return saveRootDirectory; 264 } 265 266 public void setMailtoLogFile(File mailtoLogFile) 267 { 268 this.mailtoLogFile = mailtoLogFile; 269 } 270 271 public File getMailtoLogFile() 272 { 273 return mailtoLogFile; 274 } 275 276 public void setStartLocation(URL startLocation) 277 { 278 this.startLocation = startLocation; 279 } 280 281 public URL getStartLocation() 282 { 283 return startLocation; 284 } 285 286 public void setURLMatch(String urlMatch) 287 { 288 this.urlMatch = urlMatch; 289 } 290 291 public String getURLMatch() 292 { 293 return urlMatch; 294 } 295 296 public List getInterestingURLSubstrings() 297 { 298 return interestingURLSubstrings; 299 } 300 301 public void setInterestingURLSubstrings(List interestingURLSubstrings) 302 { 303 this.interestingURLSubstrings = interestingURLSubstrings; 304 } 305 306 public List getBoringURLSubstrings() 307 { 308 return boringURLSubstrings; 309 } 310 311 public void setBoringURLSubstrings(List boringURLSubstrings) 312 { 313 this.boringURLSubstrings = boringURLSubstrings; 314 } 315 316 public boolean isInteresting(URL u) 317 { 318 return matchURL(u, interestingURLSubstrings); 319 } 320 321 public boolean isBoring(URL u) 322 { 323 return matchURL(u, boringURLSubstrings); 324 } 325 326 private boolean matchURL(URL u, List substrings) 327 { 328 String str = u.toExternalForm(); 329 for(Iterator i = substrings.iterator(); i.hasNext(); ) 330 { 331 String substr = (String ) i.next(); 332 if(str.indexOf(substr) != -1) 333 { 334 return true; 335 } 336 } 337 return false; 338 } 339 340 public void setDepthFirstSearch(boolean depthFirst) 341 { 342 this.depthFirst = depthFirst; 343 } 344 345 public boolean isDepthFirstSearch() 346 { 347 return depthFirst; 348 } 349 350 public void setMaxDepth(int maxDepth) 351 { 352 this.maxDepth = maxDepth; 353 } 354 355 public int getMaxDepth() 356 { 357 return maxDepth; 358 } 359 360 public void setUserAgent(String userAgent) 361 { 362 this.userAgent = userAgent; 363 } 364 365 public String getUserAgent() 366 { 367 return userAgent; 368 } 369 370 public void setBasicAuthUser(String basicAuthUser) 371 { 372 this.basicAuthUser = basicAuthUser; 373 } 374 375 public String getBasicAuthUser() 376 { 377 return basicAuthUser; 378 } 379 380 public void setBasicAuthPassword(String basicAuthPassword) 381 { 382 this.basicAuthPassword = basicAuthPassword; 383 } 384 385 public String getBasicAuthPassword() 386 { 387 return basicAuthPassword; 388 } 389 390 public void setSpiderThreads(int spiderThreads) 391 { 392 this.spiderThreads = spiderThreads; 393 } 394 395 public int getSpiderThreads() 396 { 397 return spiderThreads; 398 } 399 400 public void setCheckpointInterval(long interval) 401 { 402 this.checkpointInterval = interval; 403 } 404 405 public long getCheckpointInterval() 406 { 407 return checkpointInterval; 408 } 409 410 public String toString() 411 { 412 return "depthFirst:\t" + depthFirst 413 + "\nmaxDepth:\t" + maxDepth 414 + "\nhtmlExtensions:\t" + fromSet(htmlExtensions) 415 + "\nimageExtensions:\t" + fromSet(imageExtensions) 416 + "\nrefreshHTMLs:\t" + refreshHTMLs 417 + "\nrefreshImages:\t" + refreshImages 418 + "\nrefreshOthers:\t" + refreshOthers 419 + "\nsaveRootDirectory:\t" + saveRootDirectory 420 + "\nstartLocation:\t" + startLocation 421 + "\nurlMatch:\t" + urlMatch 422 + "\nuserAgent:\t" + userAgent 423 + "\nbasicAuthUser:\t" + basicAuthUser 424 + "\nbasicAuthPassword:\t" + "***" 425 + "\nspiderThreads:\t" + spiderThreads 426 + "\ncheckpointInterval:\t" + checkpointInterval; 427 } 428 429 private Set parseSet(String str) 430 { 431 _logClass.debug("parseSet(" + str + ")"); 432 HashSet result = new HashSet(); 433 StringTokenizer sTok = new StringTokenizer(str, ","); 434 while(sTok.hasMoreTokens()) 435 { 436 String tok = sTok.nextToken().trim(); 437 result.add(tok); 438 } 439 return result; 440 } 441 442 private String fromSet(Set s) 443 { 444 StringBuffer sb = new StringBuffer (); 445 boolean first = true; 446 for(Iterator i = s.iterator(); i.hasNext(); ) 447 { 448 String str = (String ) i.next(); 449 if(first) 450 { 451 first = false; 452 } 453 else 454 { 455 sb.append(","); 456 } 457 sb.append(str); 458 } 459 return sb.toString(); 460 } 461 462 } | Popular Tags |