1 16 package net.sf.jftp.tools; 17 18 import java.io.BufferedInputStream ; 19 import java.io.BufferedOutputStream ; 20 import java.io.BufferedReader ; 21 import java.io.BufferedWriter ; 22 import java.io.DataInputStream ; 23 import java.io.File ; 24 import java.io.FileOutputStream ; 25 import java.io.InputStreamReader ; 26 import java.io.OutputStreamWriter ; 27 import java.net.Socket ; 28 import java.util.Enumeration ; 29 import java.util.Hashtable ; 30 import java.util.StringTokenizer ; 31 import java.util.Vector ; 32 33 import net.sf.jftp.system.LocalIO; 34 import net.sf.jftp.system.logging.Log; 35 36 37 public class FileSearch 38 { 39 40 private int currentDepth = 0; 41 private Hashtable checked = new Hashtable (); 42 public static boolean quiet = true; 43 public static boolean ultraquiet = false; 44 45 String localDir = "."; 46 int MAX = 999999; 47 int MIN_TERM = 1; 48 int MIN_FACTOR = 1; 49 boolean LOAD = false; 50 String [] typeArray = { "" }; 51 String [] termArray = { "" }; 52 String [] optArray = { "" }; 53 String [] ignoreArray = { "" }; 54 String [] scanArray = { "" }; 55 56 57 public static void main(String argv[]) { 58 String [] typeArray = { ".gz", ".bz2", ".zip", ".rar" }; 59 String [] termArray = { "linux", "kernel" }; 60 String [] optArray = { "download", "file", "mirror", "location" }; 61 String [] ignoreArray = { ".gif", ".jpg", ".png", ".swf", ".jar", ".class", ".google." }; 62 String [] scanArray = { ".html", ".htm", "/", ".jsp", ".jhtml", ".phtml", ".asp", ".xml", ".js", ".cgi" }; 63 String url = "http://www.google.de/search?hl=de&q="; 64 65 for(int i=0; i<termArray.length; i++) { 66 url += termArray[i]+"+"; 67 } 68 69 FileSearch search = new FileSearch(); 70 71 search.typeArray = typeArray; 72 search.termArray = termArray; 73 search.optArray = optArray; 74 search.ignoreArray = ignoreArray; 75 search.scanArray = scanArray; 76 search.MIN_TERM = 1; 77 78 search.spider(url); 79 80 } 81 82 private void spider(String url) 83 { 84 try 85 { 86 if(url.indexOf("/") < 0) 87 { 88 url = url + "/"; 89 } 90 91 url = clear(url); 92 93 Log.out(">>> URL: "+url); 94 Log.out(">>> Scanning for "); 95 96 for(int i = 0; i < typeArray.length; i++) 97 { 98 Log.out(typeArray[i] + " "); 99 } 100 101 Log.out(""); 102 103 104 Log.out("Fetching initial HTML file..."); 105 106 Getter urlGetter = new Getter(localDir); 107 urlGetter.fetch(url, true); 108 109 Log.out("Searching for links..."); 110 LocalIO.pause(500); 111 112 crawl(url); 113 } 114 catch(Exception ex) 115 { 116 ex.printStackTrace(); 117 } 118 } 119 120 private String clear(String url) 121 { 122 int idx = url.indexOf("http://"); 123 124 if(idx >= 0) 125 { 126 url = url.substring(7); 127 } 128 129 return url; 130 } 131 132 private Vector addVector(Vector v, Vector x) 133 { 134 Enumeration e = x.elements(); 135 136 while(e.hasMoreElements()) 137 { 138 String next = (String ) e.nextElement(); 139 v.add(next); 140 } 141 142 return v; 143 } 144 145 private int rate(String content) { 146 int score = 0; 147 148 for(int i=0; i<termArray.length; i++) { 149 if(content.indexOf(termArray[i]) >= 0) score += 3; 150 } 151 152 if(score < MIN_TERM) return 0; 153 154 for(int i=0; i<optArray.length; i++) { 155 if(content.indexOf(optArray[i]) >= 0) score++; 156 } 157 158 return score; 159 } 160 161 private int checkForResult(String url) { 162 166 for(int i=0; i<ignoreArray.length; i++) { 167 if(url.indexOf(ignoreArray[i]) >= 0) return -1; 168 } 169 170 if(!checkForScanableUrl(url)) return -1; 171 172 return 1; 173 } 174 175 private boolean checkForScanableUrl(String url) { 176 177 if(checked.containsKey(url)) { 178 return false; 179 } 180 else { 181 checked.put(url, ""); 182 } 183 184 if(url.indexOf("/") > 0) { 185 String tmp = url.substring(0, url.indexOf("/")); 186 } 187 188 for(int i=0; i<scanArray.length; i++) { 189 if(url.endsWith(scanArray[i])) return true; 190 } 191 192 return false; 193 } 194 195 private void crawl(String url) throws Exception 196 { 197 url = clear(url); 198 199 int urlRating = checkForResult(url); 200 if(!quiet) Log.out("URL-Rating: "+url+" -> "+urlRating+" @"+currentDepth); 201 202 if(urlRating > 0) { 203 } else if(urlRating < 0 && currentDepth > 0) { 207 if(!quiet) Log.out("SKIP "+url); 208 return; 209 } 210 211 212 Getter urlGetter = new Getter(localDir); 213 String content = urlGetter.fetch(url); 214 215 int factor = rate(content); 216 if(!quiet) Log.out("Content-Rating: "+url+" -> "+factor+" @"+currentDepth); 217 218 if(factor < MIN_FACTOR) { 219 if(!quiet) Log.out("DROP: "+url); 220 return; 221 } 222 223 if(!ultraquiet) Log.out("Url: "+url+" -> "+urlRating+":"+factor+"@"+currentDepth); 224 225 Vector m = sort(content, url.substring(0, url.lastIndexOf("/")), 226 "href=\""); 227 m = addVector(m, 228 sort(content, url.substring(0, url.lastIndexOf("/")), 229 "src=\"")); 230 m = addVector(m, 231 sort(content, url.substring(0, url.lastIndexOf("/")), 232 "HREF=\"")); 233 m = addVector(m, 234 sort(content, url.substring(0, url.lastIndexOf("/")), 235 "SRC=\"")); 236 237 Enumeration links = m.elements(); 238 239 while(links.hasMoreElements()) 240 { 241 242 String next = (String ) links.nextElement(); 243 244 if(!quiet) Log.out("PROCESS: " + next); 245 boolean skip = false; 246 247 while(!skip) { 248 for(int i = 0; i < typeArray.length; i++) 249 { 250 if(next.endsWith(typeArray[i]) || 251 typeArray[i].trim().equals("*")) 252 { 253 Log.out("HIT: "+url+" -> "+next); 254 256 if(!LOAD || !checkForScanableUrl(url)) continue; 257 258 int x = next.indexOf("/"); 259 260 if((x > 0) && (next.substring(0, x).indexOf(".") > 0)) 261 { 262 Getter urlGetter2 = new Getter(localDir); 263 urlGetter2.fetch(next, false); 264 265 continue; 266 } 267 } 268 } 269 270 skip = true; 271 } 272 273 if(currentDepth < MAX) 274 { 275 276 int x = next.indexOf("/"); 277 278 if((x > 0) && (next.substring(0, x).indexOf(".") > 0)) 279 { 280 currentDepth++; 281 crawl(next); 282 currentDepth--; 283 } 284 } 285 } 286 } 287 288 private Vector sort(String content, String url, String index) 289 { 290 Vector res = new Vector (); 291 int wo = 0; 292 293 while(true) 294 { 295 wo = content.indexOf(index); 296 297 if(wo < 0) 298 { 299 return res; 300 } 301 302 content = content.substring(wo + index.length()); 303 304 String was = content.substring(0, content.indexOf("\"")); 305 306 was = createAbsoluteUrl(was, url); 307 res.add(was); 308 if(!quiet) Log.out("ADD: " + was); 309 } 310 } 311 312 private String [] check(String auswahl) 313 { 314 StringTokenizer tokenizer = new StringTokenizer (auswahl, "-", false); 315 String [] strArr = new String [tokenizer.countTokens()]; 316 int tmp = 0; 317 318 while(tokenizer.hasMoreElements()) 319 { 320 strArr[tmp] = (String ) tokenizer.nextElement(); 321 tmp++; 322 } 323 324 return strArr; 325 } 326 327 private String createAbsoluteUrl(String newLink, String baseUrl) 328 { 329 newLink = clear(newLink); 330 331 if(newLink.startsWith(baseUrl)) 332 { 333 return newLink; 334 } 335 336 if(newLink.startsWith("/") && (baseUrl.indexOf("/") > 0)) 337 { 338 newLink = baseUrl.substring(0, baseUrl.indexOf("/")) + newLink; 339 } 340 else if(newLink.startsWith("/") && (baseUrl.indexOf("/") < 0)) 341 { 342 newLink = baseUrl + newLink; 343 } 344 else if((newLink.indexOf(".") > 0)) 345 { 346 int idx = newLink.indexOf("/"); 347 String tmp = ""; 348 349 if(idx >= 0) 350 { 351 tmp = newLink.substring(0, idx); 352 } 353 354 if((tmp.indexOf(".") > 0)) 355 { 356 return clear(newLink); 357 } 358 359 if(baseUrl.endsWith("/")) 360 { 361 newLink = baseUrl + newLink; 362 } 363 else 364 { 365 newLink = baseUrl + "/" + newLink; 366 } 367 } 368 369 371 return newLink; 372 } 373 374 } 375 376 377 class Getter 378 { 379 private String localDir = null; 380 381 public Getter(String localDir) 382 { 383 this.localDir = localDir; 384 } 385 386 public String fetch(String url) 387 { 388 try 389 { 390 String host = url.substring(0, url.indexOf("/")); 391 String wo = url.substring(url.indexOf("/")); 392 String result = ""; 393 394 396 Socket deal = new Socket (host, 80); 397 deal.setSoTimeout(5000); 398 399 BufferedWriter out = new BufferedWriter (new OutputStreamWriter (deal.getOutputStream())); 400 BufferedReader in = new BufferedReader (new InputStreamReader (deal.getInputStream())); 401 402 out.write("GET http://" + url + " HTTP/1.0\n\n"); 403 out.flush(); 404 405 int len = 0; 406 407 while(!in.ready() && (len < 5000)) 408 { 409 chill(100); 410 len += 100; 411 } 412 413 while(in.ready()) 414 { 415 result = result + in.readLine(); 416 } 417 418 out.close(); 419 in.close(); 420 421 return result; 422 } 423 catch(Exception ex) 424 { 425 if(!FileSearch.quiet) ex.printStackTrace(); 426 } 427 428 return ""; 429 } 430 431 public void fetch(String url, boolean force) 432 { 433 try 434 { 435 String host = url.substring(0, url.indexOf("/")); 436 String wo = url.substring(url.indexOf("/")); 437 String result = ""; 438 439 if(!FileSearch.quiet) Log.debug(">>> " + host + wo); 440 441 File d = new File (localDir); 443 d.mkdir(); 444 445 File f = new File (localDir + wo.substring(wo.lastIndexOf("/") + 1)); 446 447 if(f.exists() && !force) 448 { 449 if(!FileSearch.quiet) Log.debug(">>> file already exists..."); 450 451 return; 452 } 453 else 454 { 455 f.delete(); 456 } 457 458 Socket deal = new Socket (host, 80); 459 BufferedWriter out = new BufferedWriter (new OutputStreamWriter (deal.getOutputStream())); 460 DataInputStream in = new DataInputStream (new BufferedInputStream (deal.getInputStream())); 461 462 BufferedOutputStream localOut = new BufferedOutputStream (new FileOutputStream (localDir + 463 wo.substring(wo.lastIndexOf("/") + 464 1))); 465 466 byte[] alu = new byte[2048]; 467 468 out.write("GET http://" + url + " HTTP/1.0\n\n"); 469 out.flush(); 470 471 boolean line = true; 472 boolean bin = false; 473 474 while(true) 475 { 476 chill(10); 477 478 String tmp = ""; 479 480 while(line) 481 { 482 String x = in.readLine(); 483 484 if(x == null) 485 { 486 break; 487 } 488 489 tmp += (x + "\n"); 490 491 if(x.equals("")) 492 { 493 line = false; 494 } 495 } 496 497 int x = in.read(alu); 498 499 if(x == -1) 500 { 501 if(line) 502 { 503 localOut.write(tmp.getBytes(), 0, tmp.length()); 504 } 505 506 out.close(); 507 in.close(); 508 localOut.flush(); 509 localOut.close(); 510 511 return; 512 } 513 else 514 { 515 localOut.write(alu, 0, x); 516 } 517 } 518 } 519 catch(Exception ex) 520 { 521 if(!FileSearch.quiet) ex.printStackTrace(); 522 } 523 } 524 525 public static void chill(int time) 526 { 527 try 528 { 529 Thread.sleep(time); 530 } 531 catch(Exception ex) 532 { 533 } 534 } 535 } 536 | Popular Tags |