1 22 package org.archive.crawler.extractor; 23 24 import java.io.IOException ; 25 import java.io.InputStream ; 26 import java.util.regex.Matcher ; 27 28 import javax.management.AttributeNotFoundException ; 29 30 import org.archive.crawler.datamodel.CoreAttributeConstants; 31 import org.archive.crawler.datamodel.CrawlURI; 32 import org.archive.crawler.settings.SimpleType; 33 import org.archive.crawler.settings.Type; 34 import org.archive.net.UURI; 35 import org.archive.util.TextUtils; 36 37 55 public class ExtractorUniversal extends Extractor 56 implements CoreAttributeConstants { 57 58 private static final long serialVersionUID = -7593380118857156939L; 59 60 63 private static String ATTR_MAX_DEPTH_BYTES = "max-depth-bytes"; 64 65 68 private static long DEFAULT_MAX_DEPTH_BYTES = 10240; 69 70 private static String ATTR_MAX_URL_LENGTH = "max-url-length"; 71 72 73 private static long DEFAULT_MAX_URL_LENGTH = UURI.MAX_URL_LENGTH; 74 75 81 static final String IP_ADDRESS = 82 "((http://)|(https://))(\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?)"; 83 84 89 public static final String TLDs = 90 "(ac(/.*)?)" + "|(ad(/.*)?)" + "|(ae(/.*)?)" + "|(af(/.*)?)" + "|(ag(/.*)?)" + "|(ai(/.*)?)" + "|(al(/.*)?)" + "|(am(/.*)?)" + "|(an(/.*)?)" + "|(ao(/.*)?)" + "|(aero(/.*)?)" + "|(aq(/.*)?)" + "|(ar(/.*)?)" + "|(as(/.*)?)" + "|(at(/.*)?)" + "|(au(/.*)?)" + "|(aw(/.*)?)" + "|(az(/.*)?)" + "|(ba(/.*)?)" + "|(bb(/.*)?)" + "|(bd(/.*)?)" + "|(be(/.*)?)" + "|(bf(/.*)?)" + "|(bg(/.*)?)" + "|(bh(/.*)?)" + "|(bi(/.*)?)" + "|(biz(/.*)?)" + "|(bj(/.*)?)" + "|(bm(/.*)?)" + "|(bn(/.*)?)" + "|(bo(/.*)?)" + "|(br(/.*)?)" + "|(bs(/.*)?)" + "|(bt(/.*)?)" + "|(bv(/.*)?)" + "|(bw(/.*)?)" + "|(by(/.*)?)" + "|(bz(/.*)?)" + "|(ca(/.*)?)" + "|(cc(/.*)?)" + "|(cd(/.*)?)" + "|(cf(/.*)?)" + "|(cg(/.*)?)" + "|(ch(/.*)?)" + "|(ci(/.*)?)" + "|(ck(/.*)?)" + "|(cl(/.*)?)" + "|(cm(/.*)?)" + "|(cn(/.*)?)" + "|(co(/.*)?)" + "|(com(/.*)?)" + "|(coop(/.*)?)" + "|(cr(/.*)?)" + "|(cs(/.*)?)" + "|(cu(/.*)?)" + "|(cv(/.*)?)" + "|(cx(/.*)?)" + "|(cy(/.*)?)" + "|(cz(/.*)?)" + "|(de(/.*)?)" + "|(dj(/.*)?)" + "|(dk(/.*)?)" + "|(dm(/.*)?)" + "|(do(/.*)?)" + "|(dz(/.*)?)" + "|(ec(/.*)?)" + "|(edu(/.*)?)" + "|(ee(/.*)?)" + "|(eg(/.*)?)" + "|(eh(/.*)?)" + "|(er(/.*)?)" + "|(es(/.*)?)" + "|(et(/.*)?)" + "|(fi(/.*)?)" + "|(fj(/.*)?)" + "|(fk(/.*)?)" + "|(fm(/.*)?)" + "|(fo(/.*)?)" + "|(fr(/.*)?)" + "|(ga(/.*)?)" + "|(gd(/.*)?)" + "|(ge(/.*)?)" + "|(gf(/.*)?)" + "|(gg(/.*)?)" + "|(gh(/.*)?)" + "|(gi(/.*)?)" + "|(gl(/.*)?)" + "|(gm(/.*)?)" + "|(gn(/.*)?)" + "|(gov(/.*)?)" + "|(gp(/.*)?)" + "|(gq(/.*)?)" + "|(gr(/.*)?)" + "|(gs(/.*)?)" + "|(gt(/.*)?)" + "|(gu(/.*)?)" + "|(gw(/.*)?)" + "|(gy(/.*)?)" + "|(hk(/.*)?)" + "|(hm(/.*)?)" + "|(hn(/.*)?)" + "|(hr(/.*)?)" + "|(ht(/.*)?)" + "|(hu(/.*)?)" + "|(id(/.*)?)" + "|(ie(/.*)?)" + "|(il(/.*)?)" + "|(im(/.*)?)" + "|(in(/.*)?)" + "|(info(/.*)?)" + "|(int(/.*)?)" + "|(io(/.*)?)" + "|(iq(/.*)?)" + "|(ir(/.*)?)" + "|(is(/.*)?)" + "|(it(/.*)?)" + "|(je(/.*)?)" + "|(jm(/.*)?)" + "|(jo(/.*)?)" + "|(jp(/.*)?)" + "|(ke(/.*)?)" + "|(kg(/.*)?)" + "|(kh(/.*)?)" + "|(ki(/.*)?)" + "|(km(/.*)?)" + "|(kn(/.*)?)" + "|(kp(/.*)?)" + "|(kr(/.*)?)" + "|(kw(/.*)?)" + "|(ky(/.*)?)" + "|(kz(/.*)?)" + "|(la(/.*)?)" + "|(lb(/.*)?)" + "|(lc(/.*)?)" + "|(li(/.*)?)" + "|(lk(/.*)?)" + "|(lr(/.*)?)" + "|(ls(/.*)?)" + "|(lt(/.*)?)" + "|(lu(/.*)?)" + "|(lv(/.*)?)" + "|(ly(/.*)?)" + "|(ma(/.*)?)" + "|(mc(/.*)?)" + "|(md(/.*)?)" + "|(mg(/.*)?)" + "|(mh(/.*)?)" + "|(mil(/.*)?)" + "|(mk(/.*)?)" + "|(ml(/.*)?)" + "|(mm(/.*)?)" + "|(mn(/.*)?)" + "|(mo(/.*)?)" + "|(mp(/.*)?)" + "|(mq(/.*)?)" + "|(mr(/.*)?)" + "|(ms(/.*)?)" + "|(mt(/.*)?)" + "|(mu(/.*)?)" + "|(museum(/.*)?)" + "|(mv(/.*)?)" + "|(mw(/.*)?)" + "|(mx(/.*)?)" + "|(my(/.*)?)" + "|(mz(/.*)?)" + "|(na(/.*)?)" + "|(name(/.*)?)" + "|(nc(/.*)?)" + "|(ne(/.*)?)" + "|(net(/.*)?)" + "|(nf(/.*)?)" + "|(ng(/.*)?)" + "|(ni(/.*)?)" + "|(nl(/.*)?)" + "|(no(/.*)?)" + "|(np(/.*)?)" + "|(nr(/.*)?)" + "|(nt(/.*)?)" + "|(nu(/.*)?)" + "|(nz(/.*)?)" + "|(om(/.*)?)" + "|(org(/.*)?)" + "|(pa(/.*)?)" + "|(pe(/.*)?)" + "|(pf(/.*)?)" + "|(pg(/.*)?)" + "|(ph(/.*)?)" + "|(pk(/.*)?)" + "|(pl(/.*)?)" + "|(pm(/.*)?)" + "|(pn(/.*)?)" + "|(pr(/.*)?)" + "|(pro(/.*)?)" + "|(ps(/.*)?)" + "|(pt(/.*)?)" + "|(pw(/.*)?)" + "|(py(/.*)?)" + "|(qa(/.*)?)" + "|(re(/.*)?)" + "|(ro(/.*)?)" + "|(ru(/.*)?)" + "|(rw(/.*)?)" + "|(sa(/.*)?)" + "|(sb(/.*)?)" + "|(sc(/.*)?)" + "|(sd(/.*)?)" + "|(se(/.*)?)" + "|(sg(/.*)?)" + "|(sh(/.*)?)" + "|(si(/.*)?)" + "|(sj(/.*)?)" + "|(sk(/.*)?)" + "|(sl(/.*)?)" + "|(sm(/.*)?)" + "|(sn(/.*)?)" + "|(so(/.*)?)" + "|(sr(/.*)?)" + "|(sv(/.*)?)" + "|(st(/.*)?)" + "|(sy(/.*)?)" + "|(sz(/.*)?)" + "|(tc(/.*)?)" + "|(td(/.*)?)" + "|(tf(/.*)?)" + "|(tg(/.*)?)" + "|(th(/.*)?)" + "|(tj(/.*)?)" + "|(tk(/.*)?)" + "|(tm(/.*)?)" + "|(tn(/.*)?)" + "|(to(/.*)?)" + "|(tp(/.*)?)" + "|(tr(/.*)?)" + "|(tt(/.*)?)" + "|(tv(/.*)?)" + "|(tw(/.*)?)" + "|(tz(/.*)?)" + "|(ua(/.*)?)" + "|(ug(/.*)?)" + "|(uk(/.*)?)" + "|(um(/.*)?)" + "|(us(/.*)?)" + "|(uy(/.*)?)" + "|(uz(/.*)?)" + "|(va(/.*)?)" + "|(vc(/.*)?)" + "|(ve(/.*)?)" + "|(vg(/.*)?)" + "|(vi(/.*)?)" + "|(vn(/.*)?)" + "|(vu(/.*)?)" + "|(wf(/.*)?)" + "|(ws(/.*)?)" + "|(ye(/.*)?)" + "|(yt(/.*)?)" + "|(yu(/.*)?)" + "|(za(/.*)?)" + "|(zm(/.*)?)" + "|(zw(/.*)?)" ; 350 351 protected long numberOfCURIsHandled = 0; 352 protected long numberOfLinksExtracted= 0; 353 354 358 public ExtractorUniversal(String name) { 359 super(name, "Link extraction on unknown file types. A best effort" + 360 " extractor that looks at the raw byte code of any file " + 361 "that has not been handled by another extractor and tries" + 362 " to find URIs. Will only match absolute URIs."); 363 Type e; 364 e = addElementToDefinition(new SimpleType(ATTR_MAX_DEPTH_BYTES, 365 "How deep to look into files for URI strings, in bytes", 366 new Long (DEFAULT_MAX_DEPTH_BYTES))); 367 e.setExpertSetting(true); 368 e = addElementToDefinition(new SimpleType(ATTR_MAX_URL_LENGTH, 369 "Max length of URIs in bytes", new Long (DEFAULT_MAX_URL_LENGTH))); 370 e.setExpertSetting(true); 371 } 372 373 protected void extract(CrawlURI curi) { 374 if (!isHttpTransactionContentToProcess(curi)) { 375 return; 376 } 377 378 numberOfCURIsHandled++; 379 380 try { 381 InputStream instream = curi.getHttpRecorder().getRecordedInput(). 382 getContentReplayInputStream(); 383 int ch = instream.read(); 384 StringBuffer lookat = new StringBuffer (); 385 long counter = 0; 386 long maxdepth = ((Long )getAttribute(ATTR_MAX_DEPTH_BYTES,curi)). 387 longValue(); 388 if(maxdepth<=0){ 389 maxdepth = Long.MAX_VALUE; 390 } 391 long maxURLLength = ((Long )getAttribute(ATTR_MAX_URL_LENGTH,curi)). 392 longValue(); 393 boolean foundDot = false; 394 while(ch != -1 && ++counter <= maxdepth) { 395 if(lookat.length()>maxURLLength){ 396 lookat = new StringBuffer (); 398 foundDot = false; 399 } 400 else if(isURLableChar(ch)){ 401 if(ch == 46){ 403 foundDot = true; 405 } 406 lookat.append((char)ch); 407 } else if(lookat.length() > 3 && foundDot) { 408 String newURL = lookat.toString(); 412 if(looksLikeAnURL(newURL)) 413 { 414 416 if(newURL.toLowerCase().indexOf("http") > 0){ 419 newURL = newURL.substring(newURL.toLowerCase(). 421 indexOf("http")); 422 } 423 while(newURL.substring(newURL.length()-1).equals(".")) 424 { 425 newURL = newURL.substring(0,newURL.length()-1); 427 } 428 429 numberOfLinksExtracted++; 431 curi.createAndAddLink(newURL,Link.SPECULATIVE_MISC,Link.SPECULATIVE_HOP); 432 } 433 lookat = new StringBuffer (); 435 foundDot = false; 436 } else if(lookat.length()>0) { 437 lookat = new StringBuffer (); 439 foundDot = false; 440 } 441 ch = instream.read(); 442 } 443 } catch(IOException e){ 444 e.printStackTrace(); 446 } catch (AttributeNotFoundException e) { 447 e.printStackTrace(); 449 } 450 curi.linkExtractorFinished(); 452 } 453 454 465 private boolean looksLikeAnURL(String lookat) { 466 if(lookat.indexOf("http://")==0 || lookat.indexOf("https://")==0){ 467 Matcher ip = TextUtils.getMatcher(IP_ADDRESS, lookat); 470 boolean testVal = ip.matches(); 471 TextUtils.recycleMatcher(ip); 472 if(testVal){ 473 return true; 474 } 475 } 476 477 int dot = lookat.indexOf("."); 478 if(dot!=0){ while(dot != -1 && dot < lookat.length()){ 480 lookat = lookat.substring(dot+1); 481 if (isTLD(lookat.substring(0, lookat.length() <= 6? 482 lookat.length(): 6))) 483 { 484 return true; 485 } 486 dot = lookat.indexOf("."); 487 } 488 } 489 490 return false; 491 } 492 493 502 private boolean isTLD(String potentialTLD) { 503 if(potentialTLD.length()<2){ 504 return false; 505 } 506 507 potentialTLD.toLowerCase(); 508 Matcher uri = TextUtils.getMatcher(TLDs, potentialTLD); 509 boolean ret = uri.matches(); 510 TextUtils.recycleMatcher(uri); 511 return ret; 512 } 513 514 537 private boolean isURLableChar(int ch) { 538 return (ch>=35 && ch<=38) 539 || (ch>=43 && ch<=59) 540 || (ch==61) 541 || (ch>=63 && ch<=90) 542 || (ch==95) 543 || (ch>=97 && ch<=122) 544 || (ch==126); 545 } 546 547 550 public String report() { 551 StringBuffer ret = new StringBuffer (); 552 ret.append("Processor: org.archive.crawler.extractor." + 553 "ExtractorUniversal\n"); 554 ret.append(" Function: Link extraction on unknown file" + 555 " types.\n"); 556 ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n"); 557 ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n"); 558 559 return ret.toString(); 560 } 561 } 562 | Popular Tags |