1 24 package org.archive.crawler.datamodel; 25 26 import java.io.IOException ; 27 import java.io.ObjectInputStream ; 28 import java.io.ObjectOutputStream ; 29 import java.io.PrintWriter ; 30 import java.io.Serializable ; 31 import java.util.ArrayList ; 32 import java.util.Iterator ; 33 import java.util.List ; 34 35 import org.apache.commons.httpclient.URIException; 36 import org.archive.crawler.extractor.Link; 37 import org.archive.net.UURI; 38 import org.archive.net.UURIFactory; 39 import org.archive.util.ArchiveUtils; 40 import org.archive.util.Reporter; 41 42 import st.ata.util.AList; 43 import st.ata.util.HashtableAList; 44 45 59 public class CandidateURI 60 implements Serializable , Reporter, CoreAttributeConstants { 61 private static final long serialVersionUID = -7152937921526560388L; 62 63 66 public static final int HIGHEST = 0; 67 68 71 public static final int HIGH = 1; 72 73 76 public static final int MEDIUM = 2; 77 78 81 public static final int NORMAL = 3; 82 83 private int schedulingDirective = NORMAL; 84 85 89 private transient UURI uuri; 90 91 92 private boolean isSeed = false; 93 94 private boolean forceRevisit = false; 96 105 private String pathFromSeed; 106 107 111 private transient UURI via; 112 113 116 private CharSequence viaContext; 117 118 130 private transient AList alist; 131 132 138 private String cachedCandidateURIString = null; 139 140 141 148 private String classKey; 149 150 154 protected CandidateURI () { 155 super(); 156 } 157 158 161 public CandidateURI(UURI u) { 162 this.uuri = u; 163 } 164 165 171 public CandidateURI(UURI u, String pathFromSeed, UURI via, 172 CharSequence viaContext) { 173 this.uuri = u; 174 this.pathFromSeed = pathFromSeed; 175 this.via = via; 176 this.viaContext = viaContext; 177 } 178 179 183 public void setIsSeed(boolean b) { 184 this.isSeed = b; 185 if (this.isSeed) { 186 if(pathFromSeed==null) { 187 this.pathFromSeed = ""; 188 } 189 } 192 } 193 194 197 public UURI getUURI() { 198 return this.uuri; 199 } 200 201 204 public boolean isSeed() { 205 return this.isSeed; 206 } 207 208 211 public String getPathFromSeed() { 212 return this.pathFromSeed; 213 } 214 215 218 public UURI getVia() { 219 return this.via; 220 } 221 222 225 public CharSequence getViaContext() { 226 return this.viaContext; 227 } 228 229 232 protected void setPathFromSeed(String string) { 233 pathFromSeed = string; 234 } 235 236 240 protected void setAList(AList alist) { 241 this.alist = alist; 242 } 243 244 public void setVia(UURI via) { 245 this.via = via; 246 } 247 248 252 public synchronized String getCandidateURIString() { 253 if (this.cachedCandidateURIString == null) { 254 this.cachedCandidateURIString = 255 "CandidateURI(" + toString() + ")"; 256 } 257 return this.cachedCandidateURIString; 258 } 259 260 264 public String flattenVia() { 265 return (via == null)? "": via.toString(); 266 } 267 268 275 public String toString() { 276 return getURIString(); 277 } 278 279 283 public String getURIString() { 284 return getUURI().toString(); 285 } 286 287 296 public boolean sameDomainAs(CandidateURI other) throws URIException { 297 String domain = getUURI().getHost(); 298 if (domain == null) { 299 return false; 300 } 301 while(domain.lastIndexOf('.') > domain.indexOf('.')) { 302 domain = domain.substring(domain.indexOf('.') + 1); 304 } 305 if(other.getUURI().getHost() == null) { 306 return false; 307 } 308 return other.getUURI().getHost().endsWith(domain); 309 } 310 311 321 public boolean forceFetch() { 322 return forceRevisit; 323 } 324 325 335 public void setForceFetch(boolean b) { 336 forceRevisit = b; 337 } 338 339 342 public int getSchedulingDirective() { 343 return schedulingDirective; 344 } 345 348 public void setSchedulingDirective(int schedulingDirective) { 349 this.schedulingDirective = schedulingDirective; 350 } 351 352 353 356 public boolean needsImmediateScheduling() { 357 return schedulingDirective == HIGH; 358 } 359 360 363 public boolean needsSoonScheduling() { 364 return schedulingDirective == MEDIUM; 365 } 366 367 379 public int getTransHops() { 380 String path = getPathFromSeed(); 381 int transCount = 0; 382 for(int i=path.length()-1;i>=0;i--) { 383 if(path.charAt(i)==Link.NAVLINK_HOP) { 384 break; 385 } 386 transCount++; 387 } 388 return transCount; 389 } 390 391 400 public static CandidateURI fromString(String uriHopsViaString) 401 throws URIException { 402 String args[] = uriHopsViaString.split("\\s+"); 403 String pathFromSeeds = (args.length > 1 && !args[1].equals("-")) ? 404 args[1]: ""; 405 UURI via = (args.length > 2 && !args[2].equals("-")) ? 406 UURIFactory.getInstance(args[2]) : null; 407 CharSequence viaContext = (args.length > 3 && !args[3].equals("-")) ? 408 args[2]: null; 409 return new CandidateURI(UURIFactory.getInstance(args[0]), 410 pathFromSeeds, via, viaContext); 411 } 412 413 public static CandidateURI createSeedCandidateURI(UURI uuri) { 414 CandidateURI c = new CandidateURI(uuri); 415 c.setIsSeed(true); 416 return c; 417 } 418 419 427 public CandidateURI createCandidateURI(UURI baseUURI, Link link) 428 throws URIException { 429 UURI u = (link.getDestination() instanceof UURI)? 430 (UURI)link.getDestination(): 431 UURIFactory.getInstance(baseUURI, 432 link.getDestination().toString()); 433 CandidateURI newCaURI = new CandidateURI(u, getPathFromSeed() + link.getHopType(), 434 getUURI(), link.getContext()); 435 newCaURI.inheritFrom(this); 436 return newCaURI; 437 } 438 439 449 public CandidateURI createCandidateURI(UURI baseUURI, Link link, 450 int scheduling, boolean seed) 451 throws URIException { 452 final CandidateURI caURI = createCandidateURI(baseUURI, link); 453 caURI.setSchedulingDirective(scheduling); 454 caURI.setIsSeed(seed); 455 return caURI; 456 } 457 458 463 protected void inheritFrom(CandidateURI ancestor) { 464 List heritableKeys = (List ) ancestor.getObject(A_HERITABLE_KEYS); 465 if(heritableKeys!=null) { 466 getAList().copyKeysFrom(heritableKeys.iterator(),ancestor.getAList()); 467 } 468 } 469 470 480 public String getClassKey() { 481 return classKey; 482 } 483 484 public void setClassKey(String key) { 485 classKey = key; 486 } 487 488 498 public AList getAList() { 499 if (this.alist == null) { 500 this.alist = new HashtableAList(); 501 } 502 return this.alist; 503 } 504 505 protected void clearAList() { 506 this.alist = null; 507 } 508 509 public void putObject(String key, Object value) { 510 getAList().putObject(key, value); 511 } 512 513 public Object getObject(String key) { 514 return getAList().getObject(key); 515 } 516 517 public String getString(String key) { 518 return getAList().getString(key); 519 } 520 521 public void putString(String key, String value) { 522 getAList().putString(key, value); 523 } 524 525 public long getLong(String key) { 526 return getAList().getLong(key); 527 } 528 529 public void putLong(String key, long value) { 530 getAList().putLong(key, value); 531 } 532 533 public int getInt(String key) { 534 return getAList().getInt(key); 535 } 536 537 public void putInt(String key, int value) { 538 getAList().putInt(key, value); 539 } 540 541 public boolean containsKey(String key) { 542 return getAList().containsKey(key); 543 } 544 545 public void remove(String key) { 546 getAList().remove(key); 547 } 548 549 public Iterator keys() { 550 return getAList().getKeys(); 551 } 552 553 558 public boolean isLocation() { 559 return this.pathFromSeed != null && this.pathFromSeed.length() > 0 && 560 this.pathFromSeed.charAt(this.pathFromSeed.length() - 1) == 561 Link.REFER_HOP; 562 } 563 564 573 private void writeObject(ObjectOutputStream stream) 574 throws IOException { 575 stream.defaultWriteObject(); 576 stream.writeUTF(uuri.toString()); 577 stream.writeObject((via == null) ? null : via.getURI()); 578 stream.writeObject((alist==null) ? null : alist); 579 } 580 581 589 private void readObject(ObjectInputStream stream) 590 throws IOException , ClassNotFoundException { 591 stream.defaultReadObject(); 592 uuri = readUuri(stream.readUTF()); 593 via = readUuri((String )stream.readObject()); 594 alist = (AList) stream.readObject(); 595 } 596 597 603 protected UURI readUuri(String u) { 604 if (u == null) { 605 return null; 606 } 607 try { 608 return UURIFactory.getInstance(u); 609 } catch (URIException ux) { 610 } 612 try { 613 return UURIFactory.getInstance("invalid:" + u); 615 } catch (URIException ux) { 616 ux.printStackTrace(); 617 } 619 try { 620 return UURIFactory.getInstance("invalid:"); 622 } catch (URIException e) { 623 e.printStackTrace(); 624 return null; 625 } 626 } 627 628 632 public String singleLineReport() { 633 return ArchiveUtils.singleLineReport(this); 634 } 635 636 public void singleLineReportTo(PrintWriter w) { 637 String className = this.getClass().getName(); 638 className = className.substring(className.lastIndexOf(".")+1); 639 w.print(className); 640 w.print(" "); 641 w.print(getUURI().toString()); 642 w.print(" "); 643 w.print(pathFromSeed); 644 w.print(" "); 645 w.print(flattenVia()); 646 } 647 648 651 public String singleLineLegend() { 652 return "className uri hopsPath viaUri"; 653 } 654 655 658 public String [] getReports() { 659 return new String [] {}; 661 } 662 663 666 public void reportTo(String name, PrintWriter writer) { 667 singleLineReportTo(writer); 668 writer.print("\n"); 669 } 670 671 674 public void reportTo(PrintWriter writer) throws IOException { 675 reportTo(null,writer); 676 } 677 678 685 public void makeHeritable(String key) { 686 @SuppressWarnings ("unchecked") 687 List <String > heritableKeys = (List <String >) getObject(A_HERITABLE_KEYS); 688 if(heritableKeys==null) { 689 heritableKeys = new ArrayList <String >(); 690 heritableKeys.add(A_HERITABLE_KEYS); 691 putObject(A_HERITABLE_KEYS,heritableKeys); 692 } 693 heritableKeys.add(key); 694 } 695 696 702 public void makeNonHeritable(String key) { 703 List heritableKeys = (List ) getObject(A_HERITABLE_KEYS); 704 if(heritableKeys==null) { 705 return; 706 } 707 heritableKeys.remove(key); 708 if(heritableKeys.size()==1) { 709 remove(A_HERITABLE_KEYS); 711 } 712 } 713 } 714 | Popular Tags |