package net.matuschek.spider;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.StringTokenizer;
import java.util.Vector;

import net.matuschek.html.FormFiller;
import net.matuschek.html.HtmlDocument;
import net.matuschek.http.DocManagerException;
import net.matuschek.http.DownloadRuleSet;
import net.matuschek.http.ExtendedURL;
import net.matuschek.http.HttpConstants;
import net.matuschek.http.HttpDoc;
import net.matuschek.http.HttpDocManager;
import net.matuschek.http.HttpException;
import net.matuschek.http.HttpHeader;
import net.matuschek.http.HttpTool;
import net.matuschek.http.HttpToolCallback;
import net.matuschek.http.NTLMAuthorization;
import net.matuschek.http.cookie.CookieManager;
import net.matuschek.spider.docfilter.FilterChain;
import net.matuschek.spider.docfilter.FilterException;

import org.apache.log4j.Category;
import org.w3c.dom.Element;

/**
 * A web-crawling robot ("JoBo"). Starting from {@link #setStartURL(URL)} it
 * walks the link graph breadth-first via a to-do {@link TaskList}, retrieves
 * documents through an {@link HttpTool}, and hands them to an optional
 * {@link HttpDocManager} for caching/processing. Crawl scope is constrained
 * by depth, host/domain rules, an optional {@link URLCheck}, and robots.txt
 * (via {@link NoRobots}) unless ignored.
 *
 * <p>Implements {@link Runnable} so a crawl can be run on its own thread
 * ({@link #run()} simply delegates to {@link #work()}).
 */
public class WebRobot implements Runnable, Cloneable {

    /** Robot product name reported to servers and robots.txt parsing. */
    private final static String ROBOT_NAME = "JoBo";

    /** Default User-Agent string (robot name + version + homepage). */
    private final static String AGENT_NAME =
        ROBOT_NAME + "/1.4 (http://www.matuschek.net/jobo.html)";

    /** Handler invoked on retrieval/processing errors; never null (see setter). */
    protected RobotExceptionHandler exceptionHandler =
        new DefaultRobotExceptionHandler();

    /** Default maximum link depth if none is configured. */
    private final static int DEFAULT_DEPTH = 10;

    /** URL the crawl starts from; also anchors the default scope checks. */
    protected URL startURL = null;

    /** host + directory prefix of startURL, used by basicURLCheck(). */
    protected String startDir = "";

    /** Maximum link depth to follow from the start URL. */
    protected int maxDepth = DEFAULT_DEPTH;

    /** If true, links to any host are followed (disables scope checks). */
    protected boolean walkToOtherHosts = false;

    /** Optional document cache/processor; may be null (no storage). */
    protected HttpDocManager docManager;

    /** HTTP client used for all retrievals. */
    protected HttpTool httpTool = new HttpTool();

    /** log4j logger, initialized per concrete class name in the constructor. */
    protected Category log;

    /** Referer header sent for the very first request. */
    protected String startReferer = "-";

    /** robots.txt checker; re-created whenever the agent name changes. */
    protected NoRobots robCheck;

    /** Queue of tasks still to visit (registered via registerToDoList). */
    protected TaskList todo = null;

    /** Set of already-visited tasks (registered via registerVisitedList). */
    protected TaskList visited = null;

    /**
     * Mirror flag for robots.txt ignoring.
     * NOTE(review): setIgnoreRobotsTxt() only updates robCheck, not this
     * field, so getIgnoreRobotsTxt() can return a stale value — confirm.
     */
    protected boolean ignoreRobotsTxt = false;

    /** Pause between retrievals, in seconds (see sleepNow()). */
    protected int sleepTime = 1;

    /** Fills HTML forms to generate additional POST/GET tasks. */
    protected FormFiller formFiller = new FormFiller();

    /** URLs (as strings) that may be visited more than once. */
    protected Vector visitMany = new Vector();

    /** Optional progress/status callback; may be null. */
    protected WebRobotCallback webRobotCallback = null;

    /** Set by stopRobot() to make walkTree() terminate. */
    protected boolean stopIt = false;

    /** Optional pluggable URL filter; may be null. */
    protected URLCheck urlCheck = null;

    /** While true, walkTree() pauses between documents (see setSleep). */
    protected boolean sleep;

    /** Extra URL-string prefixes that are always in scope. */
    protected Vector allowedURLs = new Vector();

    /** If true, the whole start host is in scope, not just startDir. */
    protected boolean allowWholeHost = true;

    /** Max age (seconds) before a cached document is re-retrieved; -1 = never re-scan. */
    protected long maxDocumentAge = -1;

    /** If true, any host ending with the start host's domain is in scope. */
    protected boolean allowWholeDomain = true;

    /** If true, "www." prefixes are ignored when comparing hosts. */
    protected boolean flexibleHostCheck = false;

    /** Optional document filter chain applied before processing; may be null. */
    protected FilterChain filters = null;

    /** If true, GET documents without parameters may be served from the cache. */
    protected boolean allowCaching = true;

    /** If true, documents with identical content (MD5) are detected and linked. */
    protected boolean duplicateCheck = false;

    /**
     * Creates a robot sized for the given expected number of documents
     * (used to pre-size the content map and the task lists).
     *
     * @param expectedDocumentCount expected total documents in this crawl
     */
    public WebRobot(int expectedDocumentCount) {
        log = Category.getInstance(getClass().getName());
        content2UrlMap = new HashMap(expectedDocumentCount);
        registerVisitedList(new HashedMemoryTaskList(false,
                expectedDocumentCount));
        registerToDoList(new HashedMemoryTaskList(true,
                expectedDocumentCount));
        this.expectedDocumentCount = expectedDocumentCount;
        this.setAgentName(AGENT_NAME);
    }

    /** Creates a robot with the default expected document count. */
    public WebRobot() {
        this(DEFAULT_EXPECTED_DOCUMENT_COUNT);
    }

    /**
     * Replaces the to-do task list implementation.
     *
     * @param todo list that will hold tasks still to be visited
     */
    public void registerToDoList(TaskList todo) {
        this.todo = todo;
    }

    /**
     * Replaces the visited task list implementation.
     *
     * @param visited list that will record already-visited tasks
     */
    public void registerVisitedList(TaskList visited) {
        this.visited = visited;
    }

    /** @return the configured start URL (may be null if not set yet). */
    public URL getStartURL() {
        return startURL;
    }

    /**
     * Sets the start URL and derives startDir (host + directory part of the
     * path) which basicURLCheck() uses as the default allowed prefix.
     *
     * @param startURL URL the crawl will start from
     */
    public void setStartURL(URL startURL) {
        String path = startURL.getPath();
        this.startURL = startURL;

        // Directory URL: keep the full path; otherwise strip the file name.
        if (path.endsWith("/")) {
            this.startDir = startURL.getHost() + path;
        } else {
            int pos = path.lastIndexOf("/");
            if (pos < 0) {
                // no path component at all
                this.startDir = startURL.getHost() + "/";
            } else {
                this.startDir = startURL.getHost() + path.substring(0, pos + 1);
            }
        }
    }

    /** @return maximum link depth followed from the start URL. */
    public int getMaxDepth() {
        return maxDepth;
    }

    /** @param maxDepth maximum link depth to follow. */
    public void setMaxDepth(int maxDepth) {
        this.maxDepth = maxDepth;
    }

    /** @return bandwidth limit as reported by the underlying HttpTool. */
    public int getBandwidth() {
        return httpTool.getBandwidth();
    }

    /** @param bandwidth bandwidth limit, delegated to the HttpTool. */
    public void setBandwidth(int bandwidth) {
        httpTool.setBandwidth(bandwidth);
    }

    /** @return true if links to foreign hosts are followed. */
    public boolean getWalkToOtherHosts() {
        return walkToOtherHosts;
    }

    /** @param walkToOtherHosts true to follow links to any host. */
    public void setWalkToOtherHosts(boolean walkToOtherHosts) {
        this.walkToOtherHosts = walkToOtherHosts;
    }

    /** @return true if the whole start host (not just startDir) is in scope. */
    public boolean getAllowWholeHost() {
        return allowWholeHost;
    }

    /** @param allowWholeHost true to allow the entire start host. */
    public void setAllowWholeHost(boolean allowWholeHost) {
        this.allowWholeHost = allowWholeHost;
    }

    /** @return true if any host in the start URL's domain is in scope. */
    public boolean getAllowWholeDomain() {
        return allowWholeDomain;
    }

    /** @param allowWholeDomain true to allow the entire domain. */
    public void setAllowWholeDomain(boolean allowWholeDomain) {
        this.allowWholeDomain = allowWholeDomain;
    }

    /** @return true if "www." prefixes are ignored in host comparison. */
    public boolean getFlexibleHostCheck() {
        return flexibleHostCheck;
    }

    /** @param flexibleHostCheck true to treat host and www.host as equal. */
    public void setFlexibleHostCheck(boolean flexibleHostCheck) {
        this.flexibleHostCheck = flexibleHostCheck;
    }

    /** @return true if cached documents may be reused for GET requests. */
    public boolean getAllowCaching() {
        return allowCaching;
    }

    /** @param allowCaching true to allow serving documents from the cache. */
    public void setAllowCaching(boolean allowCaching) {
        this.allowCaching = allowCaching;
    }

    /** @return the document manager, or null if none is set. */
    public HttpDocManager getDocManager() {
        return docManager;
    }

    /**
     * Sets the document manager used for caching, duplicate detection and
     * document processing. May be null to disable storage entirely.
     *
     * @param docManager the manager, or null
     */
    public void setDocManager(HttpDocManager docManager) {
        this.docManager = docManager;
    }

    /** @param cm cookie manager, delegated to the HttpTool. */
    public void setCookieManager(CookieManager cm) {
        httpTool.setCookieManager(cm);
    }

    /** @return the HttpTool's cookie manager. */
    public CookieManager getCookieManager() {
        return httpTool.getCookieManager();
    }

    /** @param rules download rule set, delegated to the HttpTool. */
    public void setDownloadRuleSet(DownloadRuleSet rules) {
        httpTool.setDownloadRuleSet(rules);
    }
// ---- crawl control, retrieval engine, and helpers (class interior continues) ----

    /** @param check optional URL filter consulted by isAllowed(); may be null. */
    public void setURLCheck(URLCheck check) {
        this.urlCheck = check;
    }

    /**
     * Configures an HTTP proxy on the underlying HttpTool.
     *
     * @param proxyDescr proxy description string understood by HttpTool
     * @throws HttpException if the description cannot be applied
     */
    public void setProxy(String proxyDescr) throws HttpException {
        httpTool.setProxy(proxyDescr);
    }

    /** @return the HttpTool's current proxy description. */
    public String getProxy() {
        return httpTool.getProxy();
    }

    /** @return the Referer value sent for the first request. */
    public String getStartReferer() {
        return startReferer;
    }

    /** @param startReferer Referer value for the first request. */
    public void setStartReferer(String startReferer) {
        this.startReferer = startReferer;
    }

    /**
     * Enables/disables robots.txt checking on the NoRobots helper.
     * NOTE(review): does not update the ignoreRobotsTxt field, so
     * getIgnoreRobotsTxt() may disagree with the actual behavior — confirm.
     *
     * @param ignoreRobotsTxt true to ignore robots.txt rules
     */
    public void setIgnoreRobotsTxt(boolean ignoreRobotsTxt) {
        robCheck.setIgnore(ignoreRobotsTxt);
    }

    /** @return pause between retrievals, in seconds. */
    public int getSleepTime() {
        return sleepTime;
    }

    /** @param sleepTime pause between retrievals, in seconds (0 disables). */
    public void setSleepTime(int sleepTime) {
        this.sleepTime = sleepTime;
    }

    /** @param fromAddress From header value, delegated to the HttpTool. */
    public void setFromAddress(String fromAddress) {
        httpTool.setFromAddress(fromAddress);
    }

    /**
     * Sets the form handlers used to auto-fill and submit HTML forms.
     * NOTE(review): hasFormHandlers is latched to true here but never reset
     * to false when handlers are removed — confirm intended.
     *
     * @param handlers vector of form handlers (may be null or empty)
     */
    public void setFormHandlers(Vector handlers) {
        formFiller.setFormHandlers(handlers);
        if (handlers != null && handlers.size() > 0) {
            hasFormHandlers = true;
        }
    }

    /** @return the form handlers currently held by the FormFiller. */
    public Vector getFormHandlers() {
        return formFiller.getFormHandlers();
    }

    /** @return the User-Agent name, or null if no HttpTool is present. */
    public String getAgentName() {
        if (httpTool != null) {
            return httpTool.getAgentName();
        } else {
            return null;
        }
    }

    /**
     * Sets the User-Agent name and rebuilds the robots.txt checker, since
     * robots.txt rules are matched against the agent name.
     *
     * @param name new agent name
     */
    public void setAgentName(String name) {
        httpTool.setAgentName(name);
        robCheck = new NoRobots(name, httpTool);
    }

    /** @return HttpTool timeout, or -1 if no HttpTool is present. */
    public int getTimeout() {
        if (httpTool != null) {
            return httpTool.getTimeout();
        } else {
            return -1;
        }
    }

    /** @param timeout connection timeout, delegated to the HttpTool. */
    public void setTimeout(int timeout) {
        httpTool.setTimeout(timeout);
    }

    /** @return NTLM credentials from the HttpTool, or null if none/no tool. */
    public NTLMAuthorization getNtlmAuthorization() {
        if (httpTool != null) {
            return httpTool.getNtlmAuthorization();
        } else {
            return null;
        }
    }

    /** @param ntlmAuthorization NTLM credentials, delegated to the HttpTool. */
    public void setNtlmAuthorization(NTLMAuthorization ntlmAuthorization) {
        httpTool.setNtlmAuthorization(ntlmAuthorization);
    }

    /**
     * @return the ignoreRobotsTxt field (see NOTE on setIgnoreRobotsTxt:
     *         this may not reflect the NoRobots helper's actual setting).
     */
    public boolean getIgnoreRobotsTxt() {
        return ignoreRobotsTxt;
    }

    /** @return URL strings that may be visited more than once. */
    public Vector getVisitMany() {
        return visitMany;
    }

    /** @param visitMany URL strings that may be visited more than once. */
    public void setVisitMany(Vector visitMany) {
        this.visitMany = visitMany;
    }

    /** @param callback progress callback, delegated to the HttpTool. */
    public void setHttpToolCallback(HttpToolCallback callback) {
        httpTool.setCallback(callback);
    }

    /** @return the robot-level status callback, or null. */
    public WebRobotCallback getWebRobotCallback() {
        return webRobotCallback;
    }

    /** @param webRobotCallback robot-level status callback; may be null. */
    public void setWebRobotCallback(WebRobotCallback webRobotCallback) {
        this.webRobotCallback = webRobotCallback;
    }

    /**
     * Pauses (true) or resumes (false) the crawl; walkTree() polls this flag
     * once per second while sleeping.
     *
     * @param sleep true to pause the robot
     */
    public void setSleep(boolean sleep) {
        this.sleep = sleep;
    }

    /** @return true if the robot is currently paused. */
    public boolean isSleeping() {
        return this.sleep;
    }

    /** @param allowed URL-string prefixes that are always in scope. */
    public void setAllowedURLs(Vector allowed) {
        this.allowedURLs = allowed;
    }

    /** @return URL-string prefixes that are always in scope. */
    public Vector getAllowedURLs() {
        return this.allowedURLs;
    }

    /** @param enable true to enable cookie handling in the HttpTool. */
    public void setEnableCookies(boolean enable) {
        httpTool.setEnableCookies(enable);
    }

    /** @return true if the HttpTool handles cookies. */
    public boolean getEnableCookies() {
        return httpTool.getEnableCookies();
    }

    /**
     * @param maxAge maximum cached-document age in seconds before a re-scan;
     *               negative disables re-scanning (cache is always fresh)
     */
    public void setMaxDocumentAge(long maxAge) {
        this.maxDocumentAge = maxAge;
    }

    /** @return maximum cached-document age in seconds (-1 = unlimited). */
    public long getMaxDocumentAge() {
        return this.maxDocumentAge;
    }

    /** @param filters filter chain applied to documents before processing. */
    public void setFilters(FilterChain filters) {
        this.filters = filters;
    }

    /** Clears all cookies held by the HttpTool. */
    public void clearCookies() {
        httpTool.clearCookies();
    }

    /** Runnable entry point; simply delegates to work(). */
    public void run() {
        work();
    }

    /**
     * Runs a complete crawl: seeds the to-do list with the start URL,
     * walks the link tree, cleans up, and logs retrieval statistics.
     */
    public void work() {
        RobotTask task = createRobotTask(startURL, maxDepth, startReferer);
        todo.add(task);
        walkTree();
        cleanUp();
        log.info("Documents retrieved by: Web=" + countWeb + " Cache=" + countCache + " Refresh=" + countRefresh + " NoRefresh=" + countNoRefresh);
    }

    /** Asks the robot to stop; walkTree() exits after the current document. */
    public void stopRobot() {
        stopIt = true;
    }

    /** Escalation level of OutOfMemory handling (see handleMemoryError). */
    private int memoryLevel = 0;

    /** When false (low memory), no new tasks are queued. */
    protected boolean activatedNewTasks = true;

    /** When false (low memory), visited URLs are no longer recorded. */
    protected boolean activatedUrlHistory = true;

    /** When false (low memory), content MD5 -> URL mapping is disabled. */
    protected boolean activatedContentHistory = true;

    /** 200 KB ballast released on OutOfMemory to give the VM headroom. */
    private byte memoryBuffer[] = new byte[200 * 1024];

    /**
     * Main crawl loop: pops tasks from the to-do list until it is empty or
     * stopRobot() was called. Skips already-visited tasks (unless listed in
     * visitMany), retries a task after OutOfMemoryError degradation, honors
     * the sleep flag, and notifies the callback about queue status.
     */
    public void walkTree() {
        while ((todo.size() > 0) && (!stopIt)) {
            RobotTask task;
            // take + mark-visited must be atomic w.r.t. other threads
            synchronized (visited) {
                task = todo.removeFirst();
                if (visited.contains(task) && (!visitMany.contains(task.getUrl().toString()))) {
                    log.debug("already visited: " + task.getUrl());
                    continue;
                }
                if (activatedUrlHistory) {
                    visited.add(task);
                }
            }

            // retry the same task until it survives without OutOfMemoryError
            boolean repeat = true;
            while (repeat) {
                try {
                    retrieveURL(task);
                    repeat = false;
                } catch (OutOfMemoryError memoryError) {
                    handleMemoryError(memoryError);
                }
            }

            // paused: poll once per second until setSleep(false)
            while (sleep) {
                if (webRobotCallback != null) {
                    webRobotCallback.webRobotSleeping(true);
                }
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException e) {
                };
            }

            if (webRobotCallback != null) {
                webRobotCallback.webRobotSleeping(false);
            }

            if (webRobotCallback != null) {
                webRobotCallback.webRobotUpdateQueueStatus(todo.size());
            }
            spawnThread();
        }

        if (webRobotCallback != null) {
            finishThreads();
        }
    }

    /**
     * Progressive degradation on OutOfMemoryError: level 1 drops the URL and
     * content histories, level 2 stops accepting new tasks and releases the
     * ballast buffer, level 3+ gives up and rethrows.
     *
     * @param memoryError the error being handled
     * @throws OutOfMemoryError when no further degradation is possible
     */
    protected void handleMemoryError(OutOfMemoryError memoryError)
        throws OutOfMemoryError {
        memoryLevel++;
        log.error("OutOfMemoryError level=" + memoryLevel + "! (visited=" + visited.size() + ", todo=" + todo.size() + ")");
        switch (memoryLevel) {
        case 1:
            // sacrifice duplicate-visit protection to free memory
            visited.clear(); activatedUrlHistory = false;
            content2UrlMap.clear(); activatedContentHistory = false;
            System.gc();
            break;
        case 2:
            // stop growing the queue and release the 200 KB ballast
            activatedNewTasks = false;
            memoryBuffer = null;
            System.gc();
            break;
        case 3:
            throw memoryError;
        default :
            // touch the buffer so it cannot be optimized away, then give up
            if (memoryBuffer != null) {
                System.err.println(memoryBuffer[0]);
            }
            throw memoryError;
        }
    }

    /** Notifies the callback that the crawl is done and finishes the doc manager. */
    protected void finishThreads() {
        webRobotCallback.webRobotDone();
        if (docManager != null) {
            docManager.finish();
        }
    }

    /** Hook for subclasses to spawn worker threads; no-op here. */
    protected synchronized void spawnThread() {
    }

    /** Iteration counter hook for subclasses (not used in this class). */
    protected int iteration = 0;

    /**
     * Retrieves a single task's URL, possibly from the cache, follows
     * redirects, extracts links/forms from HTML, performs duplicate
     * detection, and hands the document to the doc manager.
     *
     * @param task the task to retrieve; null is ignored
     */
    public void retrieveURL(RobotTask task) {
        if (task == null) {
            log.debug("Empty task found, ignoring");
            return;
        }

        long now = System.currentTimeMillis();

        updateProgressInfo();

        URL u = task.getUrl();
        String urlString = u.toString();
        String referer = task.getReferer();
        int depth = task.getMaxDepth();

        if (depth < 0) {
            log.info("Max search depth reached");
            return;
        }

        // scope / robots.txt / URLCheck filtering
        if (!isAllowed(u)) {
            log.info("Url '" + u + "' filtered out.");
            return;
        }

        // normalize bare host URLs to end with "/"
        if (u.getFile().equals("")) {
            try {
                urlString = urlString + "/";
                u = new URL(urlString);
                task.setUrl(u);
            } catch (MalformedURLException e) {
                log.error("URL not well formed: " + e.toString());
                exceptionHandler.handleException(this, u, e);
                return;
            }
        }

        log.info("retrieving " + urlString);
        httpTool.setReferer(referer);

        HttpDoc doc = null;
        Vector links = null;
        boolean cached = false;

        // cache lookup is only possible for parameterless GET requests
        boolean reScan = true;
        if ((docManager != null && allowCaching)
            && (task.getMethod() == HttpConstants.GET)
            && (task.getParamString() == null)) {
            doc = docManager.retrieveFromCache(u);

            if (doc != null) {
                countCache++;
                long lastRetrieved = doc.getDateAsMilliSeconds();
                // NOTE(review): integer division — sub-second precision is
                // truncated before the assignment to double; confirm intended.
                double ageInSeconds = (now - lastRetrieved) / 1000;
                if (ageInSeconds < 0) {
                    log.warn("DocumentAge < 0!");
                }
                reScan = maxDocumentAge >= 0 && ageInSeconds > maxDocumentAge;
                if (reScan) {
                    // conditional GET based on the cached copy's mtime
                    long lastModified = doc.getLastModifiedAsMilliSeconds();
                    Date lastModifiedDate = new Date(lastModified);
                    httpTool.setIfModifiedSince(lastModifiedDate);
                }
            } else {
                httpTool.setIfModifiedSince(null);
            }
        }

        if (reScan) {
            HttpDoc newDoc;
            boolean error = false;
            try {
                if (u.getProtocol().equalsIgnoreCase("file")) {
                    // local file:// URLs bypass the HttpTool
                    newDoc = retrieveFileURL(u, httpTool.getIfModifiedSince());
                } else {
                    newDoc = httpTool.retrieveDocument(u, task.getMethod(), task.getParamString());
                    if (newDoc != null) {
                        newDoc.setDate(now);
                    }
                    sleepNow();
                }

                if (newDoc != null && !newDoc.isNotModified()) {
                    if (!(newDoc.isOk() || newDoc.isRedirect())) {
                        error = true;
                    }
                } else {
                    // 304 Not Modified (or no doc): keep the cached copy
                    if (doc != null) {
                        doc.setDate(now);
                        doc.setCached(false);
                        newDoc = null;
                    }
                }
            } catch (HttpException hex) {
                error = true; newDoc = null;
            }
            if (error) {
                int retry = task.retry();
                if (retry <= maxRetries) {
                    // re-queue for another attempt
                    synchronized (visited) {
                        todo.add(task);
                        visited.remove(task);
                    }
                    log.info("Adding " + u + " for retry no. " + retry);
                    return;
                } else {
                    // retries exhausted: fall back to the cache if allowed
                    // NOTE(review): NPE if docManager is null here — confirm.
                    doc = docManager.retrieveFromCache(u);
                    if (doc == null) {
                        log.warn("Unsuccessfull retries for " + u);
                        return;
                    } else {
                        long docDate = doc.getDateAsMilliSeconds();
                        long age = (now - docDate);
                        age /= 1000;
                        if (expirationAge < 0 || age < expirationAge) {
                            newDoc = doc;
                            cached = true;
                            log.info("Cached document not expired: " + u);
                        } else {
                            log.warn("Cached document expired: " + u);
                            docManager.removeDocument(u);
                            return;
                        }
                    }
                }
            }

            if (newDoc != null) {
                countWeb++;
                doc = newDoc;
                // force link re-extraction from the fresh document
                links = null; countRefresh++;
            } else {
                cached = true;
                countNoRefresh++;
            }
        } else {
            cached = true;
            log.debug("Page " + u + " retrieved from cache");
        }

        if (doc == null) {
            log.info("not downloaded " + u);
            return;
        }

        // duplicate-content detection via MD5 (in-memory map, then cache)
        String duplicate = null;
        if (duplicateCheck) {
            duplicate = getContentVisitedURL(doc);
            if (duplicate != null) {
                log.info("URLs with same content found: " + urlString + " = " + duplicate);
            } else {
                try {
                    duplicate = docManager.findDuplicate(doc);
                    if (duplicate != null) {
                        log.info("URLs with same content found in cache: " + urlString + " = " + duplicate);
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }

            if (duplicate != null) {
                String pureDuplicate = removeParameters(duplicate);
                String pureUrl = removeParameters(urlString);
                if (!pureUrl.equals(pureDuplicate) && !cached) {
                    // store this URL too, reusing the duplicate's link list
                    try {
                        HttpDoc linksDoc = docManager.retrieveFromCache(new URL(duplicate));
                        if (linksDoc != null) {
                            doc.setLinks(linksDoc.getLinks());
                        }
                        docManager.storeDocument(doc);
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
                // crawl onward from the canonical (duplicate) URL instead
                RobotTask newTask;
                try {
                    newTask = createRobotTask(new URL(duplicate), depth, referer);
                    if (!visited.contains(newTask)) {
                        addTask(newTask);
                    }
                } catch (MalformedURLException e) {
                    e.printStackTrace();
                }
                return;
            }
        }

        if (doc.isUnauthorized()) {
            log.info("got HTTP Unauthorized for URL " + u);
        }

        if (doc.isOk() || cached) {
            if (webRobotCallback != null) {
                int contentLength = 0;
                if (doc.getContent() != null) { contentLength = doc.getContent().length; }
                webRobotCallback.webRobotRetrievedDoc(urlString, contentLength);
            }

            try {
                if (doc.isHTML() && (depth > 0)) {
                    // parse HTML, honoring a charset= from the Content-type header
                    HtmlDocument htmlDoc = null;
                    HttpHeader contentTypeHeader = doc.getHeader("Content-type");
                    if (contentTypeHeader != null) {
                        String contentType = contentTypeHeader.getValue();
                        int index = contentType.toLowerCase().indexOf("charset=");
                        if (index > 0) {
                            htmlDoc = new HtmlDocument(u, doc.getContent(), contentType.substring(index + 8));
                        } else {
                            htmlDoc = new HtmlDocument(u, doc.getContent());
                        }
                    } else {
                        htmlDoc = new HtmlDocument(u, doc.getContent());
                    }

                    if (depth > 0) {
                        if (duplicate != null) {
                            HttpDoc linksDoc = docManager.retrieveFromCache(new URL(duplicate));
                            doc.setLinks(linksDoc.getLinks());
                        } else if (cached) {
                        }
                        if (links == null) {
                            links = htmlDoc.getLinks();
                            doc.setLinks(links);
                        }
                        if (duplicate == null) {
                            // queue each distinct link once
                            HashSet checkedLinks = new HashSet();
                            for (int i = 0; i < links.size(); i++) {
                                URL link = (URL) links.elementAt(i);
                                log.info("Link: " + link);
                                if (!checkedLinks.contains(link)) {
                                    checkedLinks.add(link);
                                    String myReferer = u.toString();
                                    if (u.getUserInfo() != null) {
                                        // strip user:password@ from the referer
                                        int endindex = myReferer.indexOf("@") + 1;
                                        myReferer = "http://" + myReferer.substring(endindex);
                                    }

                                    RobotTask newTask = createRobotTask((URL) links.elementAt(i), depth - 1, myReferer);
                                    if (!visited.contains(newTask)) {
                                        // NOTE(review): .jpg links jump the queue — confirm intended.
                                        if (newTask.urlString.endsWith(".jpg")) {
                                            addTaskAtStart(newTask);
                                        } else {
                                            addTask(newTask);
                                        }
                                    }
                                }
                            }
                        }
                    }

                    // generate tasks from auto-filled HTML forms
                    if (hasFormHandlers) {
                        Vector forms = htmlDoc.getElements("form");
                        for (int i = 0; i < forms.size(); i++) {
                            ExtendedURL eurl = formFiller.fillForm(u, (Element) forms.elementAt(i));
                            if (eurl != null) {
                                RobotTask newTask = createRobotTask(eurl.getURL(), depth - 1, u.toString());
                                newTask.setParamString(eurl.getParams());
                                newTask.setMethod(eurl.getRequestMethod());
                                addTask(newTask);
                            }
                        }
                    }

                }
            } catch (OutOfMemoryError e) {
                throw e;
            } catch (Throwable e) {
                // link extraction must never kill the crawl
                log.error("Unexpected error while extraction links from url '" + u + "':" + e);
                e.printStackTrace();
            }

            if ((docManager != null)) {
                try {
                    if (filters != null) {
                        doc = filters.process(doc);
                    } else {
                        log.debug("No filters defined");
                    }

                    if (isProcessingAllowed(doc)) {
                        docManager.processDocument(doc);
                    } else {
                        // replace content but keep the MD5 so duplicate
                        // detection still recognizes the original content
                        String md5 = doc.getHeaderValue(HttpHeader.CONTENT_MD5);
                        doc.setContent("Not for indexing".getBytes());
                        doc.setHeaderValue(HttpHeader.CONTENT_MD5, md5);
                    }

                    try {
                        docManager.storeDocument(doc);
                    } catch (Exception e) {
                        log.warn("could not store (not for indexing) " + urlString + ": " + e.getMessage());
                    }
                    if (activatedContentHistory && duplicate == null) {
                        setContentVisitedURL(doc, urlString);
                    }
                } catch (DocManagerException e1) {
                    log.error("could not process document: " + e1.getMessage());
                    exceptionHandler.handleException(this, u, e1);
                } catch (FilterException e2) {
                    log.error(e2.getMessage());
                }
            }

        } else {
            // non-OK, non-cached responses
            if (doc.isRedirect()) {
                String ref = doc.getLocation();
                log.info("Got redirect to " + ref);

                try {
                    URL u2 = new URL(u, ref);
                    // follow the redirect immediately (front of the queue)
                    RobotTask newTask = createRobotTask(u2, depth - 1, referer);
                    addTaskAtStart(newTask);
                } catch (MalformedURLException e) {
                    // NOTE(review): malformed redirect target silently dropped.
                }
            } else if (doc.isNotFound()) {
                exceptionHandler.handleException(this, u, new HttpException("Document not found"));
            } else if (doc.isUnauthorized()) {
                exceptionHandler.handleException(
                    this,
                    u,
                    new HttpException("No authorization for the document."));
            } else {
                exceptionHandler.handleException(this, u, new HttpException("Unknown document error (Http return code " + doc.getHttpCode() + ")."));
            }
        }
    }

    /** Hook for subclasses to report progress; no-op here. */
    public void updateProgressInfo() {
    }

    /**
     * Sleeps sleepTime seconds between retrievals (politeness delay),
     * notifying the callback before and after.
     */
    public void sleepNow() {
        if (sleepTime > 0) {
            synchronized (this) {
                if (webRobotCallback != null) {
                    webRobotCallback.webRobotSleeping(true);
                }

                try {
                    Thread.sleep(sleepTime * 1000);
                } catch (InterruptedException e) {
                }

                if (webRobotCallback != null) {
                    webRobotCallback.webRobotSleeping(false);
                }
            }
        }
    }

    /**
     * Retrieves a file:// URL from the local filesystem, emulating HTTP
     * status codes (200/304/404) and If-Modified-Since semantics.
     *
     * @param url             the file URL
     * @param ifModifiedSince only return content if newer; may be null
     * @return a synthetic HttpDoc for the file
     * @throws HttpException wrapping any underlying failure
     */
    private HttpDoc retrieveFileURL(URL url, Date ifModifiedSince) throws HttpException {
        HttpDoc doc = new HttpDoc();

        try {
            String host = url.getHost();
            String filename = url.getFile();
            if ((host == null) || (host.equals(""))) {
                // local path: strip a leading separator
                if ((filename.startsWith("\\")) || (filename.startsWith("/"))) {
                    filename = filename.substring(1);
                }
            } else {
                // UNC-style path for a remote host
                filename = "//" + host + filename;
            }
            String mimetypestr = getMimeTypeForFilename(filename);
            if (mimetypestr != null) {
                HttpHeader header = new HttpHeader("content-type", mimetypestr);
                doc.addHeader(header);
            }

            File file = new File(filename);
            if (!file.exists()) {
                doc.setHttpCode("httpcode " + HttpConstants.HTTP_NOTFOUND);
                return doc;
            }
            long fileLastModified = file.lastModified();
            long ifModifiedSinceTime = ifModifiedSince == null ? 0 : ifModifiedSince.getTime();
            if (fileLastModified > ifModifiedSinceTime) {
                byte[] content = readFileToByteArray(file);
                doc.setContent(content);
                doc.setHttpCode("httpcode " + HttpConstants.HTTP_OK);
            } else {
                doc.setHttpCode("httpcode " + HttpConstants.HTTP_NOTMODIFIED);
            }
            doc.setLastModified(fileLastModified);
            doc.setDate(System.currentTimeMillis());
            doc.setURL(url);

            return doc;
        } catch (Exception e) {
            // NOTE(review): cause is dropped; only the message survives.
            throw new HttpException(e.getMessage());
        }
    }

    /**
     * Very small extension-based MIME type guess.
     *
     * @param filename file name to inspect
     * @return "text/html" for .html/.htm, otherwise null
     */
    protected String getMimeTypeForFilename(String filename) {
        if (filename.endsWith(".html") || filename.endsWith(".htm")) {
            return "text/html";
        } else {
            return null;
        }
    }

    /** Resets stop flag and clears both task lists after a crawl. */
    protected void cleanUp() {
        stopIt = false;
        visited.clear();
        todo.clear();
    }

    /** Appends a task to the to-do list if allowed and tasks are accepted. */
    protected void addTask(RobotTask task) {
        if (taskAddAllowed(task) && activatedNewTasks) {
            todo.add(task);
        }
    }

    /** Prepends a task to the to-do list if allowed and tasks are accepted. */
    protected void addTaskAtStart(RobotTask task) {
        if (taskAddAllowed(task) && activatedNewTasks) {
            todo.addAtStart(task);
        }
    }

    /**
     * @param task candidate task
     * @return true if the task is non-null, its URL is in scope, and it is
     *         not already queued
     */
    protected boolean taskAddAllowed(RobotTask task) {
        if (task == null) {
            log.info("Null task not allowed");
            return false;
        }

        if (!isAllowed(task.getUrl())) {
            return false;
        }

        if (todo.contains(task)) {
            return false;
        }

        return true;
    }

    /**
     * Full admission check: basic scope rules, then the optional URLCheck,
     * then robots.txt.
     *
     * @param u URL to test
     * @return true if the URL may be visited
     */
    protected boolean isAllowed(URL u) {

        if (basicURLCheck(u)) {

            if ((urlCheck != null) && (!urlCheck.checkURL(u))) {
                log.debug("not allowed by URLCheck:" + u);
                return false;
            }

            if (robCheck.ok(u)) {
                return true;
            } else {
                log.debug("not allowed by robots.txt:" + u);
                return false;
            }
        }
        return false;
    }

    /**
     * Decides whether a retrieved document may be handed to the doc manager,
     * consulting the URLCheck and the HttpTool's download rules.
     *
     * @param doc the retrieved document
     * @return true if processing is allowed
     */
    protected boolean isProcessingAllowed(HttpDoc doc) {
        URL u = doc.getURL();
        if ((urlCheck != null) && (!urlCheck.checkURLForProcessing(u))) {
            log.debug("processing not allowed by URLCheck:" + u);
            return false;
        }

        DownloadRuleSet downloadRuleSet = httpTool.getDownloadRuleSet();
        if (downloadRuleSet != null && !downloadRuleSet.processAllowed(doc.getHttpHeaders())) {
            log.debug("processing not allowed by DownloadRuleSet:" + u);
            return false;
        }

        return true;
    }

    /**
     * Basic scope test against the start URL: other hosts, start directory,
     * whole host, www-insensitive host, whole domain, and the explicit
     * allowedURLs prefixes — in that order.
     *
     * @param currURL URL to test
     * @return true if the URL is inside the configured crawl scope
     */
    protected boolean basicURLCheck(URL currURL) {
        String currURLStr = currURL.getHost() + currURL.getPath();
        String currHost = currURL.getHost().toLowerCase();
        String startHost = startURL.getHost().toLowerCase();

        if (walkToOtherHosts) {
            return true;
        }

        if (currURLStr.startsWith(startDir)) {
            return true;
        }

        if (allowWholeHost && (currURL.getHost().equalsIgnoreCase(startURL.getHost()))) {
            return true;
        }

        if (flexibleHostCheck) {
            if (cutWWW(currHost).equalsIgnoreCase(cutWWW(startHost))) {
                return true;
            }
        }

        if (allowWholeDomain) {
            if (currHost.endsWith(getDomain(startHost))) {
                return true;
            }
        }

        for (int i = 0; i < allowedURLs.size(); i++) {
            String s = (String) allowedURLs.elementAt(i);
            if (currURLStr.startsWith(s)) {
                return true;
            }
        }
        log.debug("URL " + currURLStr + " not allowed");
        return false;
    }

    /**
     * @param hostname host name
     * @return the host name without a leading "www." (case-insensitive)
     */
    private String cutWWW(String hostname) {
        if (hostname.toLowerCase().startsWith("www.")) {
            return hostname.substring(4);
        } else {
            return hostname;
        }
    }

    /**
     * @param hostname host name
     * @return everything after the first "."; the host itself if it has none
     */
    private String getDomain(String hostname) {
        int pos = hostname.indexOf(".");
        if (pos < 0) {
            return hostname;
        } else {
            return hostname.substring(pos + 1);
        }
    }

    /** @return the current exception handler (never null). */
    public RobotExceptionHandler getExceptionHandler() {
        return exceptionHandler;
    }

    /**
     * Sets the exception handler; null is ignored so the handler is
     * guaranteed to stay non-null.
     *
     * @param newExceptionHandler replacement handler, or null to keep current
     */
    public void setExceptionHandler(RobotExceptionHandler newExceptionHandler) {
        if (newExceptionHandler != null) {
            exceptionHandler = newExceptionHandler;
        }
    }

    /**
     * String convenience for setStartURL().
     * NOTE(review): a malformed URL is only printed, not reported — confirm.
     *
     * @param startURL start URL as a string
     */
    public void setStart(String startURL) {
        try {
            setStartURL(new URL(startURL));
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
    }

    /** @return the start URL in external form, or null if unset. */
    public String getStart() {
        URL url = getStartURL();
        if (url != null) {
            return url.toExternalForm();
        } else {
            return null;
        }
    }

    /** Releases the HttpTool, robots.txt checker, and doc manager. */
    public void finish() {
        if (httpTool != null) {
            httpTool.finish();
        }
        if (robCheck != null) {
            robCheck.finish();
        }
        if (docManager != null) {
            docManager.finish();
        }
    }

    /**
     * Developer utility: prints a "robot.field = field;" assignment line for
     * every non-static, non-final field (e.g. to scaffold a copy method).
     * Command-line arguments are ignored.
     */
    public static void main(String[] args) {
        if (args.length > 0) System.err.println("Arguments will be ignored!");
        Field[] fields = WebRobot.class.getDeclaredFields();
        StringBuffer str = new StringBuffer(60);
        for (int i = 0; i < fields.length; i++) {
            if (!Modifier.isFinal(fields[i].getModifiers())
                && !Modifier.isStatic(fields[i].getModifiers())) {
                str.delete(0, str.length());
                str.append(" robot." + fields[i].getName() + " = " + fields[i].getName() + ";");
                // pad to a fixed column before the type comment
                while (str.length() < 50) {
                    str.append(" ");
                }
                System.out.println(str.toString() + "// (" + fields[i].getType().getName() + ")");
            }
        }
    }

    /** Default sizing hint for task lists and the content map. */
    private static final int DEFAULT_EXPECTED_DOCUMENT_COUNT = 50000;

    /** Sizing hint actually used by this instance. */
    protected int expectedDocumentCount = DEFAULT_EXPECTED_DOCUMENT_COUNT;

    /** Maps content MD5 -> first URL seen with that content. */
    protected HashMap content2UrlMap;

    /** Statistics: documents found in the cache. */
    long countCache = 0;

    /** Statistics: documents fetched from the network. */
    long countWeb = 0;

    /** Statistics: cached documents confirmed unchanged (304). */
    long countNoRefresh = 0;

    /** Statistics: cached documents that had to be re-fetched. */
    long countRefresh = 0;

    /**
     * Looks up a previously seen URL with the same content MD5.
     *
     * @param doc document whose content MD5 is the key
     * @return the earlier URL with identical content, or null
     */
    public String getContentVisitedURL(HttpDoc doc) {
        Object key = doc.getContentMD5();
        synchronized (content2UrlMap) {
            String url = (String) content2UrlMap.get(key);
            return url;
        }
    }

    /**
     * Records a URL as the canonical location for a document's content MD5.
     *
     * @param doc document providing the MD5 key
     * @param url URL to record for that content
     */
    public void setContentVisitedURL(HttpDoc doc, String url) {
        Object key = doc.getContentMD5();
        synchronized (content2UrlMap) {
            content2UrlMap.put(key, url);
        }
    }

    /** Builds a RobotTask after stripping configured waste parameters. */
    private final RobotTask createRobotTask(URL url, int maxDepth, String startReferer) {
        url = removeWasteParameters(url);
        return new RobotTask(url, maxDepth, startReferer);
    }

    /** True once setFormHandlers() received a non-empty handler list. */
    boolean hasFormHandlers = false;

    /** Query-parameter names to strip from URLs before queuing (e.g. session ids). */
    protected Vector wasteParameters = new Vector();

    /** @param wasteParameters parameter names to strip from queued URLs. */
    public void setWasteParameters(Vector wasteParameters) {
        this.wasteParameters = wasteParameters;
    }

    /** @return parameter names stripped from queued URLs. */
    public Vector getWasteParameters() {
        return this.wasteParameters;
    }

    /**
     * Returns the URL with all configured waste parameters removed.
     *
     * @param url URL to clean
     * @return cleaned URL (the same object if nothing changed)
     */
    public URL removeWasteParameters(URL url) {
        String urlString = url.toExternalForm();
        String newUrlString = removeParametersFromString(urlString, wasteParameters);
        // reference comparison is deliberate: removeParametersFromString
        // returns the identical String instance when nothing was removed
        if (urlString != newUrlString) {
            try {
                url = new URL(newUrlString);
            } catch (MalformedURLException ex) {
                ex.printStackTrace();
            }
        };
        return url;
    }

    /**
     * Removes every "name=value" pair whose name is in wasteParameters from
     * the query string, preserving any "#fragment" suffix.
     *
     * @param urlString       URL as a string
     * @param wasteParameters parameter names to drop (may be null/empty)
     * @return the filtered URL string; the original instance if unchanged
     */
    public static String removeParametersFromString(String urlString, Vector wasteParameters) {
        if (wasteParameters != null && wasteParameters.size() > 0) {
            int questionMark = urlString.indexOf("?");
            if (questionMark > 0 && questionMark < urlString.length()) {
                // split query from an optional trailing fragment
                int restPosition = urlString.indexOf("#", questionMark);
                String parameters;
                String rest;
                if (restPosition < 0) {
                    parameters = urlString.substring(questionMark + 1);
                    rest = null;
                } else {
                    parameters = urlString.substring(questionMark + 1, restPosition);
                    rest = urlString.substring(restPosition);
                }

                StringBuffer filteredUrl = new StringBuffer(urlString.substring(0, questionMark));
                StringTokenizer tokenizer = new StringTokenizer(parameters, "&");
                String and = "?";
                boolean changed = false;
                while (tokenizer.hasMoreTokens()) {
                    String token = tokenizer.nextToken();
                    boolean keep = true;
                    for (int w = 0; w < wasteParameters.size(); w++) {
                        String wasteParameter = (String) wasteParameters.elementAt(w);
                        if (token.startsWith(wasteParameter + "=")) {
                            keep = false;
                            changed = true;
                            break;
                        }
                    }
                    if (keep) {
                        filteredUrl.append(and);
                        filteredUrl.append(token);
                        and = "&";
                    }
                }
                if (rest != null) filteredUrl.append(rest);
                if (changed) {
                    urlString = filteredUrl.toString();
                }
            }
        }
        return urlString;
    }

    /** Wall-clock time when this robot instance was created. */
    protected long startTime = System.currentTimeMillis();

    /** How often a failed retrieval is retried before falling back to cache. */
    protected int maxRetries = 0;

    /** @param maxRetries number of retries for failed retrievals. */
    public void setMaxRetries(int maxRetries) { this.maxRetries = maxRetries; }

    /** @return number of retries for failed retrievals. */
    public int getMaxRetries() { return maxRetries; }

    /** Max age (seconds) a cached copy may have to substitute a failed fetch; -1 = unlimited. */
    protected long expirationAge = -1;

    /** @param age cache-substitute expiration age in seconds (-1 = unlimited). */
    public void setExpirationAge(long age) { expirationAge = age; }

    /** @return cache-substitute expiration age in seconds. */
    public long getExpirationAge() { return expirationAge; }

    /**
     * @param url a URL string
     * @return the URL without its "?query" part (fragment included in the cut)
     */
    private final static String removeParameters(String url) {
        int pos = url.indexOf("?");
        return pos >= 0 ? url.substring(0, pos) : url;
    }

    /**
     * Reads a whole file into a byte array.
     * NOTE(review): a single read() call is not guaranteed to fill the
     * buffer; a read loop would be safer — confirm before relying on this
     * for large files.
     *
     * @param file file to read
     * @return buffer of file.length() bytes
     * @throws IOException on read failure
     */
    protected byte[] readFileToByteArray(File file) throws IOException
    {
        FileInputStream in = null;

        try
        {
            byte[] buffer = new byte[(int) file.length()];
            in = new FileInputStream(file);
            in.read(buffer);

            return buffer;
        }
        finally
        {
            if (in != null)
            {
                try
                {
                    in.close();
                }
                catch (IOException e)
                {
                    // best-effort close; nothing useful to do here
                }
            }
        }
    }

}