package websphinx;

import rcm.util.PriorityQueue;
import rcm.util.Timer;
import java.util.Vector;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.StringTokenizer;
import java.net.URL;
import java.net.MalformedURLException;
import java.io.IOException;
import java.io.Serializable;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;

/**
 * Multithreaded web crawler.
 *
 * <p>A crawl starts from a set of root links. Every submitted link is placed on
 * two priority queues: the <em>fetch queue</em> (links waiting for a worker
 * thread to download them) and the <em>crawl queue</em> (links that have not
 * finished processing yet). Worker threads ({@link Worm}) download pages; each
 * downloaded page is classified, optionally handed to the {@link Action} and to
 * {@link #visit}, and then expanded into further links by {@link #expand}.
 *
 * <p>Subclasses customize a crawl by overriding {@link #visit},
 * {@link #shouldVisit} and/or {@link #expand}.
 *
 * <p>Only the configuration is serialized; all crawl state is transient and is
 * rebuilt by {@code init()} on deserialization. Roots are serialized as URL
 * strings.
 *
 * <p>Locking: the crawl queue's monitor guards {@code state} transitions made by
 * {@link #pause}/{@link #stop}, the {@code worms} array and
 * {@code numPagesLeft}. Whenever both queue monitors are needed, the crawl queue
 * is locked before the fetch queue.
 */
public class Crawler implements Runnable, Serializable {

    private static final long serialVersionUID = -3757789861952010450L;

    /**
     * Domain value that places no restriction on followed links
     * ({@code null}: {@link #expand} skips the domain label test).
     */
    public static final String[] WEB = null;

    /**
     * Domain value that follows only links carrying the label {@code "local"}
     * (presumably links to the same server as the page containing them).
     */
    public static final String[] SERVER = {"local"};

    /**
     * Domain value that follows only links carrying the label
     * {@code "sibling"} or {@code "descendent"}.
     */
    public static final String[] SUBTREE = {"sibling", "descendent"};

    /** Link type value that follows only links labeled {@code "hyperlink"}. */
    public static final String[] HYPERLINKS = {"hyperlink"};

    /**
     * Link type value that follows links labeled {@code "hyperlink"} or
     * {@code "image"}.
     */
    public static final String[] HYPERLINKS_AND_IMAGES = {"hyperlink", "image"};

    /**
     * Link type value that places no restriction on link type
     * ({@code null}: {@link #expand} skips the type label test).
     */
    public static final String[] ALL_LINKS = null;

    // ---- Serialized configuration -------------------------------------------
    // NOTE: "name" must be initialized before "dp", which uses it as user agent.

    private String name = getClass().getName();
    private transient Link[] roots = null;
    // Scratch field: holds the root URLs only while the object is being
    // written or read (see writeObject/readObject); null at all other times.
    private String[] rootHrefs = null;
    private String[] domain = WEB;
    private boolean synchronous = false;
    private boolean depthFirst = true;
    private String[] type = HYPERLINKS;
    private boolean ignoreVisitedLinks = true;
    private int maxDepth = 5;
    private DownloadParameters dp = new DownloadParameters()
                                        .changeUserAgent(name);
    private Vector classifiers = new Vector();
    private LinkPredicate linkPredicate;
    private PagePredicate pagePredicate;
    private Action action;

    // ---- Transient crawl state (rebuilt by init()) --------------------------

    /** Roots of the current or most recent crawl; null after clear(). */
    private transient Link[] crawledRoots = null;

    /** One of the CrawlEvent state ids (CLEARED, STARTED, PAUSED, ...). */
    private transient int state = CrawlEvent.CLEARED;

    /** Worker threads; non-null only while run() is executing. */
    private transient Worm[] worms;

    /** Links waiting to be downloaded by a worm. */
    private transient PriorityQueue fetchQueue;
    /** Links submitted but not yet completely processed. */
    private transient PriorityQueue crawlQueue;

    private transient int numLinksTested;
    private transient int numPagesVisited;
    /** Number of links on the crawl queue; guarded by the crawlQueue monitor. */
    private transient int numPagesLeft;

    private transient Vector crawlListeners;
    private transient Vector linkListeners;

    /** Keys are page URL strings of every link submitted so far. */
    private transient Hashtable visitedPages;

    private transient RobotExclusion robotExclusion;

    /**
     * Makes a new crawler with a {@link StandardClassifier} installed and
     * default settings.
     */
    public Crawler() {
        addClassifier(new StandardClassifier());
        init();
    }

    /**
     * (Re)creates all transient crawl state. Called from the constructor and
     * after deserialization.
     */
    private void init() {
        state = CrawlEvent.CLEARED;

        numLinksTested = 0;
        numPagesVisited = 0;
        numPagesLeft = 0;

        worms = null;
        crawlQueue = new PriorityQueue();
        fetchQueue = new PriorityQueue();

        crawlListeners = new Vector();
        linkListeners = new Vector();

        visitedPages = new Hashtable();
        robotExclusion = new RobotExclusion(getName());
    }

    /**
     * Serialization hook: stores the roots as URL strings (the {@code roots}
     * field itself is transient) and then writes the default fields.
     */
    private void writeObject(ObjectOutputStream out) throws IOException {
        if (roots != null) {
            rootHrefs = new String[roots.length];
            for (int i = 0; i < roots.length; ++i) {
                rootHrefs[i] = roots[i].getURL().toString();
            }
        } else {
            rootHrefs = null;
        }

        try {
            out.defaultWriteObject();
        } finally {
            // Always drop the scratch copy, even if writing failed.
            rootHrefs = null;
        }
    }

    /**
     * Deserialization hook: reads the default fields, rebuilds the roots from
     * their URL strings, canonicalizes {@code domain} and {@code type} to the
     * shared constant arrays, recreates the transient state, and reconnects
     * the predicates and the action to this crawler.
     */
    private void readObject(ObjectInputStream in)
            throws IOException, ClassNotFoundException {
        in.defaultReadObject();

        if (rootHrefs != null) {
            roots = new Link[rootHrefs.length];
            for (int i = 0; i < rootHrefs.length; ++i) {
                roots[i] = new Link(rootHrefs[i]);
            }
            rootHrefs = null; // scratch field, no longer needed
        } else {
            roots = null;
        }

        // Deserialization produces fresh arrays; map them back onto the
        // constants so identity comparisons against WEB, SERVER, ... work.
        domain = useStandard(WEB, domain);
        domain = useStandard(SERVER, domain);
        domain = useStandard(SUBTREE, domain);

        type = useStandard(HYPERLINKS, type);
        type = useStandard(HYPERLINKS_AND_IMAGES, type);
        type = useStandard(ALL_LINKS, type);

        init();

        if (linkPredicate != null) {
            linkPredicate.connected(this);
        }
        if (pagePredicate != null) {
            pagePredicate.connected(this);
        }
        if (action != null) {
            action.connected(this);
        }
    }

    /**
     * Returns {@code standard} if {@code s} has the same length and equal
     * elements in the same order; otherwise returns {@code s} unchanged
     * (including when either argument is null).
     */
    private static String[] useStandard(String[] standard, String[] s) {
        if (s == null || standard == null || standard == s) {
            return s;
        }
        if (s.length != standard.length) {
            return s;
        }
        for (int i = 0; i < s.length; ++i) {
            if (!s[i].equals(standard[i])) {
                return s;
            }
        }
        return standard;
    }

    /**
     * Starts (or resumes) crawling and returns when the crawl is finished,
     * paused, stopped or timed out.
     *
     * <p>If the crawler is in the STOPPED state it is cleared first. If it is
     * (then) in the CLEARED state, the roots are given priorities in
     * [0, 1) in their array order and submitted. In any other state the
     * existing queues are simply resumed.
     *
     * <p>If the calling thread is interrupted while waiting, the crawl loop
     * ends, the worker threads are shut down, and the thread's interrupt
     * status is restored before returning.
     */
    public void run() {
        // Clear BEFORE snapshotting the roots: clear() resets crawledRoots to
        // null, so doing it the other way round would discard the roots and a
        // restarted crawl would never submit them.
        if (state == CrawlEvent.STOPPED) {
            clear();
        }

        crawledRoots = roots;

        if (state == CrawlEvent.CLEARED && crawledRoots != null) {
            float priority = 0;
            float increment = 1.0f / crawledRoots.length;
            for (int i = 0; i < crawledRoots.length; ++i) {
                crawledRoots[i].setPriority(priority);
                priority += increment;
            }
            submit(crawledRoots);
        }

        state = CrawlEvent.STARTED;
        sendCrawlEvent(state);

        synchronized (crawlQueue) {
            Timer timer = new CrawlTimer(this);
            int timeout = dp.getCrawlTimeout();
            if (timeout > 0) {
                timer.set(timeout * 1000, false);
            }

            int nWorms = Math.max(dp.getMaxThreads(), 1);
            worms = new Worm[nWorms];
            for (int i = 0; i < nWorms; ++i) {
                worms[i] = new Worm(this, i);
                worms[i].start();
            }

            boolean interrupted = false;
            try {
                while (state == CrawlEvent.STARTED) {
                    if (numPagesLeft == 0) {
                        // Nothing left to crawl: normal termination.
                        state = CrawlEvent.STOPPED;
                        sendCrawlEvent(state);
                    } else if (synchronous) {
                        // In synchronous mode pages are processed by this
                        // thread, strictly in crawl-queue order.
                        Link link = (Link) crawlQueue.getMin();
                        if (link.getStatus() == LinkEvent.DOWNLOADED) {
                            process(link);
                        } else {
                            crawlQueue.wait();
                        }
                    } else {
                        crawlQueue.wait();
                    }
                }
            } catch (InterruptedException e) {
                // Leave the loop and shut the worms down; the interrupt
                // status is restored below once cleanup is complete.
                interrupted = true;
            }

            timer.cancel();

            for (int i = 0; i < worms.length; ++i) {
                worms[i].die();
            }
            if (state == CrawlEvent.PAUSED) {
                // Put links that were being fetched back on the fetch queue
                // so that a later run() picks them up again.
                synchronized (fetchQueue) {
                    for (int i = 0; i < worms.length; ++i) {
                        Link pending = worms[i].link;
                        if (pending != null) {
                            fetchQueue.put(pending);
                        }
                    }
                }
            }
            worms = null;

            if (interrupted) {
                Thread.currentThread().interrupt();
            }
        }
    }

    /**
     * Stops the crawl (if running or paused), resets the page and link
     * counters, forgets all visited pages, disconnects the crawled roots, and
     * puts the crawler in the CLEARED state, notifying crawl listeners.
     */
    public void clear() {
        stop();
        numPagesVisited = 0;
        numLinksTested = 0;
        clearVisited();
        if (crawledRoots != null) {
            for (int i = 0; i < crawledRoots.length; ++i) {
                crawledRoots[i].disconnect();
            }
        }
        crawledRoots = null;
        state = CrawlEvent.CLEARED;
        sendCrawlEvent(state);
    }

    /**
     * Pauses a running crawl: switches to the PAUSED state, wakes the thread
     * executing {@link #run} and notifies crawl listeners. The queues are kept,
     * so calling {@link #run} again resumes the crawl. Has no effect unless
     * the crawler is in the STARTED state.
     */
    public void pause() {
        if (state == CrawlEvent.STARTED) {
            synchronized (crawlQueue) {
                state = CrawlEvent.PAUSED;
                crawlQueue.notify();
            }
            sendCrawlEvent(state);
        }
    }

    /**
     * Stops a running or paused crawl: empties both queues, switches to the
     * STOPPED state, wakes the thread executing {@link #run} and notifies
     * crawl listeners. Has no effect in any other state.
     */
    public void stop() {
        if (state == CrawlEvent.STARTED || state == CrawlEvent.PAUSED) {
            halt(CrawlEvent.STOPPED);
        }
    }

    /**
     * Called by {@link CrawlTimer} when the crawl timeout expires. Behaves
     * like {@link #stop} but ends in the TIMED_OUT state; only acts on a
     * STARTED crawl.
     */
    void timedOut() {
        if (state == CrawlEvent.STARTED) {
            halt(CrawlEvent.TIMED_OUT);
        }
    }

    /**
     * Shared implementation of stop() and timedOut(): empties both queues,
     * enters {@code newState}, wakes run() and notifies crawl listeners.
     */
    private void halt(int newState) {
        synchronized (crawlQueue) {
            synchronized (fetchQueue) {
                state = newState;
                fetchQueue.clear();
                crawlQueue.clear();
                numPagesLeft = 0;
                crawlQueue.notify();
            }
        }
        sendCrawlEvent(state);
    }

    /**
     * Gets the crawler state.
     *
     * @return one of the CrawlEvent state ids (e.g. {@code CrawlEvent.STARTED})
     */
    public int getState() {
        return state;
    }

    /**
     * Callback invoked for every page that passes the page predicate (after
     * the {@link Action}, if any). The default implementation does nothing;
     * override it to process pages.
     *
     * @param page the page being visited
     */
    public void visit(Page page) {
    }

    /**
     * Callback consulted by {@link #expand} to decide whether a link should be
     * followed, after the type, domain and link-predicate tests have passed.
     * The default implementation accepts every link.
     *
     * @param l the link under consideration
     * @return true if the link should be submitted for crawling
     */
    public boolean shouldVisit(Link l) {
        return true;
    }

    /**
     * Examines the links of a page and submits those that should be crawled.
     *
     * <p>Each link gets a priority derived from the number of pages visited so
     * far (negated in depth-first mode) plus a fraction that preserves the
     * links' order within the page. For each link exactly one of the
     * following happens, tested in this order: ALREADY_VISITED event (if
     * visited links are ignored and the link was visited), SKIPPED event (if
     * it fails the type, domain, link predicate or {@link #shouldVisit} test),
     * TOO_DEEP event (if the page's depth has reached the maximum depth), or
     * {@link #submit(Link)}.
     *
     * @param page the page whose links are expanded
     */
    public void expand(Page page) {
        Link[] links = page.getLinks();

        if (links != null && links.length > 0) {
            float priority = (depthFirst ? -numPagesVisited : numPagesVisited);
            float increment = 1.0f / links.length;

            for (int i = 0; i < links.length; ++i) {
                Link l = links[i];

                l.setPriority(priority);
                priority += increment;
                l.setDownloadParameters(dp);

                ++numLinksTested;
                if (ignoreVisitedLinks && visited(l)) {
                    sendLinkEvent(l, LinkEvent.ALREADY_VISITED);
                } else if (!((type == null || l.hasAnyLabels(type))
                             && (domain == null || l.hasAnyLabels(domain))
                             && (linkPredicate == null || linkPredicate.shouldVisit(l))
                             && shouldVisit(l))) {
                    sendLinkEvent(l, LinkEvent.SKIPPED);
                } else if (page.getDepth() >= maxDepth) {
                    sendLinkEvent(l, LinkEvent.TOO_DEEP);
                } else {
                    submit(l);
                }
            }
        }
    }

    // ---- Crawl statistics ----------------------------------------------------

    /**
     * @return number of pages processed since the crawler was last cleared
     */
    public int getPagesVisited() {
        return numPagesVisited;
    }

    /**
     * @return number of links examined by {@link #expand} since the crawler
     *         was last cleared
     */
    public int getLinksTested() {
        return numLinksTested;
    }

    /**
     * @return number of submitted links that have not finished processing
     */
    public int getPagesLeft() {
        return numPagesLeft;
    }

    /**
     * @return number of worker threads that currently hold a link; 0 when no
     *         crawl is running
     */
    public int getActiveThreads() {
        Worm[] w = worms; // snapshot: run() may null the field concurrently

        if (w == null) {
            return 0;
        }

        int n = 0;
        for (int i = 0; i < w.length; ++i) {
            if (w[i] != null && w[i].link != null) {
                ++n;
            }
        }
        return n;
    }

    // ---- Configuration -------------------------------------------------------

    /**
     * Gets the crawler's name, which defaults to its class name. The name is
     * also used in the names of the worker threads.
     *
     * @return the crawler name
     */
    public String getName() {
        return name;
    }

    /**
     * Sets the crawler's name.
     *
     * @param name new name
     */
    public void setName(String name) {
        this.name = name;
    }

    /**
     * @return the crawler's name
     */
    public String toString() {
        return getName();
    }

    /**
     * Gets the root links.
     *
     * @return a copy of the root array; empty (never null) if no roots are set
     */
    public Link[] getRoots() {
        if (roots == null) {
            return new Link[0];
        }

        Link[] result = new Link[roots.length];
        System.arraycopy(roots, 0, result, 0, roots.length);
        return result;
    }

    /**
     * Gets the roots of the current or most recent crawl.
     *
     * @return a copy of the crawled-root array, or null if there is none
     *         (e.g. after {@link #clear})
     */
    public Link[] getCrawledRoots() {
        if (crawledRoots == null) {
            return null;
        }

        Link[] result = new Link[crawledRoots.length];
        System.arraycopy(crawledRoots, 0, result, 0, crawledRoots.length);
        return result;
    }

    /**
     * Gets the root URLs as a single string.
     *
     * @return the external forms of the root URLs separated by newlines;
     *         the empty string if no roots are set
     */
    public String getRootHrefs() {
        StringBuffer buf = new StringBuffer();
        if (roots != null) {
            for (int i = 0; i < roots.length; ++i) {
                if (buf.length() > 0) {
                    buf.append('\n');
                }
                buf.append(roots[i].getURL().toExternalForm());
            }
        }
        return buf.toString();
    }

    /**
     * Replaces the roots with the URLs found in a string.
     *
     * @param hrefs URLs separated by whitespace; null is treated like the
     *              empty string and results in no roots
     * @throws MalformedURLException if any token is not a valid URL
     */
    public void setRootHrefs(String hrefs) throws MalformedURLException {
        Vector v = new Vector();
        if (hrefs != null) {
            StringTokenizer tok = new StringTokenizer(hrefs);
            while (tok.hasMoreElements()) {
                v.addElement(new Link(tok.nextToken()));
            }
        }
        roots = new Link[v.size()];
        v.copyInto(roots);
    }

    /**
     * Replaces the roots with a single link.
     *
     * @param link the only root
     */
    public void setRoot(Link link) {
        roots = new Link[1];
        roots[0] = link;
    }

    /**
     * Replaces the roots with a copy of the given array.
     *
     * @param links new roots; null removes all roots
     */
    public void setRoots(Link[] links) {
        if (links == null) {
            roots = null;
            return;
        }
        roots = new Link[links.length];
        System.arraycopy(links, 0, roots, 0, links.length);
    }

    /**
     * Appends a link to the roots.
     *
     * @param link root to add
     */
    public void addRoot(Link link) {
        if (roots == null) {
            setRoot(link);
        } else {
            Link[] newroots = new Link[roots.length + 1];
            System.arraycopy(roots, 0, newroots, 0, roots.length);
            newroots[newroots.length - 1] = link;
            roots = newroots;
        }
    }

    /**
     * @return the link labels that restrict the crawl domain (e.g.
     *         {@link #SERVER}), or null for no restriction ({@link #WEB})
     */
    public String[] getDomain() {
        return domain;
    }

    /**
     * @param domain link labels of which a link must carry at least one to be
     *               followed, or null for no restriction
     */
    public void setDomain(String[] domain) {
        this.domain = domain;
    }

    /**
     * @return the link labels that restrict the followed link types (e.g.
     *         {@link #HYPERLINKS}), or null for no restriction
     *         ({@link #ALL_LINKS})
     */
    public String[] getLinkType() {
        return type;
    }

    /**
     * @param type link labels of which a link must carry at least one to be
     *             followed, or null for no restriction
     */
    public void setLinkType(String[] type) {
        this.type = type;
    }

    /**
     * @return true if links are prioritized depth-first (the default)
     */
    public boolean getDepthFirst() {
        return depthFirst;
    }

    /**
     * @param useDFS true to prioritize the links of recently visited pages
     *               first (depth-first), false for the opposite order
     */
    public void setDepthFirst(boolean useDFS) {
        depthFirst = useDFS;
    }

    /**
     * @return true if pages are processed by the thread executing
     *         {@link #run} in crawl-queue order; false (the default) if each
     *         worker thread processes the pages it downloads
     */
    public boolean getSynchronous() {
        return synchronous;
    }

    /**
     * @param f true for synchronous processing (see {@link #getSynchronous})
     */
    public void setSynchronous(boolean f) {
        synchronous = f;
    }

    /**
     * @return true if links to already-submitted pages are not followed again
     *         (the default)
     */
    public boolean getIgnoreVisitedLinks() {
        return ignoreVisitedLinks;
    }

    /**
     * @param f true to skip links whose page was already submitted
     */
    public void setIgnoreVisitedLinks(boolean f) {
        ignoreVisitedLinks = f;
    }

    /**
     * @return the page depth at which links are no longer followed
     *         (default 5)
     */
    public int getMaxDepth() {
        return maxDepth;
    }

    /**
     * @param maxDepth links found on pages whose depth is at least this value
     *                 are not followed
     */
    public void setMaxDepth(int maxDepth) {
        this.maxDepth = maxDepth;
    }

    /**
     * @return the download parameters applied to links found by
     *         {@link #expand} and used for crawl-wide settings such as the
     *         thread count and crawl timeout
     */
    public DownloadParameters getDownloadParameters() {
        return dp;
    }

    /**
     * @param dp new download parameters
     */
    public void setDownloadParameters(DownloadParameters dp) {
        this.dp = dp;
    }

    /**
     * Sets the link predicate consulted by {@link #expand}. The old predicate
     * (if any) is told it was disconnected and the new one that it was
     * connected. Does nothing if {@code pred} is identical or equal to the
     * current predicate.
     *
     * @param pred new link predicate, or null for none
     */
    public void setLinkPredicate(LinkPredicate pred) {
        if (pred == linkPredicate
            || (pred != null && pred.equals(linkPredicate))) {
            return;
        }
        if (linkPredicate != null) {
            linkPredicate.disconnected(this);
        }
        linkPredicate = pred;
        if (linkPredicate != null) {
            linkPredicate.connected(this);
        }
    }

    /**
     * @return the current link predicate, or null if none
     */
    public LinkPredicate getLinkPredicate() {
        return linkPredicate;
    }

    /**
     * Sets the page predicate that decides which pages are passed to the
     * action and to {@link #visit}. The old predicate (if any) is told it was
     * disconnected and the new one that it was connected. Does nothing if
     * {@code pred} is identical or equal to the current predicate.
     *
     * @param pred new page predicate, or null to act on every page
     */
    public void setPagePredicate(PagePredicate pred) {
        if (pred == pagePredicate
            || (pred != null && pred.equals(pagePredicate))) {
            return;
        }
        if (pagePredicate != null) {
            pagePredicate.disconnected(this);
        }
        pagePredicate = pred;
        if (pagePredicate != null) {
            pagePredicate.connected(this);
        }
    }

    /**
     * @return the current page predicate, or null if none
     */
    public PagePredicate getPagePredicate() {
        return pagePredicate;
    }

    /**
     * Sets the action applied to accepted pages before {@link #visit} is
     * called. The old action (if any) is told it was disconnected and the new
     * one that it was connected. Does nothing if {@code act} is identical or
     * equal to the current action.
     *
     * @param act new action, or null for none
     */
    public void setAction(Action act) {
        if (act == action
            || (act != null && act.equals(action))) {
            return;
        }
        if (action != null) {
            action.disconnected(this);
        }
        action = act;
        if (action != null) {
            action.connected(this);
        }
    }

    /**
     * @return the current action, or null if none
     */
    public Action getAction() {
        return action;
    }

    // ---- Queue management ----------------------------------------------------

    /**
     * Queues a link for crawling: marks its page as visited, sends a QUEUED
     * link event, puts the link on both queues and wakes waiting workers.
     *
     * @param link link to crawl
     */
    public void submit(Link link) {
        markVisited(link);
        sendLinkEvent(link, LinkEvent.QUEUED);
        synchronized (crawlQueue) {
            synchronized (fetchQueue) {
                crawlQueue.put(link);
                ++numPagesLeft;
                fetchQueue.put(link);
                fetchQueue.notifyAll();
            }
        }
    }

    /**
     * Queues several links for crawling, in array order.
     *
     * @param links links to crawl
     */
    public void submit(Link[] links) {
        for (int i = 0; i < links.length; ++i) {
            submit(links[i]);
        }
    }

    /**
     * @return an enumeration of the links currently on the crawl queue
     */
    public Enumeration enumerateQueue() {
        return crawlQueue.elements();
    }

    // ---- Classifiers ---------------------------------------------------------

    /**
     * Adds a classifier unless it is already registered. Classifiers are kept
     * sorted by ascending priority; among equal priorities the earlier
     * registration comes first. They are applied to each page in that order.
     *
     * @param c classifier to add
     */
    public void addClassifier(Classifier c) {
        if (!classifiers.contains(c)) {
            float cpriority = c.getPriority();

            for (int i = 0; i < classifiers.size(); ++i) {
                Classifier d = (Classifier) classifiers.elementAt(i);
                if (cpriority < d.getPriority()) {
                    classifiers.insertElementAt(c, i);
                    return;
                }
            }
            classifiers.addElement(c);
        }
    }

    /**
     * Removes a classifier; no effect if it is not registered.
     *
     * @param c classifier to remove
     */
    public void removeClassifier(Classifier c) {
        classifiers.removeElement(c);
    }

    /** Removes all classifiers. */
    public void removeAllClassifiers() {
        classifiers.removeAllElements();
    }

    /**
     * @return an enumeration of the registered classifiers in application
     *         order
     */
    public Enumeration enumerateClassifiers() {
        return classifiers.elements();
    }

    /**
     * @return a new array holding the registered classifiers in application
     *         order
     */
    public Classifier[] getClassifiers() {
        Classifier[] c = new Classifier[classifiers.size()];
        classifiers.copyInto(c);
        return c;
    }

    // ---- Listeners and events ------------------------------------------------

    /**
     * Registers a listener for crawl state changes; duplicates are ignored.
     *
     * @param listen listener to add
     */
    public void addCrawlListener(CrawlListener listen) {
        if (!crawlListeners.contains(listen)) {
            crawlListeners.addElement(listen);
        }
    }

    /**
     * Unregisters a crawl listener; no effect if it is not registered.
     *
     * @param listen listener to remove
     */
    public void removeCrawlListener(CrawlListener listen) {
        crawlListeners.removeElement(listen);
    }

    /**
     * Registers a listener for link events; duplicates are ignored.
     *
     * @param listen listener to add
     */
    public void addLinkListener(LinkListener listen) {
        if (!linkListeners.contains(listen)) {
            linkListeners.addElement(listen);
        }
    }

    /**
     * Unregisters a link listener; no effect if it is not registered.
     *
     * @param listen listener to remove
     */
    public void removeLinkListener(LinkListener listen) {
        linkListeners.removeElement(listen);
    }

    /**
     * Delivers a crawl event to every crawl listener by calling the listener
     * method that corresponds to the event id. Unknown ids are ignored.
     *
     * @param id one of the CrawlEvent ids (STARTED, STOPPED, CLEARED,
     *           TIMED_OUT, PAUSED)
     */
    protected void sendCrawlEvent(int id) {
        CrawlEvent evt = new CrawlEvent(this, id);
        for (int j = 0, len = crawlListeners.size(); j < len; ++j) {
            CrawlListener listen = (CrawlListener) crawlListeners.elementAt(j);
            switch (id) {
                case CrawlEvent.STARTED:
                    listen.started(evt);
                    break;
                case CrawlEvent.STOPPED:
                    listen.stopped(evt);
                    break;
                case CrawlEvent.CLEARED:
                    listen.cleared(evt);
                    break;
                case CrawlEvent.TIMED_OUT:
                    listen.timedOut(evt);
                    break;
                case CrawlEvent.PAUSED:
                    listen.paused(evt);
                    break;
            }
        }
    }

    /**
     * Records {@code id} as the link's status and delivers a link event to
     * every link listener.
     *
     * @param l  link the event is about
     * @param id one of the LinkEvent ids
     */
    protected void sendLinkEvent(Link l, int id) {
        LinkEvent evt = new LinkEvent(this, id, l);
        l.setStatus(id);
        for (int j = 0, len = linkListeners.size(); j < len; ++j) {
            LinkListener listen = (LinkListener) linkListeners.elementAt(j);
            listen.crawled(evt);
        }
    }

    /**
     * Like {@link #sendLinkEvent(Link, int)}, but also attaches the exception
     * to the event and stores its string form in the link's
     * {@code "exception"} label.
     *
     * @param l         link the event is about
     * @param id        one of the LinkEvent ids (normally ERROR)
     * @param exception the failure; must not be null
     */
    protected void sendLinkEvent(Link l, int id, Throwable exception) {
        LinkEvent evt = new LinkEvent(this, id, l, exception);
        l.setStatus(id);
        l.setLabel("exception", exception.toString());
        for (int j = 0, len = linkListeners.size(); j < len; ++j) {
            LinkListener listen = (LinkListener) linkListeners.elementAt(j);
            listen.crawled(evt);
        }
    }

    // ---- Visited pages -------------------------------------------------------

    /**
     * Tests whether the page a link points to has already been submitted in
     * this crawl.
     *
     * @param link link to test
     * @return true if the link's page URL has been marked visited
     */
    public boolean visited(Link link) {
        return visitedPages.containsKey(link.getPageURL().toString());
    }

    /**
     * Records the link's page URL as visited.
     *
     * @param link link whose page is marked
     */
    protected void markVisited(Link link) {
        visitedPages.put(link.getPageURL().toString(), this);
    }

    /** Forgets all visited pages. */
    protected void clearVisited() {
        visitedPages.clear();
    }

    // ---- Fetch loop ------------------------------------------------------------

    /**
     * Body of a worker thread: repeatedly takes the minimum link off the fetch
     * queue, downloads it, and either processes it (asynchronous mode) or
     * wakes the thread executing {@link #run} to process it (synchronous
     * mode). Returns as soon as the worm is marked dead.
     *
     * @param w the worm whose thread is executing this method
     */
    void fetch(Worm w) {
        Timer timer = new WormTimer(w);

        while (!w.dead) {
            synchronized (fetchQueue) {
                while (!w.dead
                       && (w.link = (Link) fetchQueue.deleteMin()) == null) {
                    try {
                        fetchQueue.wait();
                    } catch (InterruptedException e) {
                        // Interruption is only a wake-up call here (die() may
                        // use it); the loop condition re-checks w.dead.
                    }
                }
            }

            if (w.dead) {
                return;
            }

            // Only this thread assigns w.link, so the snapshot stays valid
            // for the rest of the iteration.
            Link link = w.link;

            try {
                DownloadParameters dp = link.getDownloadParameters();
                if (dp == null) {
                    dp = this.dp;
                }
                int timeout = dp.getDownloadTimeout();

                sendLinkEvent(link, LinkEvent.RETRIEVING);
                try {
                    if (timeout > 0) {
                        timer.set(timeout * 1000, false);
                    }

                    if (dp.getObeyRobotExclusion()
                        && robotExclusion.disallowed(link.getURL())) {
                        throw new IOException("disallowed by Robot Exclusion Standard (robots.txt)");
                    }

                    // The result is not kept: process() later obtains the
                    // page through link.getPage(), so the constructor
                    // presumably attaches the new page to the link.
                    new Page(link, dp);
                } finally {
                    timer.cancel();
                }

                if (w.dead) {
                    return;
                }

                sendLinkEvent(link, LinkEvent.DOWNLOADED);

                if (synchronous) {
                    // The run() thread processes pages in queue order;
                    // just tell it that another page is ready.
                    synchronized (crawlQueue) {
                        crawlQueue.notify();
                    }
                } else {
                    process(link);
                }

                w.link = null;
            } catch (ThreadDeath e) {
                throw e; // must propagate when the worm is stopped
            } catch (Throwable e) {
                // A dead worm's link has already been accounted for by
                // whoever killed it (fetchTimedOut or run).
                if (w.dead) {
                    return;
                }

                sendLinkEvent(link, LinkEvent.ERROR, e);
                synchronized (crawlQueue) {
                    pageFinished(link);
                    w.link = null;
                    crawlQueue.notify();
                }
            }
        }
    }

    /**
     * Processes a downloaded page: runs the classifiers, counts the page,
     * applies the action and {@link #visit} if the page predicate accepts it,
     * expands its links, sends a VISITED link event and finally removes the
     * link from the crawl queue.
     *
     * @param link link whose page has been downloaded
     */
    void process(Link link) {
        Page page = link.getPage();

        for (int j = 0, len = classifiers.size(); j < len; ++j) {
            Classifier cl = (Classifier) classifiers.elementAt(j);
            cl.classify(page);
        }

        ++numPagesVisited;
        if (pagePredicate == null || pagePredicate.shouldActOn(page)) {
            if (action != null) {
                action.visit(page);
            }
            visit(page);
        }
        expand(page);

        sendLinkEvent(link, LinkEvent.VISITED);

        synchronized (crawlQueue) {
            pageFinished(link);
            crawlQueue.notify();
        }
    }

    /**
     * Removes a finished (visited or failed) link from the crawl queue and
     * updates the count of pages left. The caller must hold the crawlQueue
     * monitor.
     *
     * <p>The count is never driven below zero: stop() and timedOut() reset it
     * to 0 while worker threads may still be finishing a page, and a negative
     * count would corrupt the termination test of the next crawl.
     */
    private void pageFinished(Link link) {
        crawlQueue.delete(link);
        if (numPagesLeft > 0) {
            --numPagesLeft;
        }
    }

    /**
     * Called by {@link WormTimer} when a download exceeds its timeout: kills
     * the worm, reports an ERROR for its link, removes the link from the
     * crawl queue and starts a replacement worm in the same slot.
     *
     * @param w        worm whose download timed out
     * @param interval timeout in seconds (used in the error message)
     */
    void fetchTimedOut(Worm w, int interval) {
        if (w.dead) {
            return;
        }

        w.die();

        // The worm may have finished its link just as the timer fired.
        Link link = w.link;
        if (link != null) {
            sendLinkEvent(link, LinkEvent.ERROR,
                          new IOException("Timeout after " + interval + " seconds"));
        }

        synchronized (crawlQueue) {
            if (link != null) {
                pageFinished(link);
            }

            // worms is null once run() has shut the crawl down; in that case
            // there is no slot to refill.
            if (worms != null) {
                worms[w.i] = new Worm(this, w.i);
                worms[w.i].start();
            }

            crawlQueue.notify();
        }
    }

    /**
     * Loads a serialized crawler from a file and runs it with an event log
     * attached.
     *
     * @param args {@code args[0]} is the name of the file containing the
     *             serialized crawler
     * @throws Exception if the file cannot be read or deserialized
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            throw new IllegalArgumentException(
                "usage: java websphinx.Crawler <serialized-crawler-file>");
        }

        Crawler loadedCrawler;
        java.io.FileInputStream file = new java.io.FileInputStream(args[0]);
        try {
            java.io.ObjectInputStream in = new java.io.ObjectInputStream(file);
            loadedCrawler = (Crawler) in.readObject();
        } finally {
            // Closing the file stream releases the descriptor even if the
            // object stream could not be created or read.
            file.close();
        }

        EventLog.monitor(loadedCrawler).setOnlyNetworkEvents(false);
        loadedCrawler.run();
    }
}

/**
 * Worker thread of a {@link Crawler}. Runs {@link Crawler#fetch} until it is
 * told to die.
 */
class Worm extends Thread {
    /** Crawler this worm works for. */
    Crawler crawler;
    /** Index of this worm in the crawler's worm array. */
    int i;
    /** Link currently being fetched or processed; null when idle. Read by other threads. */
    volatile Link link;
    /** Set once by die(); read by the worm's own thread and by timer threads. */
    volatile boolean dead = false;

    /**
     * Creates a daemon thread named after the crawler and the worm index.
     *
     * @param crawler crawler to fetch pages for
     * @param i       index of this worm in the crawler's worm array
     */
    public Worm(Crawler crawler, int i) {
        super(crawler.getName() + " worm " + i);
        setDaemon(true);
        this.crawler = crawler;
        this.i = i;
    }

    public void run() {
        crawler.fetch(this);
    }

    /**
     * Marks this worm dead and terminates its thread.
     *
     * <p>Where the JVM still supports it, the thread is stopped forcibly.
     * Since Java 20 {@code Thread.stop()} throws
     * {@code UnsupportedOperationException}; in that case the thread is
     * interrupted instead and exits cooperatively the next time
     * {@link Crawler#fetch} checks the {@code dead} flag. A worm blocked in
     * network I/O may then linger until the I/O returns; it is a daemon
     * thread, so it cannot keep the JVM alive.
     */
    public void die() {
        dead = true;
        try {
            stop();
        } catch (UnsupportedOperationException e) {
            interrupt();
        }
    }
}

/**
 * Timer that reports a download timeout for one worm to its crawler.
 */
class WormTimer extends Timer {
    Worm worm;

    public WormTimer(Worm worm) {
        this.worm = worm;
    }

    /** Reports the timeout, converting the interval from milliseconds to seconds. */
    protected void alarm() {
        worm.crawler.fetchTimedOut(worm, getInterval() / 1000);
    }
}

/**
 * Timer that reports the overall crawl timeout to the crawler.
 */
class CrawlTimer extends Timer {
    Crawler crawler;

    public CrawlTimer(Crawler crawler) {
        this.crawler = crawler;
    }

    protected void alarm() {
        crawler.timedOut();
    }
}