1 19 20 33 package org.htmlparser.parserapplications; 34 import org.htmlparser.Node; 35 import org.htmlparser.Parser; 36 import org.htmlparser.tags.LinkTag; 37 import org.htmlparser.util.DefaultParserFeedback; 38 import org.htmlparser.util.NodeIterator; 39 import org.htmlparser.util.ParserException; 40 43 public class Robot 44 { 45 private org.htmlparser.Parser parser; 46 49 public Robot(String resourceLocation) 50 { 51 try 52 { 53 parser = new Parser(resourceLocation, new DefaultParserFeedback()); 54 parser.registerScanners(); 55 } 56 catch (ParserException e) 57 { 58 System.err.println("Error, could not create parser object"); 59 e.printStackTrace(); 60 } 61 } 62 66 public void crawl(int crawlDepth) throws ParserException 67 { 68 try 69 { 70 crawl(parser, crawlDepth); 71 } 72 catch (ParserException e) 73 { 74 throw new ParserException( 75 "HTMLParserException at crawl(" + crawlDepth + ")", 76 e); 77 } 78 } 79 84 public void crawl(Parser parser, int crawlDepth) throws ParserException 85 { 86 System.out.println(" crawlDepth = " + crawlDepth); 87 for (NodeIterator e = parser.elements(); e.hasMoreNodes();) 88 { 89 Node node = e.nextNode(); 90 if (node instanceof LinkTag) 91 { 92 LinkTag linkTag = (LinkTag) node; 93 { 94 if (!linkTag.isMailLink()) 95 { 96 if (linkTag.getLink().toUpperCase().indexOf("HTM") 97 != -1 98 || linkTag.getLink().toUpperCase().indexOf("COM") 99 != -1 100 || linkTag.getLink().toUpperCase().indexOf("ORG") 101 != -1) 102 { 103 if (crawlDepth > 0) 104 { 105 Parser newParser = 106 new Parser( 107 linkTag.getLink(), 108 new DefaultParserFeedback()); 109 newParser.registerScanners(); 110 System.out.print( 111 "Crawling to " + linkTag.getLink()); 112 crawl(newParser, crawlDepth - 1); 113 } 114 else 115 System.out.println(linkTag.getLink()); 116 } 117 } 118 } 119 } 120 } 121 } 122 123 public static void main(String [] args) 124 { 125 System.out.println("Robot Crawler v" + Parser.getVersion()); 126 if (args.length < 2 || args[0].equals("-help")) 127 { 128 System.out.println(); 129 System.out.println( 130 "Syntax : java -classpath htmlparser.jar org.htmlparser.parserapplications.Robot <resourceLocn/website> <depth>"); 131 System.out.println(); 132 System.out.println( 133 " <resourceLocn> the name of the file to be parsed (with complete path "); 134 System.out.println( 135 " if not in current directory)"); 136 System.out.println( 137 " <depth> No of links to be followed from each link"); 138 System.out.println(" -help This screen"); 139 System.out.println(); 140 System.out.println( 141 "HTML Parser home page : http://htmlparser.sourceforge.net"); 142 System.out.println(); 143 System.out.println( 144 "Example : java -classpath htmlparser.jar com.kizna.parserapplications.Robot http://www.google.com 3"); 145 System.out.println(); 146 System.out.println( 147 "If you have any doubts, please join the HTMLParser mailing list (user/developer) from the HTML Parser home page instead of mailing any of the contributors directly. You will be surprised with the quality of open source support. "); 148 System.exit(-1); 149 } 150 String resourceLocation = ""; 151 int crawlDepth = 1; 152 if (args.length != 0) 153 resourceLocation = args[0]; 154 if (args.length == 2) 155 crawlDepth = Integer.valueOf(args[1]).intValue(); 156 157 Robot robot = new Robot(resourceLocation); 158 System.out.println("Crawling Site " + resourceLocation); 159 try 160 { 161 robot.crawl(crawlDepth); 162 } 163 catch (ParserException e) 164 { 165 e.printStackTrace(); 166 } 167 } 168 } 169 | Popular Tags |