Robot


1   // $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/parserapplications/Robot.java,v 1.2 2004/02/10 13:41:07 woolfel Exp $
2   /*
3    * ====================================================================
4    * Copyright 2002-2004 The Apache Software Foundation.
5    *
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *   http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   * 
18   */
19  
20  // The developers of JMeter and Apache are grateful to the developers
21  // of HTMLParser for giving Apache Software Foundation a non-exclusive
22  // license. The performance benefits of HTMLParser are clear and the
23  // users of JMeter will benefit from the hard work of the HTMLParser
24  // team. For detailed information about HTMLParser, the project is
25  // hosted on sourceforge at http://htmlparser.sourceforge.net/.
26  //
27  // HTMLParser was originally created by Somik Raha in 2000. Since then
28  // a healthy community of users has formed and helped refine the
29  // design so that it is able to tackle the difficult task of parsing
30  // dirty HTML. Derrick Oswald is the current lead developer and was kind
31  // enough to assist JMeter.
32  
33  package org.htmlparser.parserapplications;
34  import org.htmlparser.Node;
35  import org.htmlparser.Parser;
36  import org.htmlparser.tags.LinkTag;
37  import org.htmlparser.util.DefaultParserFeedback;
38  import org.htmlparser.util.NodeIterator;
39  import org.htmlparser.util.ParserException;
40  /**
41   * The Robot Crawler application will crawl through urls recursively, based on a depth value.
42   */
43  public class Robot
44  {
45      private org.htmlparser.Parser parser;
46      /**
47       * Robot crawler - Provide the starting url 
48       */
49      public Robot(String   resourceLocation)
50      {
51          try
52          {
53              parser = new Parser(resourceLocation, new DefaultParserFeedback());
54              parser.registerScanners();
55          }
56          catch (ParserException e)
57          {
58              System.err.println("Error, could not create parser object");
59              e.printStackTrace();
60          }
61      }
62      /**
63       * Crawl using a given crawl depth.
64       * @param crawlDepth Depth of crawling
65       */
66      public void crawl(int crawlDepth) throws ParserException
67      {
68          try
69          {
70              crawl(parser, crawlDepth);
71          }
72          catch (ParserException e)
73          {
74              throw new ParserException(
75                  "HTMLParserException at crawl(" + crawlDepth + ")",
76                  e);
77          }
78      }
79      /**
80       * Crawl using a given parser object, and a given crawl depth.
81       * @param parser Parser object
82       * @param crawlDepth Depth of crawling
83       */
84      public void crawl(Parser parser, int crawlDepth) throws ParserException
85      {
86          System.out.println(" crawlDepth = " + crawlDepth);
87          for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
88          {
89              Node node = e.nextNode();
90              if (node instanceof LinkTag)
91              {
92                  LinkTag linkTag = (LinkTag) node;
93                  {
94                      if (!linkTag.isMailLink())
95                      {
96                          if (linkTag.getLink().toUpperCase().indexOf("HTM")
97                              != -1
98                              || linkTag.getLink().toUpperCase().indexOf("COM")
99                                  != -1
100                             || linkTag.getLink().toUpperCase().indexOf("ORG")
101                                 != -1)
102                         {
103                             if (crawlDepth > 0)
104                             {
105                                 Parser newParser =
106                                     new Parser(
107                                         linkTag.getLink(),
108                                         new DefaultParserFeedback());
109                                 newParser.registerScanners();
110                                 System.out.print(
111                                     "Crawling to " + linkTag.getLink());
112                                 crawl(newParser, crawlDepth - 1);
113                             }
114                             else
115                                 System.out.println(linkTag.getLink());
116                         }
117                     }
118                 }
119             }
120         }
121     }
122 
123     public static void main(String  [] args)
124     {
125         System.out.println("Robot Crawler v" + Parser.getVersion());
126         if (args.length < 2 || args[0].equals("-help"))
127         {
128             System.out.println();
129             System.out.println(
130                 "Syntax : java -classpath htmlparser.jar org.htmlparser.parserapplications.Robot <resourceLocn/website> <depth>");
131             System.out.println();
132             System.out.println(
133                 "   <resourceLocn> the name of the file to be parsed (with complete path ");
134             System.out.println(
135                 "                  if not in current directory)");
136             System.out.println(
137                 "   <depth> No of links to be followed from each link");
138             System.out.println("   -help This screen");
139             System.out.println();
140             System.out.println(
141                 "HTML Parser home page : http://htmlparser.sourceforge.net");
142             System.out.println();
143             System.out.println(
144                 "Example : java -classpath htmlparser.jar com.kizna.parserapplications.Robot http://www.google.com 3");
145             System.out.println();
146             System.out.println(
147                 "If you have any doubts, please join the HTMLParser mailing list (user/developer) from the HTML Parser home page instead of mailing any of the contributors directly. You will be surprised with the quality of open source support. ");
148             System.exit(-1);
149         }
150         String   resourceLocation = "";
151         int crawlDepth = 1;
152         if (args.length != 0)
153             resourceLocation = args[0];
154         if (args.length == 2)
155             crawlDepth = Integer.valueOf(args[1]).intValue();
156 
157         Robot robot = new Robot(resourceLocation);
158         System.out.println("Crawling Site " + resourceLocation);
159         try
160         {
161             robot.crawl(crawlDepth);
162         }
163         catch (ParserException e)
164         {
165             e.printStackTrace();
166         }
167     }
168 }
169
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags