KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > htmlparser > parserapplications > Robot


1 // $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/parserapplications/Robot.java,v 1.2 2004/02/10 13:41:07 woolfel Exp $
2
/*
3  * ====================================================================
4  * Copyright 2002-2004 The Apache Software Foundation.
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  *
18  */

19
20 // The developers of JMeter and Apache are grateful to the developers
21
// of HTMLParser for giving Apache Software Foundation a non-exclusive
22
// license. The performance benefits of HTMLParser are clear and the
23
// users of JMeter will benefit from the hard work of the HTMLParser
24
// team. For detailed information about HTMLParser, the project is
25
// hosted on sourceforge at http://htmlparser.sourceforge.net/.
26
//
27
// HTMLParser was originally created by Somik Raha in 2000. Since then
28
// a healthy community of users has formed and helped refine the
29
// design so that it is able to tackle the difficult task of parsing
30
// dirty HTML. Derrick Oswald is the current lead developer and was kind
31
// enough to assist JMeter.
32

33 package org.htmlparser.parserapplications;
34 import org.htmlparser.Node;
35 import org.htmlparser.Parser;
36 import org.htmlparser.tags.LinkTag;
37 import org.htmlparser.util.DefaultParserFeedback;
38 import org.htmlparser.util.NodeIterator;
39 import org.htmlparser.util.ParserException;
40 /**
41  * The Robot Crawler application will crawl through urls recursively, based on a depth value.
42  */

43 public class Robot
44 {
45     private org.htmlparser.Parser parser;
46     /**
47      * Robot crawler - Provide the starting url
48      */

49     public Robot(String JavaDoc resourceLocation)
50     {
51         try
52         {
53             parser = new Parser(resourceLocation, new DefaultParserFeedback());
54             parser.registerScanners();
55         }
56         catch (ParserException e)
57         {
58             System.err.println("Error, could not create parser object");
59             e.printStackTrace();
60         }
61     }
62     /**
63      * Crawl using a given crawl depth.
64      * @param crawlDepth Depth of crawling
65      */

66     public void crawl(int crawlDepth) throws ParserException
67     {
68         try
69         {
70             crawl(parser, crawlDepth);
71         }
72         catch (ParserException e)
73         {
74             throw new ParserException(
75                 "HTMLParserException at crawl(" + crawlDepth + ")",
76                 e);
77         }
78     }
79     /**
80      * Crawl using a given parser object, and a given crawl depth.
81      * @param parser Parser object
82      * @param crawlDepth Depth of crawling
83      */

84     public void crawl(Parser parser, int crawlDepth) throws ParserException
85     {
86         System.out.println(" crawlDepth = " + crawlDepth);
87         for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
88         {
89             Node node = e.nextNode();
90             if (node instanceof LinkTag)
91             {
92                 LinkTag linkTag = (LinkTag) node;
93                 {
94                     if (!linkTag.isMailLink())
95                     {
96                         if (linkTag.getLink().toUpperCase().indexOf("HTM")
97                             != -1
98                             || linkTag.getLink().toUpperCase().indexOf("COM")
99                                 != -1
100                             || linkTag.getLink().toUpperCase().indexOf("ORG")
101                                 != -1)
102                         {
103                             if (crawlDepth > 0)
104                             {
105                                 Parser newParser =
106                                     new Parser(
107                                         linkTag.getLink(),
108                                         new DefaultParserFeedback());
109                                 newParser.registerScanners();
110                                 System.out.print(
111                                     "Crawling to " + linkTag.getLink());
112                                 crawl(newParser, crawlDepth - 1);
113                             }
114                             else
115                                 System.out.println(linkTag.getLink());
116                         }
117                     }
118                 }
119             }
120         }
121     }
122
123     public static void main(String JavaDoc[] args)
124     {
125         System.out.println("Robot Crawler v" + Parser.getVersion());
126         if (args.length < 2 || args[0].equals("-help"))
127         {
128             System.out.println();
129             System.out.println(
130                 "Syntax : java -classpath htmlparser.jar org.htmlparser.parserapplications.Robot <resourceLocn/website> <depth>");
131             System.out.println();
132             System.out.println(
133                 " <resourceLocn> the name of the file to be parsed (with complete path ");
134             System.out.println(
135                 " if not in current directory)");
136             System.out.println(
137                 " <depth> No of links to be followed from each link");
138             System.out.println(" -help This screen");
139             System.out.println();
140             System.out.println(
141                 "HTML Parser home page : http://htmlparser.sourceforge.net");
142             System.out.println();
143             System.out.println(
144                 "Example : java -classpath htmlparser.jar com.kizna.parserapplications.Robot http://www.google.com 3");
145             System.out.println();
146             System.out.println(
147                 "If you have any doubts, please join the HTMLParser mailing list (user/developer) from the HTML Parser home page instead of mailing any of the contributors directly. You will be surprised with the quality of open source support. ");
148             System.exit(-1);
149         }
150         String JavaDoc resourceLocation = "";
151         int crawlDepth = 1;
152         if (args.length != 0)
153             resourceLocation = args[0];
154         if (args.length == 2)
155             crawlDepth = Integer.valueOf(args[1]).intValue();
156
157         Robot robot = new Robot(resourceLocation);
158         System.out.println("Crawling Site " + resourceLocation);
159         try
160         {
161             robot.crawl(crawlDepth);
162         }
163         catch (ParserException e)
164         {
165             e.printStackTrace();
166         }
167     }
168 }
169
Popular Tags