KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > weblech > spider > HTMLParser


1 /*
2  * This is the MIT license, see also http://www.opensource.org/licenses/mit-license.html
3  *
4  * Copyright (c) 2001 Brian Pitcher
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */

24
25 // $Header: /cvsroot/weblech/weblech/src/weblech/spider/HTMLParser.java,v 1.3 2002/06/09 11:02:36 weblech Exp $
26

27 package weblech.spider;
28
29 import org.apache.log4j.Category;
30
31 import java.util.List JavaDoc;
32 import java.util.ArrayList JavaDoc;
33 import java.util.HashSet JavaDoc;
34 import java.util.Set JavaDoc;
35 import java.net.URL JavaDoc;
36 import java.net.MalformedURLException JavaDoc;
37 import java.io.ByteArrayInputStream JavaDoc;
38 import java.io.IOException JavaDoc;
39 import java.io.FileWriter JavaDoc;
40 import java.io.PrintWriter JavaDoc;
41
42 import weblech.util.Log4j;
43
44 public class HTMLParser
45 {
46     private final static Category _logClass = Category.getInstance(URLObject.class);
47
48     private SpiderConfig config;
49
50     static
51     {
52         Log4j.init();
53     }
54
55     public HTMLParser(SpiderConfig config)
56     {
57         this.config = config;
58     }
59
60     public List JavaDoc parseLinksInDocument(URL JavaDoc sourceURL, String JavaDoc textContent)
61     {
62         return parseAsHTML(sourceURL, textContent);
63     }
64
65     private List JavaDoc parseAsHTML(URL JavaDoc sourceURL, String JavaDoc textContent)
66     {
67         _logClass.debug("parseAsHTML()");
68         ArrayList JavaDoc newURLs = new ArrayList JavaDoc();
69         HashSet JavaDoc newURLSet = new HashSet JavaDoc();
70
71         extractAttributesFromTags("img", "src", sourceURL, newURLs, newURLSet, textContent);
72         extractAttributesFromTags("a", "href", sourceURL, newURLs, newURLSet, textContent);
73         extractAttributesFromTags("body", "background", sourceURL, newURLs, newURLSet, textContent);
74         extractAttributesFromTags("frame", "src", sourceURL, newURLs, newURLSet, textContent);
75         extractAttributesFromTags("IMG", "SRC", sourceURL, newURLs, newURLSet, textContent);
76         extractAttributesFromTags("A", "HREF", sourceURL, newURLs, newURLSet, textContent);
77         extractAttributesFromTags("BODY", "BACKGROUND", sourceURL, newURLs, newURLSet, textContent);
78         extractAttributesFromTags("FRAME", "SRC", sourceURL, newURLs, newURLSet, textContent);
79
80         if(newURLs.size() == 0)
81         {
82             _logClass.debug("Got 0 new URLs from HTML parse, check HTML\n" + textContent);
83         }
84         _logClass.debug("Returning " + newURLs.size() + " urls extracted from page");
85         return newURLs;
86     }
87
88     private void extractAttributesFromTags(String JavaDoc tag, String JavaDoc attr, URL JavaDoc sourceURL, List JavaDoc newURLs, Set JavaDoc newURLSet, String JavaDoc input)
89     {
90         _logClass.debug("extractAttributesFromTags(" + tag + ", " + attr + ", ...)");
91
92         int startPos = 0;
93         String JavaDoc startTag = "<" + tag + " ";
94         String JavaDoc attrStr = attr + "=\"";
95         while(true)
96         {
97             int tagPos = input.indexOf(startTag, startPos);
98             if(tagPos < 0)
99             {
100                 return;
101             }
102             int attrPos = input.indexOf(attrStr, tagPos + 1);
103             if(attrPos < 0)
104             {
105                 startPos = tagPos + 1;
106                 continue;
107             }
108             int nextClosePos = input.indexOf(">", tagPos + 1);
109             if(attrPos < nextClosePos)
110             {
111                 // Ooh, found one
112
int closeQuotePos = input.indexOf("\"", attrPos + attrStr.length() + 1);
113                 if(closeQuotePos > 0)
114                 {
115                     String JavaDoc urlStr = input.substring(attrPos + attrStr.length(), closeQuotePos);
116                     if(urlStr.indexOf('#') != -1)
117                     {
118                         urlStr = urlStr.substring(0, urlStr.indexOf('#'));
119                     }
120                     //_logClass.debug("Found possible URL string: " + URL);
121

122                     if(isMailTo(urlStr))
123                     {
124                         logMailURL(urlStr);
125                     }
126                     else
127                     {
128                         try
129                         {
130
131                             URL JavaDoc u = new URL JavaDoc(sourceURL, urlStr);
132                             if(newURLSet.contains(u))
133                             {
134                                 //_logClass.debug("Already found URL on page: " + u);
135
}
136                             else
137                             {
138                                 newURLs.add(u);
139                                 newURLSet.add(u);
140                                 //_logClass.debug("Found new URL on page: " + u);
141
}
142                         }
143                         catch(MalformedURLException JavaDoc murle)
144                         {
145                         }
146                     }
147                 }
148                 startPos = tagPos + 1;
149                 continue;
150             }
151             else
152             {
153                 startPos = tagPos + 1;
154                 continue;
155             }
156         }
157     }
158
159     private void logMailURL(String JavaDoc url)
160     {
161         _logClass.debug("logMailURL()");
162
163         try
164         {
165             FileWriter JavaDoc appendedFile = new FileWriter JavaDoc(config.getMailtoLogFile().toString(), true);
166             PrintWriter JavaDoc pW = new PrintWriter JavaDoc(appendedFile);
167             pW.println(url);
168             pW.flush();
169             pW.close();
170         }
171         catch(IOException JavaDoc ioe)
172         {
173             _logClass.warn("Caught IO exception writing mailto URL:" + ioe.getMessage(), ioe);
174         }
175     }
176
177     /**
178      * Check if a particular URL looks like it's a mailto: style link.
179      */

180     private boolean isMailTo(String JavaDoc url)
181     {
182         if(url == null)
183         {
184             return false;
185         }
186
187         url = url.toUpperCase();
188         return (url.indexOf("MAILTO:") != -1);
189     }
190 }
191
Popular Tags