KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > jmeter > protocol > http > parser > HtmlParserHTMLParser


1 // $Header: /home/cvs/jakarta-jmeter/src/protocol/http/org/apache/jmeter/protocol/http/parser/HtmlParserHTMLParser.java,v 1.14.2.1 2005/03/02 01:34:14 sebb Exp $
2
/*
3  * Copyright 2003-2004 The Apache Software Foundation.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17 */

18
19 package org.apache.jmeter.protocol.http.parser;
20
21 import java.io.StringReader JavaDoc;
22 import java.net.MalformedURLException JavaDoc;
23 import java.net.URL JavaDoc;
24 import java.util.Iterator JavaDoc;
25
26 import org.apache.jorphan.logging.LoggingManager;
27 import org.apache.log.Logger;
28
29 import org.htmlparser.Node;
30 import org.htmlparser.NodeReader;
31 import org.htmlparser.Parser;
32 import org.htmlparser.scanners.AppletScanner;
33 import org.htmlparser.scanners.BaseHrefScanner;
34 import org.htmlparser.scanners.BodyScanner;
35 import org.htmlparser.scanners.FrameScanner;
36 import org.htmlparser.scanners.InputTagScanner;
37 import org.htmlparser.scanners.LinkScanner;
38 import org.htmlparser.scanners.LinkTagScanner;
39 import org.htmlparser.scanners.ScriptScanner;
40 import org.htmlparser.tags.AppletTag;
41 import org.htmlparser.tags.BaseHrefTag;
42 import org.htmlparser.tags.BodyTag;
43 import org.htmlparser.tags.FrameTag;
44 import org.htmlparser.tags.ImageTag;
45 import org.htmlparser.tags.InputTag;
46 import org.htmlparser.tags.LinkTag;
47 import org.htmlparser.tags.LinkTagTag;
48 import org.htmlparser.tags.ScriptTag;
49 import org.htmlparser.util.DefaultParserFeedback;
50 import org.htmlparser.util.NodeIterator;
51 import org.htmlparser.util.ParserException;
52
53 /**
54  * HtmlParser implementation using SourceForge's HtmlParser.
55  *
56  * @version $Revision: 1.14.2.1 $ updated on $Date: 2005/03/02 01:34:14 $
57  */

58 class HtmlParserHTMLParser extends HTMLParser
59 {
60     /** Used to store the Logger (used for debug and error messages). */
61     transient private static Logger log= LoggingManager.getLoggerForClass();
62
63     protected HtmlParserHTMLParser(){
64         super();
65     }
66
67     protected boolean isReusable()
68     {
69         return true;
70     }
71
72     /* (non-Javadoc)
73      * @see org.apache.jmeter.protocol.http.parser.HtmlParser#getEmbeddedResourceURLs(byte[], java.net.URL)
74      */

75     public Iterator JavaDoc getEmbeddedResourceURLs(byte[] html, URL JavaDoc baseUrl, URLCollection urls)
76         throws HTMLParseException
77     {
78         Parser htmlParser= null;
79         try
80         {
81             String JavaDoc contents= new String JavaDoc(html);
82             StringReader JavaDoc reader= new StringReader JavaDoc(contents);
83             NodeReader nreader= new NodeReader(reader, contents.length());
84             htmlParser= new Parser(nreader, new DefaultParserFeedback());
85             addTagListeners(htmlParser);
86         }
87         catch (Exception JavaDoc e)
88         {
89             throw new HTMLParseException(e);
90         }
91
92         // Now parse the DOM tree
93

94         // look for applets
95

96         // This will only work with an Applet .class file.
97
// Ideally, this should be upgraded to work with Objects (IE)
98
// and archives (.jar and .zip) files as well.
99

100         try
101         {
102             // we start to iterate through the elements
103
for (NodeIterator e= htmlParser.elements(); e.hasMoreNodes();)
104             {
105                 Node node= e.nextNode();
106                 String JavaDoc binUrlStr= null;
107
108                 // first we check to see if body tag has a
109
// background set and we set the NodeIterator
110
// to the child elements inside the body
111
if (node instanceof BodyTag)
112                 {
113                     BodyTag body= (BodyTag)node;
114                     binUrlStr= body.getAttribute("background");
115                     // if the body tag exists, we get the elements
116
// within the body tag. if we don't we won't
117
// see the body of the page. The only catch
118
// with this is if there are images after the
119
// closing body tag, it won't get parsed. If
120
// someone puts it outside the body tag, it
121
// is probably a mistake. Plus it's bad to
122
// have important content after the closing
123
// body tag. Peter Lin 10-9-03
124
e= body.elements();
125                 }
126                 else if (node instanceof BaseHrefTag)
127                 {
128                     BaseHrefTag baseHref= (BaseHrefTag)node;
129                     try
130                     {
131                         baseUrl= new URL JavaDoc(baseUrl, baseHref.getBaseUrl()+"/");
132                     }
133                     catch (MalformedURLException JavaDoc e1)
134                     {
135                         throw new HTMLParseException(e1);
136                     }
137                 }
138                 else if (node instanceof ImageTag)
139                 {
140                     ImageTag image= (ImageTag)node;
141                     binUrlStr= image.getImageURL();
142                 }
143                 else if (node instanceof AppletTag)
144                 {
145                     AppletTag applet= (AppletTag)node;
146                     binUrlStr= applet.getAppletClass();
147                 }
148                 else if (node instanceof InputTag)
149                 {
150                     InputTag input= (InputTag)node;
151                     // we check the input tag type for image
152
String JavaDoc strType= input.getAttribute("type");
153                     if (strType != null && strType.equalsIgnoreCase("image"))
154                     {
155                         // then we need to download the binary
156
binUrlStr= input.getAttribute("src");
157                     }
158                 } else if (node instanceof LinkTag){
159                     LinkTag link = (LinkTag)node;
160                     if (link.getChild(0) instanceof ImageTag){
161                         ImageTag img = (ImageTag)link.getChild(0);
162                         binUrlStr = img.getImageURL();
163                     }
164                 } else if (node instanceof ScriptTag){
165                     ScriptTag script = (ScriptTag)node;
166                     binUrlStr = script.getAttribute("src");
167                 } else if (node instanceof FrameTag){
168                     FrameTag tag = (FrameTag)node;
169                     binUrlStr = tag.getAttribute("src");
170                 } else if (node instanceof LinkTagTag){
171                     LinkTagTag script = (LinkTagTag)node;
172                     if (script.getAttribute("rel").equalsIgnoreCase("stylesheet")){
173                         binUrlStr = script.getAttribute("href");
174                     }
175                 }
176                 
177                 if (binUrlStr == null)
178                 {
179                     continue;
180                 }
181
182                 urls.addURL(binUrlStr,baseUrl);
183             }
184             log.debug("End : parseNodes");
185         }
186         catch (ParserException e)
187         {
188             throw new HTMLParseException(e);
189         }
190
191         return urls.iterator();
192     }
193
194     /**
195      * Returns a node representing a whole xml given an xml document.
196      *
197      * @param text an xml document
198      * @return a node representing a whole xml
199      *
200      * @throws SAXException indicates an error parsing the xml document
201      */

202     private static void addTagListeners(Parser parser)
203     {
204         log.debug("Start : addTagListeners");
205         // add body tag scanner
206
parser.addScanner(new BodyScanner());
207         // add BaseHRefTag scanner
208
parser.addScanner(new BaseHrefScanner());
209         // add ImageTag and BaseHrefTag scanners
210
LinkScanner linkScanner= new LinkScanner(LinkTag.LINK_TAG_FILTER);
211         // parser.addScanner(linkScanner);
212
parser.addScanner(
213             linkScanner.createImageScanner(ImageTag.IMAGE_TAG_FILTER));
214         parser.addScanner(
215             linkScanner.createBaseHREFScanner("-b"));
216                             // Taken from org.htmlparser.Parser
217
// add input tag scanner
218
parser.addScanner(new InputTagScanner());
219         // add applet tag scanner
220
parser.addScanner(new AppletScanner());
221         parser.addScanner(new ScriptScanner());
222         parser.addScanner(new LinkTagScanner());
223         parser.addScanner(new FrameScanner());
224     }
225 }
226
Popular Tags