KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > jmeter > protocol > http > parser > RegexpHTMLParser


1 // $Header: /home/cvs/jakarta-jmeter/src/protocol/http/org/apache/jmeter/protocol/http/parser/RegexpHTMLParser.java,v 1.17.2.1 2005/03/02 11:08:18 jsalvata Exp $
2
/*
3  * Copyright 2003-2004 The Apache Software Foundation.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17 */

18
19 package org.apache.jmeter.protocol.http.parser;
20
21 import java.net.MalformedURLException JavaDoc;
22 import java.net.URL JavaDoc;
23 import java.util.Iterator JavaDoc;
24
25 import org.apache.jorphan.logging.LoggingManager;
26 import org.apache.log.Logger;
27
28 // NOTE: Also looked at using Java 1.4 regexp instead of ORO. The change was
29 // trivial. Performance did not improve -- at least not significantly.
30 // Finally decided for ORO following advice from Stefan Bodewig (message
31 // to jmeter-dev dated 25 Nov 2003 8:52 CET) [Jordi]
32
import org.apache.oro.text.regex.MatchResult;
33 import org.apache.oro.text.regex.Pattern;
34 import org.apache.oro.text.regex.PatternMatcherInput;
35 import org.apache.oro.text.regex.Perl5Compiler;
36 import org.apache.oro.text.regex.Perl5Matcher;
37 import org.apache.oro.text.regex.MalformedPatternException;
38
39 /**
40  * HtmlParser implementation using regular expressions.
41  * <p>
42  * This class will find RLs specified in the following ways (where
43  * <b>url</b> represents the RL being found:
44  * <ul>
45  * <li>&lt;img SRC=<b>url</b> ... &gt;
46  * <li>&lt;script SRC=<b>url</b> ... &gt;
47  * <li>&lt;applet code=<b>url</b> ... &gt;
48  * <li>&lt;input type=image SRC=<b>url</b> ... &gt;
49  * <li>&lt;body background=<b>url</b> ... &gt;
50  * <li>&lt;table background=<b>url</b> ... &gt;
51  * <li>&lt;td background=<b>url</b> ... &gt;
52  * <li>&lt;tr background=<b>url</b> ... &gt;
53  * <li>&lt;applet ... codebase=<b>url</b> ... &gt;
54  * <li>&lt;embed SRC=<b>url</b> ... &gt;
55  * <li>&lt;embed codebase=<b>url</b> ... &gt;
56  * <li>&lt;object codebase=<b>url</b> ... &gt;
57  * <li>&lt;link rel=stylesheet HREF=<b>url</b>... gt;
58  * </ul>
59  *
60  * <p>
61  * This class will take into account the following construct:
62  * <ul>
63  * <li>&lt;base HREF=<b>url</b>&gt;
64  * </ul>
65  *
66  * <p>
67  * But not the following:
68  * <ul>
69  * <li>&lt; ... codebase=<b>url</b> ... &gt;
70  * </ul>
71  *
72  * @author <a HREF="mailto:jsalvata@apache.org">Jordi Salvat i Alabart</a>
73  * @version $Revision: 1.17.2.1 $ updated on $Date: 2005/03/02 11:08:18 $
74  */

75 class RegexpHTMLParser extends HTMLParser
76 {
77
78     /**
79      * Regexp fragment matching a tag attribute's value (including
80      * the equals sign and any spaces before it). Note it matches
81      * unquoted values, which to my understanding, are not conformant
82      * to any of the HTML specifications, but are still quite common
83      * in the web and all browsers seem to understand them.
84      */

85     private static final String JavaDoc VALUE=
86         "\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\"'\\s>\\\\][^\\s>]*)(?=[\\s>]))";
87             // Note there's 3 capturing groups per value
88

89     /**
90      * Regexp fragment matching the separation between two tag attributes.
91      */

92     private static final String JavaDoc SEP=
93         "\\s(?:[^>]*\\s)?";
94
95     /**
96      * Regular expression used against the HTML code to find the URIs of
97      * images, etc.:
98      */

99     private static final String JavaDoc REGEXP=
100         "<(?:"
101             + "!--.*?-->"
102             + "|BASE"+SEP+"HREF"+VALUE
103             + "|(?:IMG|SCRIPT|FRAME|IFRAME)"+SEP+"SRC"+VALUE
104             + "|APPLET"+SEP+"CODE(?:BASE)?"+VALUE
105             + "|(?:EMBED|OBJECT)"+SEP+"(?:SRC|CODEBASE)"+VALUE
106             + "|(?:BODY|TABLE|TR|TD)"+SEP+"BACKGROUND"+VALUE
107             + "|INPUT(?:"+SEP+"(?:SRC"+VALUE+"|TYPE\\s*=\\s*(?:\"image\"|'image'|image(?=[\\s>])))){2,}"
108             + "|LINK(?:"+SEP+"(?:HREF"+VALUE+"|REL\\s*=\\s*(?:\"stylesheet\"|'stylesheet'|stylesheet(?=[\\s>])))){2,}"
109             + ")";
110
111     // Number of capturing groups possibly containing Base HREFs:
112
private static final int NUM_BASE_GROUPS= 3;
113
114     /**
115      * Compiled regular expression.
116      */

117     static Pattern pattern;
118
119     /**
120      * Thread-local matcher:
121      */

122     private static ThreadLocal JavaDoc localMatcher= new ThreadLocal JavaDoc()
123     {
124         protected Object JavaDoc initialValue()
125         {
126             return new Perl5Matcher();
127         }
128     };
129
130     /**
131      * Thread-local input:
132      */

133     private static ThreadLocal JavaDoc localInput= new ThreadLocal JavaDoc()
134     {
135         protected Object JavaDoc initialValue()
136         {
137             return new PatternMatcherInput(new char[0]);
138         }
139     };
140
141     /** Used to store the Logger (used for debug and error messages). */
142     transient private static Logger log;
143
144     protected boolean isReusable()
145     {
146         return true;
147     }
148
149     /**
150      * Make sure to compile the regular expression upon instantiation:
151      */

152     protected RegexpHTMLParser() {
153         super();
154
155         // Define this here to ensure it's ready to report any trouble
156
// with the regexp:
157
log= LoggingManager.getLoggerForClass();
158         
159         // Compile the regular expression:
160
try
161         {
162             Perl5Compiler c= new Perl5Compiler();
163             pattern=
164                 c.compile(
165                     REGEXP,
166                     Perl5Compiler.CASE_INSENSITIVE_MASK
167                         | Perl5Compiler.SINGLELINE_MASK
168                         | Perl5Compiler.READ_ONLY_MASK);
169         }
170         catch (MalformedPatternException mpe)
171         {
172             log.error(
173                 "Internal error compiling regular expression in ParseRegexp.");
174             log.error("MalformedPatternException - " + mpe);
175             throw new Error JavaDoc(mpe.toString());//JDK1.4: remove .toString()
176
}
177     }
178
179     /* (non-Javadoc)
180      * @see org.apache.jmeter.protocol.http.parser.HtmlParser#getEmbeddedResourceURLs(byte[], java.net.URL)
181      */

182     public Iterator JavaDoc getEmbeddedResourceURLs(byte[] html, URL JavaDoc baseUrl, URLCollection urls)
183     {
184
185         Perl5Matcher matcher= (Perl5Matcher)localMatcher.get();
186         PatternMatcherInput input= (PatternMatcherInput)localInput.get();
187         // TODO: find a way to avoid the cost of creating a String here --
188
// probably a new PatternMatcherInput working on a byte[] would do
189
// better.
190
input.setInput(new String JavaDoc(html));
191         while (matcher.contains(input, pattern))
192         {
193             MatchResult match= matcher.getMatch();
194             String JavaDoc s;
195             if (log.isDebugEnabled())
196                 log.debug("match groups " + match.groups());
197             // Check for a BASE HREF:
198
for (int g=1; g <= NUM_BASE_GROUPS && g <= match.groups(); g++)
199             {
200                 s= match.group(g);
201                 if (s != null)
202                 {
203                     if (log.isDebugEnabled())
204                     {
205                         log.debug("new baseUrl: " + s + " - " + baseUrl.toString());
206                     }
207                     try
208                     {
209                         baseUrl= new URL JavaDoc(baseUrl, s);
210                     }
211                     catch (MalformedURLException JavaDoc e)
212                     {
213                         // Doesn't even look like a URL?
214
// Maybe it isn't: Ignore the exception.
215
if (log.isDebugEnabled())
216                         {
217                             log.debug(
218                                 "Can't build base URL from RL "
219                                     + s
220                                     + " in page "
221                                     + baseUrl,
222                                 e);
223                         }
224                     }
225                 }
226             }
227             for (int g= NUM_BASE_GROUPS+1; g <= match.groups(); g++)
228             {
229                 s= match.group(g);
230                 if (log.isDebugEnabled())
231                 {
232                     log.debug("group " + g + " - " + match.group(g));
233                 }
234                 if (s != null)
235                 {
236                         urls.addURL(s,baseUrl);
237                 }
238             }
239         }
240         return urls.iterator();
241     }
242 }
243
Popular Tags