KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > jmeter > protocol > http > parser > HTMLParser


1 // $Header: /home/cvs/jakarta-jmeter/src/protocol/http/org/apache/jmeter/protocol/http/parser/HTMLParser.java,v 1.23.2.1 2005/03/02 01:32:47 sebb Exp $
2
/*
3  * Copyright 2003-2004 The Apache Software Foundation.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17 */

18
19 package org.apache.jmeter.protocol.http.parser;
20
21 import java.io.BufferedReader JavaDoc;
22 import java.io.File JavaDoc;
23 import java.io.FileInputStream JavaDoc;
24 import java.io.FileReader JavaDoc;
25 import java.net.URL JavaDoc;
26 import java.util.ArrayList JavaDoc;
27 import java.util.Collection JavaDoc;
28 import java.util.Comparator JavaDoc;
29 import java.util.Hashtable JavaDoc;
30 import java.util.Iterator JavaDoc;
31 import java.util.List JavaDoc;
32 import java.util.Properties JavaDoc;
33 import java.util.TreeSet JavaDoc;
34 import java.util.Vector JavaDoc;
35
36 import junit.framework.TestSuite;
37
38 import org.apache.jmeter.junit.JMeterTestCase;
39 import org.apache.jmeter.util.JMeterUtils;
40 import org.apache.jorphan.logging.LoggingManager;
41 import org.apache.log.Logger;
42
43 /**
44  * HtmlParsers can parse HTML content to obtain URLs.
45  *
46  * @author <a HREF="mailto:jsalvata@apache.org">Jordi Salvat i Alabart</a>
47  * @version $Revision: 1.23.2.1 $ updated on $Date: 2005/03/02 01:32:47 $
48  */

49 public abstract class HTMLParser
50 {
51     /** Used to store the Logger (used for debug and error messages). */
52     transient private static Logger log = LoggingManager.getLoggerForClass();
53
54     // Cache of parsers - parsers must be re-usable
55
private static Hashtable JavaDoc parsers = new Hashtable JavaDoc(3);
56     
57     private final static String JavaDoc PARSER_CLASSNAME = "htmlParser.className";
58         
59     private final static String JavaDoc DEFAULT_PARSER =
60         "org.apache.jmeter.protocol.http.parser.HtmlParserHTMLParser";
61
62     /**
63      * Protected constructor to prevent instantiation except
64      * from within subclasses.
65      */

66     protected HTMLParser() {
67     }
68     
69
70     public static final HTMLParser getParser(){
71         return getParser(JMeterUtils.getPropDefault(PARSER_CLASSNAME,DEFAULT_PARSER));
72     }
73
74     public static final synchronized HTMLParser getParser(String JavaDoc htmlParserClassName){
75
76         // Is there a cached parser?
77
HTMLParser pars=(HTMLParser) parsers.get(htmlParserClassName);
78         if (pars != null){
79             log.debug("Fetched "+htmlParserClassName);
80             return pars;
81         }
82
83         try
84         {
85             Object JavaDoc clazz = Class.forName(htmlParserClassName).newInstance();
86             if (clazz instanceof HTMLParser){
87                 pars = (HTMLParser) clazz;
88             } else {
89                 throw new HTMLParseError(new ClassCastException JavaDoc(htmlParserClassName));
90             }
91         }
92         catch (InstantiationException JavaDoc e)
93         {
94             throw new HTMLParseError(e);
95         }
96         catch (IllegalAccessException JavaDoc e)
97         {
98             throw new HTMLParseError(e);
99         }
100         catch (ClassNotFoundException JavaDoc e)
101         {
102             throw new HTMLParseError(e);
103         }
104         log.info("Created "+htmlParserClassName);
105         if (pars.isReusable()){
106             parsers.put(htmlParserClassName,pars);// cache the parser
107
}
108         
109         return pars;
110     }
111
112     /**
113      * Get the URLs for all the resources that a browser would automatically
114      * download following the download of the HTML content, that is: images,
115      * stylesheets, javascript files, applets, etc...
116      * <p>
117      * URLs should not appear twice in the returned iterator.
118      * <p>
119      * Malformed URLs can be reported to the caller by having the Iterator
120      * return the corresponding RL String. Overall problems parsing the html
121      * should be reported by throwing an HTMLParseException.
122      *
123      * @param html HTML code
124      * @param baseUrl Base URL from which the HTML code was obtained
125      * @return an Iterator for the resource URLs
126      */

127     public Iterator JavaDoc getEmbeddedResourceURLs(byte[] html, URL JavaDoc baseUrl)
128         throws HTMLParseException
129         {
130             // The Set is used to ignore duplicated binary files.
131
// Using a LinkedHashSet to avoid unnecessary overhead in iterating
132
// the elements in the set later on. As a side-effect, this will keep
133
// them roughly in order, which should be a better model of browser
134
// behaviour.
135

136             Collection JavaDoc col;
137             
138             // N.B. LinkedHashSet is Java 1.4
139
if (hasLinkedHashSet){
140                 try {
141                     col = (Collection JavaDoc) Class.forName("java.util.LinkedHashSet").newInstance();
142                 } catch (Exception JavaDoc e) {
143                     throw new Error JavaDoc("Should not happen:"+e.toString());
144                 }
145             } else {
146                 col = new java.util.HashSet JavaDoc(); //TODO: improve JDK1.3 solution
147
}
148             
149             return getEmbeddedResourceURLs(html, baseUrl,new URLCollection(col));
150             
151             // An additional note on using HashSets to store URLs: I just
152
// discovered that obtaining the hashCode of a java.net.URL implies
153
// a domain-name resolution process. This means significant delays
154
// can occur, even more so if the domain name is not resolvable.
155
// Whether this can be a problem in practical situations I can't tell, but
156
// thought I'd keep a note just in case...
157
// BTW, note that using a Vector and removing duplicates via scan
158
// would not help, since URL.equals requires name resolution too.
159
// The above problem has now been addressed with the URLString and
160
// URLCollection classes.
161

162         }
163         
164         // See whether we can use LinkedHashSet or not:
165
private static final boolean hasLinkedHashSet;
166         static {
167             boolean b;
168             try
169             {
170                 Class.forName("java.util.LinkedHashSet");
171                 b = true;
172             }
173             catch (ClassNotFoundException JavaDoc e)
174             {
175                 b = false;
176             }
177             hasLinkedHashSet = b;
178         }
179         
180         
181     /**
182      * Get the URLs for all the resources that a browser would automatically
183      * download following the download of the HTML content, that is: images,
184      * stylesheets, javascript files, applets, etc...
185      * <p>
186      * All URLs should be added to the Collection.
187      * <p>
188      * Malformed URLs can be reported to the caller by having the Iterator
189      * return the corresponding RL String. Overall problems parsing the html
190      * should be reported by throwing an HTMLParseException.
191      *
192      * N.B.
193      * The Iterator returns URLs, but the Collection will contain
194      * objects of class URLString.
195      *
196      * @param html HTML code
197      * @param baseUrl Base URL from which the HTML code was obtained
198      * @param coll URLCollection
199      * @return an Iterator for the resource URLs
200      */

201     public abstract Iterator JavaDoc getEmbeddedResourceURLs(byte[] html, URL JavaDoc baseUrl,
202                                                       URLCollection coll)
203         throws HTMLParseException;
204
205
206     /**
207      * Get the URLs for all the resources that a browser would automatically
208      * download following the download of the HTML content, that is: images,
209      * stylesheets, javascript files, applets, etc...
210      *
211      * N.B.
212      * The Iterator returns URLs, but the Collection will contain
213      * objects of class URLString.
214      *
215      * @param html HTML code
216      * @param baseUrl Base URL from which the HTML code was obtained
217      * @param coll Collection - will contain URLString objects, not URLs
218      * @return an Iterator for the resource URLs
219      */

220     public Iterator JavaDoc getEmbeddedResourceURLs(byte[] html, URL JavaDoc baseUrl,
221                                                       Collection JavaDoc coll)
222         throws HTMLParseException
223         {
224             return getEmbeddedResourceURLs(html,baseUrl, new URLCollection(coll));
225         }
226
227
228     /**
229      * Parsers should over-ride this method if the parser class is re-usable,
230      * in which case the class will be cached for the next getParser() call.
231      *
232      * @return true if the Parser is reusable
233      */

234     protected boolean isReusable()
235     {
236         return false;
237     }
238
239 //////////////////////////// TEST CODE FOLLOWS /////////////////////////////
240

241
242     public static class Test extends JMeterTestCase
243     {
244         private String JavaDoc parserName;
245         private int testNumber=0;
246
247         public Test() {
248             super();
249         }
250
251         public Test(String JavaDoc name) {
252             super(name);
253         }
254
255         public Test(String JavaDoc name, int test) {
256             super(name);
257             testNumber = test;
258         }
259
260         public Test(String JavaDoc name, String JavaDoc parser, int test) {
261             super(name);
262             testNumber = test;
263             parserName = parser;
264         }
265
266
267         private class TestClass //Can't instantiate
268
{
269             private TestClass(){};
270         }
271
272         private static class TestData
273         {
274             private String JavaDoc fileName;
275             private String JavaDoc baseURL;
276             private String JavaDoc expectedSet;
277             private String JavaDoc expectedList;
278
279             private TestData(String JavaDoc f, String JavaDoc b, String JavaDoc s, String JavaDoc l){
280                 fileName = f;
281                 baseURL = b;
282                 expectedSet = s;
283                 expectedList = l;
284             }
285
286             private TestData(String JavaDoc f, String JavaDoc b, String JavaDoc s){
287                 this(f,b,s,null);
288             }
289         }
290
291         // List of parsers to test. Should probably be derived automatically
292
private static final String JavaDoc [] PARSERS = {
293             "org.apache.jmeter.protocol.http.parser.HtmlParserHTMLParser",
294             "org.apache.jmeter.protocol.http.parser.JTidyHTMLParser",
295             "org.apache.jmeter.protocol.http.parser.RegexpHTMLParser"
296         };
297         private static final TestData[] TESTS = new TestData[]{
298             new TestData(
299                          "testfiles/HTMLParserTestCase.html",
300                          "http://localhost/mydir/myfile.html",
301                          "testfiles/HTMLParserTestCase.set",
302                           "testfiles/HTMLParserTestCase.all"
303                          ),
304             new TestData(
305                          "testfiles/HTMLParserTestCaseWithBaseHRef.html",
306                          "http://localhost/mydir/myfile.html",
307                          "testfiles/HTMLParserTestCase.set",
308                           "testfiles/HTMLParserTestCase.all"
309                          ),
310             new TestData(
311                          "testfiles/HTMLParserTestCase2.html",
312                          "http:", //Dummy, as the file has no entries
313
"",
314                          ""
315                          ),
316             new TestData(
317                          "testfiles/HTMLParserTestCase3.html",
318                          "http:", //Dummy, as the file has no entries
319
"",
320                          ""
321                          ),
322             new TestData(
323                          "testfiles/HTMLParserTestCaseWithComments.html",
324                          "http://localhost/mydir/myfile.html",
325                          "testfiles/HTMLParserTestCase.set",
326                          "testfiles/HTMLParserTestCase.all"
327                          ),
328             new TestData(
329                      "testfiles/HTMLScript.html",
330                      "http://localhost/",
331                      "testfiles/HTMLScript.set",
332                      "testfiles/HTMLScript.all"
333                      ),
334              new TestData(
335                          "testfiles/HTMLParserTestFrames.html",
336                          "http://localhost/",
337                          "testfiles/HTMLParserTestFrames.all",
338                          "testfiles/HTMLParserTestFrames.all"
339                          ),
340         };
341
342         public static junit.framework.Test suite(){
343             TestSuite suite = new TestSuite();
344             suite.addTest(new Test("testDefaultParser"));
345             suite.addTest(new Test("testParserDefault"));
346             suite.addTest(new Test("testParserMissing"));
347             suite.addTest(new Test("testNotParser"));
348             suite.addTest(new Test("testNotCreatable"));
349             for (int i = 0;i<PARSERS.length;i++){
350                 TestSuite ps = new TestSuite(PARSERS[i]);// Identify the subtests
351
ps.addTest(new Test("testParserProperty",PARSERS[i],0));
352                 for (int j=0;j<TESTS.length;j++){
353                     TestSuite ts = new TestSuite(TESTS[j].fileName);
354                     ts.addTest(new Test("testParserSet",PARSERS[i],j));
355                     ts.addTest(new Test("testParserList",PARSERS[i],j));
356                     ps.addTest(ts);
357                 }
358                 suite.addTest(ps);
359             }
360             return suite;
361         }
362         
363         // Test if can instantiate parser using property name
364
public void testParserProperty() throws Exception JavaDoc
365         {
366             Properties JavaDoc p = JMeterUtils.getJMeterProperties();
367             if (p == null){
368                 p=JMeterUtils.getProperties("jmeter.properties");
369             }
370             p.setProperty(PARSER_CLASSNAME,parserName);
371             getParser();
372         }
373         
374         public void testDefaultParser() throws Exception JavaDoc {
375             getParser();
376         }
377
378         public void testParserDefault() throws Exception JavaDoc {
379             getParser(DEFAULT_PARSER);
380         }
381
382         public void testParserMissing() throws Exception JavaDoc {
383             try{
384                 getParser("no.such.parser");
385             }
386             catch (HTMLParseError e)
387             {
388                 if (e.getCause() instanceof ClassNotFoundException JavaDoc)
389                 {
390                      // This is OK
391
}
392                 else
393                 {
394                     throw e;
395                 }
396             }
397         }
398
399         public void testNotParser() throws Exception JavaDoc {
400             try{
401                 getParser("java.lang.String");
402             }
403             catch (HTMLParseError e)
404             {
405                 if (e.getCause() instanceof ClassCastException JavaDoc) return;
406                 throw e;
407             }
408         }
409
410         public void testNotCreatable() throws Exception JavaDoc {
411             try
412             {
413                 getParser(TestClass.class.getName());
414             }
415             catch (HTMLParseError e)
416             {
417                 if (e.getCause() instanceof InstantiationException JavaDoc) return;
418                 throw e;
419             }
420         }
421
422         public void testParserSet() throws Exception JavaDoc
423         {
424             HTMLParser p = getParser(parserName);
425             filetest(p,TESTS[testNumber].fileName,TESTS[testNumber].baseURL,TESTS[testNumber].expectedSet
426                     ,null,false);
427         }
428
429         public void testParserList() throws Exception JavaDoc
430         {
431             HTMLParser p = getParser(parserName);
432             filetest(p,TESTS[testNumber].fileName,TESTS[testNumber].baseURL,TESTS[testNumber].expectedList
433                     ,new Vector JavaDoc(),true);
434         }
435
436         private static void filetest(HTMLParser p,
437                                        String JavaDoc file,
438                                        String JavaDoc url,
439                                        String JavaDoc resultFile,
440                                        Collection JavaDoc c,
441                                        boolean orderMatters) //Does the order matter?
442
throws Exception JavaDoc
443         {
444             log.debug("file "+file);
445             File JavaDoc f= findTestFile(file);
446             byte[] buffer= new byte[(int)f.length()];
447             int len= new FileInputStream JavaDoc(f).read(buffer);
448             assertEquals(len, buffer.length);
449             Iterator JavaDoc result;
450             if (c == null) {
451                 result = p.getEmbeddedResourceURLs(buffer,new URL JavaDoc(url));
452             } else {
453                 result = p.getEmbeddedResourceURLs(buffer,new URL JavaDoc(url),c);
454             }
455             /*
456              * TODO:
457              * Exact ordering is only required for some tests;
458              * change the comparison to do a set compare where
459              * necessary.
460              */

461             Iterator JavaDoc expected;
462             if (orderMatters) {
463                 expected= getFile(resultFile).iterator();
464             } else {
465                 // Convert both to Sets
466
expected = new TreeSet JavaDoc(getFile(resultFile)).iterator();
467                 TreeSet JavaDoc temp = new TreeSet JavaDoc(new Comparator JavaDoc(){
468                     public int compare(Object JavaDoc o1, Object JavaDoc o2)
469                     {
470                         return (o1.toString().compareTo(o2.toString()));
471                     }});
472                 while (result.hasNext()){
473                     temp.add(result.next());
474                 }
475                 result=temp.iterator();
476             }
477             
478             while (expected.hasNext()) {
479                 assertTrue("Expecting another result",result.hasNext());
480                 try
481                 {
482                     assertEquals(expected.next(),((URL JavaDoc) result.next()).toString());
483                 }
484                 catch (ClassCastException JavaDoc e)
485                 {
486                     fail("Expected URL, but got "+e.toString());
487                 }
488             }
489             assertFalse("Should have reached the end of the results",result.hasNext());
490         }
491
492         // Get expected results as a List
493
private static List JavaDoc getFile(String JavaDoc file)
494             throws Exception JavaDoc
495         {
496             ArrayList JavaDoc al = new ArrayList JavaDoc();
497             if (file != null && file.length() > 0){
498               BufferedReader JavaDoc br =
499                 new BufferedReader JavaDoc(
500                     new FileReader JavaDoc(findTestFile(file)));
501               String JavaDoc line = br.readLine();
502               while (line != null){
503                 al.add(line);
504                 line = br.readLine();
505               }
506               br.close();
507             }
508             return al;
509         }
510     }
511 }
Popular Tags