KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > extractor > ExtractorHTMLTest


1 /* ExtractorHTMLTest
2  *
3  * Created on May 19, 2004
4  *
5  * Copyright (C) 2004 Internet Archive.
6  *
7  * This file is part of the Heritrix web crawler (crawler.archive.org).
8  *
9  * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22  */

23 package org.archive.crawler.extractor;
24
25 import java.io.File JavaDoc;
26 import java.io.FileOutputStream JavaDoc;
27 import java.io.IOException JavaDoc;
28 import java.net.URL JavaDoc;
29 import java.util.Collection JavaDoc;
30 import java.util.Iterator JavaDoc;
31
32 import javax.management.AttributeNotFoundException JavaDoc;
33 import javax.management.InvalidAttributeValueException JavaDoc;
34 import javax.management.MBeanException JavaDoc;
35 import javax.management.ReflectionException JavaDoc;
36
37 import org.apache.commons.collections.CollectionUtils;
38 import org.apache.commons.collections.Predicate;
39 import org.apache.commons.httpclient.URIException;
40 import org.archive.crawler.datamodel.CoreAttributeConstants;
41 import org.archive.crawler.datamodel.CrawlOrder;
42 import org.archive.crawler.datamodel.CrawlURI;
43 import org.archive.crawler.settings.MapType;
44 import org.archive.crawler.settings.SettingsHandler;
45 import org.archive.crawler.settings.XMLSettingsHandler;
46 import org.archive.net.UURI;
47 import org.archive.net.UURIFactory;
48 import org.archive.util.HttpRecorder;
49 import org.archive.util.TmpDirTestCase;
50
51
52 /**
53  * Test html extractor.
54  *
55  * @author stack
56  * @version $Revision: 1.20 $, $Date: 2005/09/22 23:03:11 $
57  */

58 public class ExtractorHTMLTest
59 extends TmpDirTestCase
60 implements CoreAttributeConstants {
61     private final String JavaDoc ARCHIVE_DOT_ORG = "archive.org";
62     private final String JavaDoc LINK_TO_FIND = "http://www.hewlett.org/";
63     private HttpRecorder recorder = null;
64     private ExtractorHTML extractor = null;
65     
66     protected ExtractorHTML createExtractor()
67     throws InvalidAttributeValueException JavaDoc, AttributeNotFoundException JavaDoc,
68     MBeanException JavaDoc, ReflectionException JavaDoc {
69         // Hack in a settings handler. Do this by adding this extractor
70
// to the order file (I'm adding it to a random MapType; seemingly
71
// can only add to MapTypes post-construction). This takes care
72
// of setting a valid SettingsHandler into the ExtractorHTML (This
73
// shouldn't be so difficult). Of note, the order file below is
74
// not written to disk.
75
final String JavaDoc name = this.getClass().getName();
76         SettingsHandler handler = new XMLSettingsHandler(
77             new File JavaDoc(getTmpDir(), name + ".order.xml"));
78         handler.initialize();
79         return (ExtractorHTML)((MapType)handler.getOrder().
80             getAttribute(CrawlOrder.ATTR_RULES)).addElement(handler.
81                 getSettingsObject(null), new ExtractorHTML(name));
82     }
83     
84     protected void setUp() throws Exception JavaDoc {
85         super.setUp();
86         this.extractor = createExtractor();
87         final boolean USE_NET = false;
88         URL JavaDoc url = null;
89         if (USE_NET) {
90             url = new URL JavaDoc("http://" + this.ARCHIVE_DOT_ORG);
91         } else {
92             File JavaDoc f = new File JavaDoc(getTmpDir(), this.ARCHIVE_DOT_ORG + ".html");
93             url = new URL JavaDoc("file://" + f.getAbsolutePath());
94             FileOutputStream JavaDoc fos = new FileOutputStream JavaDoc(f);
95             fos.write(("<html><head><title>test</title><body>" +
96                 "<a HREF=" + this.LINK_TO_FIND + ">Hewlett Foundation</a>" +
97                 "</body></html>").getBytes());
98             fos.flush();
99             fos.close();
100         }
101         this.recorder = HttpRecorder.wrapInputStreamWithHttpRecord(getTmpDir(),
102             this.getClass().getName(), url.openStream(), null);
103     }
104
105     /*
106      * @see TestCase#tearDown()
107      */

108     protected void tearDown() throws Exception JavaDoc {
109         super.tearDown();
110     }
111
112     public void testInnerProcess() throws IOException JavaDoc {
113         UURI uuri = UURIFactory.getInstance("http://" + this.ARCHIVE_DOT_ORG);
114         CrawlURI curi = setupCrawlURI(this.recorder, uuri.toString());
115         this.extractor.innerProcess(curi);
116         Collection JavaDoc links = curi.getOutLinks();
117         boolean foundLinkToHewlettFoundation = false;
118         for (Iterator JavaDoc i = links.iterator(); i.hasNext();) {
119             Link link = (Link)i.next();
120             if (link.getDestination().toString().equals(this.LINK_TO_FIND)) {
121                 foundLinkToHewlettFoundation = true;
122                 break;
123             }
124         }
125         assertTrue("Did not find gif url", foundLinkToHewlettFoundation);
126     }
127     
128     private CrawlURI setupCrawlURI(HttpRecorder rec, String JavaDoc url)
129             throws URIException {
130         CrawlURI curi = new CrawlURI(UURIFactory.getInstance(url));
131         curi.setContentSize(this.recorder.getRecordedInput().getSize());
132         curi.setContentType("text/html");
133         curi.setFetchStatus(200);
134         curi.setHttpRecorder(rec);
135         // Fake out the extractor that this is a HTTP transaction.
136
curi.putObject(CoreAttributeConstants.A_HTTP_TRANSACTION,
137             new Object JavaDoc());
138         return curi;
139     }
140     
141     /**
142      * Test single net or local filesystem page parse.
143      * Set the uuri to be a net url or instead put in place a file
144      * named for this class under the unit test directory.
145      * @throws IOException
146      * @throws ReflectionException
147      * @throws MBeanException
148      * @throws AttributeNotFoundException
149      * @throws InvalidAttributeValueException
150      */

151     public void testPageParse()
152     throws InvalidAttributeValueException JavaDoc, AttributeNotFoundException JavaDoc,
153     MBeanException JavaDoc, ReflectionException JavaDoc, IOException JavaDoc {
154         UURI uuri = null;
155         
156 // DO
157
// uuri = UURIFactory.getInstance("http://www.xjmu.edu.cn/");
158
// OR
159
// File f = new File(getTmpDir(), this.getClass().getName() +
160
// ".html");
161
// if (f.exists()) {
162
// uuri = UURIFactory.getInstance("file://" +
163
// f.getAbsolutePath());
164
// }
165
// OR
166
// uuri = getUURI(URL or PATH)
167
//
168
// OR
169
// Use the main method below and pass this class an argument.
170
//
171
if (uuri != null) {
172             runExtractor(uuri);
173         }
174     }
175     
176     protected UURI getUURI(String JavaDoc url) throws URIException {
177         url = (url.indexOf("://") > 0)? url: "file://" + url;
178         return UURIFactory.getInstance(url);
179     }
180     
181     protected void runExtractor(UURI baseUURI)
182     throws InvalidAttributeValueException JavaDoc, AttributeNotFoundException JavaDoc,
183     MBeanException JavaDoc, ReflectionException JavaDoc, IOException JavaDoc {
184         runExtractor(baseUURI, null);
185     }
186     
187     protected void runExtractor(UURI baseUURI, String JavaDoc encoding)
188     throws IOException JavaDoc, InvalidAttributeValueException JavaDoc,
189     AttributeNotFoundException JavaDoc, MBeanException JavaDoc, ReflectionException JavaDoc {
190         if (baseUURI == null) {
191             return;
192         }
193         this.extractor = createExtractor();
194         URL JavaDoc url = new URL JavaDoc(baseUURI.toString());
195         this.recorder = HttpRecorder.
196             wrapInputStreamWithHttpRecord(getTmpDir(),
197             this.getClass().getName(), url.openStream(), encoding);
198         CrawlURI curi = setupCrawlURI(this.recorder, url.toString());
199         this.extractor.innerProcess(curi);
200         
201         System.out.println("+" + this.extractor.report());
202         int count = 0;
203         Collection JavaDoc links = curi.getOutLinks();
204         System.out.println("+HTML Links (hopType="+Link.NAVLINK_HOP+"):");
205         if (links != null) {
206             for (Iterator JavaDoc i = links.iterator(); i.hasNext();) {
207                 Link link = (Link)i.next();
208                 if (link.getHopType()==Link.NAVLINK_HOP) {
209                     count++;
210                     System.out.println(link.getDestination());
211                 }
212             }
213         }
214         System.out.println("+HTML Embeds (hopType="+Link.EMBED_HOP+"):");
215         if (links != null) {
216             for (Iterator JavaDoc i = links.iterator(); i.hasNext();) {
217                 Link link = (Link)i.next();
218                 if (link.getHopType()==Link.EMBED_HOP) {
219                     count++;
220                     System.out.println(link.getDestination());
221                 }
222             }
223         }
224         System.out.
225             println("+HTML Speculative Embeds (hopType="+Link.SPECULATIVE_HOP+"):");
226         if (links != null) {
227             for (Iterator JavaDoc i = links.iterator(); i.hasNext();) {
228                 Link link = (Link)i.next();
229                 if (link.getHopType()==Link.SPECULATIVE_HOP) {
230                     count++;
231                     System.out.println(link.getDestination());
232                 }
233             }
234         }
235         System.out.
236             println("+HTML Other (all other hopTypes):");
237         if (links != null) {
238             for (Iterator JavaDoc i = links.iterator(); i.hasNext();) {
239                 Link link = (Link) i.next();
240                 if (link.getHopType() != Link.SPECULATIVE_HOP
241                         && link.getHopType() != Link.NAVLINK_HOP
242                         && link.getHopType() != Link.EMBED_HOP) {
243                     count++;
244                     System.out.println(link.getHopType() + " "
245                             + link.getDestination());
246                 }
247             }
248         }
249         System.out.println("TOTAL URIS EXTRACTED: "+count);
250     }
251
252     /**
253      * Test a particular <embed SRC=...> construct that was suspicious in
254      * the No10GovUk crawl.
255      *
256      * @throws URIException
257      */

258     public void testEmbedSrc() throws URIException {
259         CrawlURI curi=
260             new CrawlURI(UURIFactory.getInstance("http://www.example.org"));
261         // An example from http://www.records.pro.gov.uk/documents/prem/18/1/default.asp?PageId=62&qt=true
262
CharSequence JavaDoc cs = "<embed SRC=\"/documents/prem/18/1/graphics/qtvr/" +
263             "hall.mov\" width=\"320\" height=\"212\" controller=\"true\" " +
264             "CORRECTION=\"FULL\" pluginspage=\"http://www.apple.com/" +
265             "quicktime/download/\" /> ";
266         this.extractor.extract(curi,cs);
267         assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
268             public boolean evaluate(Object JavaDoc object) {
269                 return ((Link) object).getDestination().toString().indexOf(
270                         "/documents/prem/18/1/graphics/qtvr/hall.mov")>=0;
271             }
272         }));
273     }
274     
275     /**
276      * Test a whitespace issue found in href.
277      *
278      * See [ 963965 ] Either UURI or ExtractHTML should strip whitespace better.
279      * https://sourceforge.net/tracker/?func=detail&atid=539099&aid=963965&group_id=73833
280      *
281      * @throws URIException
282      */

283     public void testHrefWhitespace() throws URIException {
284         CrawlURI curi =
285             new CrawlURI(UURIFactory.getInstance("http://www.carsound.dk"));
286         CharSequence JavaDoc cs = "<a HREF=\"http://www.carsound.dk\n\n\n" +
287             "\"\ntarget=\"_blank\">C.A.R. Sound\n\n\n\n</a>";
288         this.extractor.extract(curi,cs);
289         curi.getOutLinks();
290         assertTrue("Not stripping new lines", CollectionUtils.exists(curi
291                 .getOutLinks(), new Predicate() {
292             public boolean evaluate(Object JavaDoc object) {
293                 return ((Link) object).getDestination().toString().indexOf(
294                         "http://www.carsound.dk/")>=0;
295             }
296         }));
297     }
298     
299     public static void main(String JavaDoc[] args) throws Exception JavaDoc {
300         if (args.length != 1 && args.length != 2) {
301             System.err.println("Usage: " + ExtractorHTMLTest.class.getName() +
302                 " URL|PATH [ENCODING]");
303             System.exit(1);
304         }
305         ExtractorHTMLTest testCase = new ExtractorHTMLTest();
306         testCase.setUp();
307         try {
308             testCase.runExtractor(testCase.getUURI(args[0]),
309                 (args.length == 2)? args[1]: null);
310         } finally {
311             testCase.tearDown();
312         }
313     }
314 }
315
Popular Tags