ExtractorHTMLTest


1   /* ExtractorHTMLTest
2    *
3    * Created on May 19, 2004
4    *
5    * Copyright (C) 2004 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.extractor;
24  
25  import java.io.File  ;
26  import java.io.FileOutputStream  ;
27  import java.io.IOException  ;
28  import java.net.URL  ;
29  import java.util.Collection  ;
30  import java.util.Iterator  ;
31  
32  import javax.management.AttributeNotFoundException  ;
33  import javax.management.InvalidAttributeValueException  ;
34  import javax.management.MBeanException  ;
35  import javax.management.ReflectionException  ;
36  
37  import org.apache.commons.collections.CollectionUtils;
38  import org.apache.commons.collections.Predicate;
39  import org.apache.commons.httpclient.URIException;
40  import org.archive.crawler.datamodel.CoreAttributeConstants;
41  import org.archive.crawler.datamodel.CrawlOrder;
42  import org.archive.crawler.datamodel.CrawlURI;
43  import org.archive.crawler.settings.MapType;
44  import org.archive.crawler.settings.SettingsHandler;
45  import org.archive.crawler.settings.XMLSettingsHandler;
46  import org.archive.net.UURI;
47  import org.archive.net.UURIFactory;
48  import org.archive.util.HttpRecorder;
49  import org.archive.util.TmpDirTestCase;
50  
51  
52  /**
53   * Test html extractor.
54   *
55   * @author stack
56   * @version $Revision: 1.20 $, $Date: 2005/09/22 23:03:11 $
57   */
58  public class ExtractorHTMLTest
59  extends TmpDirTestCase
60  implements CoreAttributeConstants {
61      private final String   ARCHIVE_DOT_ORG = "archive.org";
62      private final String   LINK_TO_FIND = "http://www.hewlett.org/";
63      private HttpRecorder recorder = null;
64      private ExtractorHTML extractor = null;
65      
66      protected ExtractorHTML createExtractor()
67      throws InvalidAttributeValueException  , AttributeNotFoundException  ,
68      MBeanException  , ReflectionException   {
69          // Hack in a settings handler.  Do this by adding this extractor
70          // to the order file (I'm adding it to a random MapType; seemingly
71          // can only add to MapTypes post-construction). This takes care
72          // of setting a valid SettingsHandler into the ExtractorHTML (This
73          // shouldn't be so difficult).  Of note, the order file below is
74          // not written to disk.
75          final String   name = this.getClass().getName();
76          SettingsHandler handler = new XMLSettingsHandler(
77              new File  (getTmpDir(), name + ".order.xml"));
78          handler.initialize();
79          return (ExtractorHTML)((MapType)handler.getOrder().
80              getAttribute(CrawlOrder.ATTR_RULES)).addElement(handler.
81                  getSettingsObject(null), new ExtractorHTML(name));
82      }
83      
84      protected void setUp() throws Exception   {
85          super.setUp();
86          this.extractor = createExtractor();
87          final boolean USE_NET = false;
88          URL   url = null;
89          if (USE_NET) {
90              url = new URL  ("http://" + this.ARCHIVE_DOT_ORG);
91          } else {
92              File   f = new File  (getTmpDir(), this.ARCHIVE_DOT_ORG + ".html");
93              url = new URL  ("file://" + f.getAbsolutePath());
94              FileOutputStream   fos = new FileOutputStream  (f);
95              fos.write(("<html><head><title>test</title><body>" +
96                  "<a HREF=" + this.LINK_TO_FIND + ">Hewlett Foundation</a>" +
97                  "</body></html>").getBytes());
98              fos.flush();
99              fos.close();
100         }
101         this.recorder = HttpRecorder.wrapInputStreamWithHttpRecord(getTmpDir(),
102             this.getClass().getName(), url.openStream(), null);
103     }
104 
105     /*
106      * @see TestCase#tearDown()
107      */
108     protected void tearDown() throws Exception   {
109         super.tearDown();
110     }
111 
112     public void testInnerProcess() throws IOException   {
113         UURI uuri = UURIFactory.getInstance("http://" + this.ARCHIVE_DOT_ORG);
114         CrawlURI curi = setupCrawlURI(this.recorder, uuri.toString());
115         this.extractor.innerProcess(curi);
116         Collection   links = curi.getOutLinks();
117         boolean foundLinkToHewlettFoundation = false;
118         for (Iterator   i = links.iterator(); i.hasNext();) {
119             Link link = (Link)i.next();
120             if (link.getDestination().toString().equals(this.LINK_TO_FIND)) {
121                 foundLinkToHewlettFoundation = true;
122                 break;
123             }
124         }
125         assertTrue("Did not find gif url", foundLinkToHewlettFoundation);
126     }
127     
128     private CrawlURI setupCrawlURI(HttpRecorder rec, String   url)
129             throws URIException {
130         CrawlURI curi = new CrawlURI(UURIFactory.getInstance(url));
131         curi.setContentSize(this.recorder.getRecordedInput().getSize());
132         curi.setContentType("text/html");
133         curi.setFetchStatus(200);
134         curi.setHttpRecorder(rec);
135         // Fake out the extractor that this is a HTTP transaction.
136         curi.putObject(CoreAttributeConstants.A_HTTP_TRANSACTION,
137             new Object  ());
138         return curi;
139     }
140     
141     /**
142      * Test single net or local filesystem page parse.
143      * Set the uuri to be a net url or instead put in place a file
144      * named for this class under the unit test directory.
145      * @throws IOException
146      * @throws ReflectionException
147      * @throws MBeanException
148      * @throws AttributeNotFoundException
149      * @throws InvalidAttributeValueException
150      */
151     public void testPageParse()
152     throws InvalidAttributeValueException  , AttributeNotFoundException  ,
153     MBeanException  , ReflectionException  , IOException   {
154         UURI uuri = null;
155         
156 // DO
157 //      uuri = UURIFactory.getInstance("http://www.xjmu.edu.cn/");
158 // OR
159 //        File f = new File(getTmpDir(), this.getClass().getName() +
160 //        ".html");
161 //        if (f.exists()) {
162 //          uuri = UURIFactory.getInstance("file://" +
163 //                  f.getAbsolutePath());
164 //        }
165 // OR 
166 //      uuri = getUURI(URL or PATH)
167 //
168 // OR 
169 //      Use the main method below and pass this class an argument.
170 //     
171         if (uuri != null) {
172             runExtractor(uuri);
173         }
174     }
175     
176     protected UURI getUURI(String   url) throws URIException {
177         url = (url.indexOf("://") > 0)? url: "file://" + url;
178         return UURIFactory.getInstance(url);
179     }
180     
181     protected void runExtractor(UURI baseUURI)
182     throws InvalidAttributeValueException  , AttributeNotFoundException  ,
183     MBeanException  , ReflectionException  , IOException   {
184         runExtractor(baseUURI, null);
185     }
186     
187     protected void runExtractor(UURI baseUURI, String   encoding)
188     throws IOException  , InvalidAttributeValueException  ,
189     AttributeNotFoundException  , MBeanException  , ReflectionException   {
190         if (baseUURI == null) {
191             return;
192         }
193         this.extractor = createExtractor();
194         URL   url = new URL  (baseUURI.toString());
195         this.recorder = HttpRecorder.
196             wrapInputStreamWithHttpRecord(getTmpDir(),
197             this.getClass().getName(), url.openStream(), encoding);
198         CrawlURI curi = setupCrawlURI(this.recorder, url.toString());
199         this.extractor.innerProcess(curi);
200         
201         System.out.println("+" + this.extractor.report());
202         int count = 0; 
203         Collection   links = curi.getOutLinks();
204         System.out.println("+HTML Links (hopType="+Link.NAVLINK_HOP+"):");
205         if (links != null) {
206             for (Iterator   i = links.iterator(); i.hasNext();) {
207                 Link link = (Link)i.next();
208                 if (link.getHopType()==Link.NAVLINK_HOP) {
209                     count++;
210                     System.out.println(link.getDestination());
211                 }
212             }
213         }
214         System.out.println("+HTML Embeds (hopType="+Link.EMBED_HOP+"):");
215         if (links != null) {
216             for (Iterator   i = links.iterator(); i.hasNext();) {
217                 Link link = (Link)i.next();
218                 if (link.getHopType()==Link.EMBED_HOP) {
219                     count++;
220                     System.out.println(link.getDestination());
221                 }
222             }
223         }
224         System.out.
225             println("+HTML Speculative Embeds (hopType="+Link.SPECULATIVE_HOP+"):");
226         if (links != null) {
227             for (Iterator   i = links.iterator(); i.hasNext();) {
228                 Link link = (Link)i.next();
229                 if (link.getHopType()==Link.SPECULATIVE_HOP) {
230                     count++;
231                     System.out.println(link.getDestination());
232                 }
233             }
234         }
235         System.out.
236             println("+HTML Other (all other hopTypes):");
237         if (links != null) {
238             for (Iterator   i = links.iterator(); i.hasNext();) {
239                 Link link = (Link) i.next();
240                 if (link.getHopType() != Link.SPECULATIVE_HOP
241                         && link.getHopType() != Link.NAVLINK_HOP
242                         && link.getHopType() != Link.EMBED_HOP) {
243                     count++;
244                     System.out.println(link.getHopType() + " "
245                             + link.getDestination());
246                 }
247             }
248         }
249         System.out.println("TOTAL URIS EXTRACTED: "+count);
250     }
251 
252     /**
253      * Test a particular <embed SRC=...> construct that was suspicious in
254      * the No10GovUk crawl.
255      *
256      * @throws URIException
257      */
258     public void testEmbedSrc() throws URIException {
259         CrawlURI curi=
260             new CrawlURI(UURIFactory.getInstance("http://www.example.org"));
261         // An example from http://www.records.pro.gov.uk/documents/prem/18/1/default.asp?PageId=62&qt=true
262         CharSequence   cs = "<embed SRC=\"/documents/prem/18/1/graphics/qtvr/" +
263             "hall.mov\" width=\"320\" height=\"212\" controller=\"true\" " +
264             "CORRECTION=\"FULL\" pluginspage=\"http://www.apple.com/" +
265             "quicktime/download/\" /> ";
266         this.extractor.extract(curi,cs);
267         assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
268             public boolean evaluate(Object   object) {
269                 return ((Link) object).getDestination().toString().indexOf(
270                         "/documents/prem/18/1/graphics/qtvr/hall.mov")>=0;
271             }
272         }));
273     }
274     
275     /**
276      * Test a whitespace issue found in href.
277      * 
278      * See [ 963965 ] Either UURI or ExtractHTML should strip whitespace better.
279      * https://sourceforge.net/tracker/?func=detail&atid=539099&aid=963965&group_id=73833
280      *
281      * @throws URIException
282      */
283     public void testHrefWhitespace() throws URIException {
284         CrawlURI curi =
285             new CrawlURI(UURIFactory.getInstance("http://www.carsound.dk"));
286         CharSequence   cs = "<a HREF=\"http://www.carsound.dk\n\n\n" +
287             "\"\ntarget=\"_blank\">C.A.R. Sound\n\n\n\n</a>";   
288         this.extractor.extract(curi,cs);
289         curi.getOutLinks();
290         assertTrue("Not stripping new lines", CollectionUtils.exists(curi
291                 .getOutLinks(), new Predicate() {
292             public boolean evaluate(Object   object) {
293                 return ((Link) object).getDestination().toString().indexOf(
294                         "http://www.carsound.dk/")>=0;
295             }
296         }));
297     }
298     
299     public static void main(String  [] args) throws Exception   {
300         if (args.length != 1 && args.length != 2) {
301             System.err.println("Usage: " + ExtractorHTMLTest.class.getName() +
302                 " URL|PATH [ENCODING]");
303             System.exit(1);
304         }
305         ExtractorHTMLTest testCase = new ExtractorHTMLTest();
306         testCase.setUp();
307         try {
308             testCase.runExtractor(testCase.getUURI(args[0]),
309                 (args.length == 2)? args[1]: null);
310         } finally {
311             testCase.tearDown();
312         }
313     }
314 }
315
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags