JerichoExtractorHTMLTest


1   /* JerichoExtractorHTMLTest
2    *
3    * Copyright (C) 2006 Olaf Freyer
4    *
5    * This file is part of the Heritrix web crawler (crawler.archive.org).
6    *
7    * Heritrix is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU Lesser Public License as published by
9    * the Free Software Foundation; either version 2.1 of the License, or
10   * any later version.
11   *
12   * Heritrix is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU Lesser Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser Public License
18   * along with Heritrix; if not, write to the Free Software
19   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
20   * 
21   * 
22   */
23  package org.archive.crawler.extractor;
24  
25  import java.io.File  ;
26  import java.io.FileOutputStream  ;
27  import java.io.IOException  ;
28  import java.net.URL  ;
29  import java.util.Collection  ;
30  import java.util.Iterator  ;
31  
32  import javax.management.AttributeNotFoundException  ;
33  import javax.management.InvalidAttributeValueException  ;
34  import javax.management.MBeanException  ;
35  import javax.management.ReflectionException  ;
36  
37  import org.apache.commons.collections.CollectionUtils;
38  import org.apache.commons.collections.Predicate;
39  import org.apache.commons.httpclient.URIException;
40  import org.archive.crawler.datamodel.CoreAttributeConstants;
41  import org.archive.crawler.datamodel.CrawlOrder;
42  import org.archive.crawler.datamodel.CrawlURI;
43  import org.archive.crawler.settings.MapType;
44  import org.archive.crawler.settings.SettingsHandler;
45  import org.archive.crawler.settings.XMLSettingsHandler;
46  import org.archive.net.UURI;
47  import org.archive.net.UURIFactory;
48  import org.archive.util.HttpRecorder;
49  
50  
51  /**
52   * Test html extractor.
53   *
54   * @author stack
55   * @version $Revision: 1.1.2.1 $, $Date: 2007/01/13 01:31:17 $
56   */
57  public class JerichoExtractorHTMLTest
58  extends ExtractorHTMLTest
59  implements CoreAttributeConstants {
60      private final String   ARCHIVE_DOT_ORG = "archive.org";
61      private final String   LINK_TO_FIND = "http://www.hewlett.org/";
62      private HttpRecorder recorder = null;
63      private JerichoExtractorHTML extractor = null;
64      
65      protected JerichoExtractorHTML createExtractor()
66      throws InvalidAttributeValueException  , AttributeNotFoundException  ,
67      MBeanException  , ReflectionException   {
68          // Hack in a settings handler.  Do this by adding this extractor
69          // to the order file (I'm adding it to a random MapType; seemingly
70          // can only add to MapTypes post-construction). This takes care
71          // of setting a valid SettingsHandler into the ExtractorHTML (This
72          // shouldn't be so difficult).  Of note, the order file below is
73          // not written to disk.
74          final String   name = this.getClass().getName();
75          SettingsHandler handler = new XMLSettingsHandler(
76              new File  (getTmpDir(), name + ".order.xml"));
77          handler.initialize();
78          return (JerichoExtractorHTML)((MapType)handler.getOrder().
79              getAttribute(CrawlOrder.ATTR_RULES)).addElement(handler.
80                  getSettingsObject(null), new JerichoExtractorHTML(name));
81      }
82      
83      protected void setUp() throws Exception   {
84          super.setUp();
85          this.extractor = createExtractor();
86          final boolean USE_NET = false;
87          URL   url = null;
88          if (USE_NET) {
89              url = new URL  ("http://" + this.ARCHIVE_DOT_ORG);
90          } else {
91              File   f = new File  (getTmpDir(), this.ARCHIVE_DOT_ORG + ".html");
92              url = new URL  ("file://" + f.getAbsolutePath());
93              FileOutputStream   fos = new FileOutputStream  (f);
94              fos.write(("<html><head><title>test</title><body>" +
95                  "<a HREF=" + this.LINK_TO_FIND + ">Hewlett Foundation</a>" +
96                  "</body></html>").getBytes());
97              fos.flush();
98              fos.close();
99          }
100         this.recorder = HttpRecorder.wrapInputStreamWithHttpRecord(getTmpDir(),
101             this.getClass().getName(), url.openStream(), null);
102     }
103 
104 
105     public void testInnerProcess() throws IOException   {
106         UURI uuri = UURIFactory.getInstance("http://" + this.ARCHIVE_DOT_ORG);
107         CrawlURI curi = setupCrawlURI(this.recorder, uuri.toString());
108         this.extractor.innerProcess(curi);
109         Collection   links = curi.getOutLinks();
110         boolean foundLinkToHewlettFoundation = false;
111         for (Iterator   i = links.iterator(); i.hasNext();) {
112             Link link = (Link)i.next();
113             if (link.getDestination().toString().equals(this.LINK_TO_FIND)) {
114                 foundLinkToHewlettFoundation = true;
115                 break;
116             }
117         }
118         assertTrue("Did not find gif url", foundLinkToHewlettFoundation);
119     }
120     
121     private CrawlURI setupCrawlURI(HttpRecorder rec, String   url)
122             throws URIException {
123         CrawlURI curi = new CrawlURI(UURIFactory.getInstance(url));
124         curi.setContentSize(this.recorder.getRecordedInput().getSize());
125         curi.setContentType("text/html");
126         curi.setFetchStatus(200);
127         curi.setHttpRecorder(rec);
128         // Fake out the extractor that this is a HTTP transaction.
129         curi.putObject(CoreAttributeConstants.A_HTTP_TRANSACTION,
130             new Object  ());
131         return curi;
132     }
133 
134     
135     /**
136      * Test a forms link extraction
137      * 
138      * @throws URIException
139      */
140     public void testFormsLink() throws URIException {
141         CrawlURI curi =
142             new CrawlURI(UURIFactory.getInstance("http://www.example.org"));
143         CharSequence   cs = 
144             "<form name=\"testform\" method=\"POST\" action=\"redirect_me?form=true\"> " +
145             "  <INPUT TYPE=CHECKBOX NAME=\"checked[]\" VALUE=\"1\" CHECKED> "+
146             "  <INPUT TYPE=CHECKBOX NAME=\"unchecked[]\" VALUE=\"1\"> " +
147             "  <select name=\"selectBox\">" +
148             "    <option value=\"selectedOption\" selected>option1</option>" +
149             "    <option value=\"nonselectedOption\">option2</option>" +
150             "  </select>" +
151             "  <input type=\"submit\" name=\"test\" value=\"Go\">" +
152             "</form>";   
153         this.extractor.extract(curi,cs);
154         curi.getOutLinks();
155         assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
156             public boolean evaluate(Object   object) {
157                 return ((Link) object).getDestination().toString().indexOf(
158                         "/redirect_me?form=true&checked[]=1&unchecked[]=&selectBox=selectedOption&test=Go")>=0;
159             }
160         }));
161     }
162 }
163
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags