KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > extractor > JerichoExtractorHTMLTest


1 /* JerichoExtractorHTMLTest
2  *
3  * Copyright (C) 2006 Olaf Freyer
4  *
5  * This file is part of the Heritrix web crawler (crawler.archive.org).
6  *
7  * Heritrix is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU Lesser Public License as published by
9  * the Free Software Foundation; either version 2.1 of the License, or
10  * any later version.
11  *
12  * Heritrix is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU Lesser Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser Public License
18  * along with Heritrix; if not, write to the Free Software
19  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20  *
21  *
22  */

23 package org.archive.crawler.extractor;
24
25 import java.io.File JavaDoc;
26 import java.io.FileOutputStream JavaDoc;
27 import java.io.IOException JavaDoc;
28 import java.net.URL JavaDoc;
29 import java.util.Collection JavaDoc;
30 import java.util.Iterator JavaDoc;
31
32 import javax.management.AttributeNotFoundException JavaDoc;
33 import javax.management.InvalidAttributeValueException JavaDoc;
34 import javax.management.MBeanException JavaDoc;
35 import javax.management.ReflectionException JavaDoc;
36
37 import org.apache.commons.collections.CollectionUtils;
38 import org.apache.commons.collections.Predicate;
39 import org.apache.commons.httpclient.URIException;
40 import org.archive.crawler.datamodel.CoreAttributeConstants;
41 import org.archive.crawler.datamodel.CrawlOrder;
42 import org.archive.crawler.datamodel.CrawlURI;
43 import org.archive.crawler.settings.MapType;
44 import org.archive.crawler.settings.SettingsHandler;
45 import org.archive.crawler.settings.XMLSettingsHandler;
46 import org.archive.net.UURI;
47 import org.archive.net.UURIFactory;
48 import org.archive.util.HttpRecorder;
49
50
51 /**
52  * Test html extractor.
53  *
54  * @author stack
55  * @version $Revision: 1.1.2.1 $, $Date: 2007/01/13 01:31:17 $
56  */

57 public class JerichoExtractorHTMLTest
58 extends ExtractorHTMLTest
59 implements CoreAttributeConstants {
60     private final String JavaDoc ARCHIVE_DOT_ORG = "archive.org";
61     private final String JavaDoc LINK_TO_FIND = "http://www.hewlett.org/";
62     private HttpRecorder recorder = null;
63     private JerichoExtractorHTML extractor = null;
64     
65     protected JerichoExtractorHTML createExtractor()
66     throws InvalidAttributeValueException JavaDoc, AttributeNotFoundException JavaDoc,
67     MBeanException JavaDoc, ReflectionException JavaDoc {
68         // Hack in a settings handler. Do this by adding this extractor
69
// to the order file (I'm adding it to a random MapType; seemingly
70
// can only add to MapTypes post-construction). This takes care
71
// of setting a valid SettingsHandler into the ExtractorHTML (This
72
// shouldn't be so difficult). Of note, the order file below is
73
// not written to disk.
74
final String JavaDoc name = this.getClass().getName();
75         SettingsHandler handler = new XMLSettingsHandler(
76             new File JavaDoc(getTmpDir(), name + ".order.xml"));
77         handler.initialize();
78         return (JerichoExtractorHTML)((MapType)handler.getOrder().
79             getAttribute(CrawlOrder.ATTR_RULES)).addElement(handler.
80                 getSettingsObject(null), new JerichoExtractorHTML(name));
81     }
82     
83     protected void setUp() throws Exception JavaDoc {
84         super.setUp();
85         this.extractor = createExtractor();
86         final boolean USE_NET = false;
87         URL JavaDoc url = null;
88         if (USE_NET) {
89             url = new URL JavaDoc("http://" + this.ARCHIVE_DOT_ORG);
90         } else {
91             File JavaDoc f = new File JavaDoc(getTmpDir(), this.ARCHIVE_DOT_ORG + ".html");
92             url = new URL JavaDoc("file://" + f.getAbsolutePath());
93             FileOutputStream JavaDoc fos = new FileOutputStream JavaDoc(f);
94             fos.write(("<html><head><title>test</title><body>" +
95                 "<a HREF=" + this.LINK_TO_FIND + ">Hewlett Foundation</a>" +
96                 "</body></html>").getBytes());
97             fos.flush();
98             fos.close();
99         }
100         this.recorder = HttpRecorder.wrapInputStreamWithHttpRecord(getTmpDir(),
101             this.getClass().getName(), url.openStream(), null);
102     }
103
104
105     public void testInnerProcess() throws IOException JavaDoc {
106         UURI uuri = UURIFactory.getInstance("http://" + this.ARCHIVE_DOT_ORG);
107         CrawlURI curi = setupCrawlURI(this.recorder, uuri.toString());
108         this.extractor.innerProcess(curi);
109         Collection JavaDoc links = curi.getOutLinks();
110         boolean foundLinkToHewlettFoundation = false;
111         for (Iterator JavaDoc i = links.iterator(); i.hasNext();) {
112             Link link = (Link)i.next();
113             if (link.getDestination().toString().equals(this.LINK_TO_FIND)) {
114                 foundLinkToHewlettFoundation = true;
115                 break;
116             }
117         }
118         assertTrue("Did not find gif url", foundLinkToHewlettFoundation);
119     }
120     
121     private CrawlURI setupCrawlURI(HttpRecorder rec, String JavaDoc url)
122             throws URIException {
123         CrawlURI curi = new CrawlURI(UURIFactory.getInstance(url));
124         curi.setContentSize(this.recorder.getRecordedInput().getSize());
125         curi.setContentType("text/html");
126         curi.setFetchStatus(200);
127         curi.setHttpRecorder(rec);
128         // Fake out the extractor that this is a HTTP transaction.
129
curi.putObject(CoreAttributeConstants.A_HTTP_TRANSACTION,
130             new Object JavaDoc());
131         return curi;
132     }
133
134     
135     /**
136      * Test a forms link extraction
137      *
138      * @throws URIException
139      */

140     public void testFormsLink() throws URIException {
141         CrawlURI curi =
142             new CrawlURI(UURIFactory.getInstance("http://www.example.org"));
143         CharSequence JavaDoc cs =
144             "<form name=\"testform\" method=\"POST\" action=\"redirect_me?form=true\"> " +
145             " <INPUT TYPE=CHECKBOX NAME=\"checked[]\" VALUE=\"1\" CHECKED> "+
146             " <INPUT TYPE=CHECKBOX NAME=\"unchecked[]\" VALUE=\"1\"> " +
147             " <select name=\"selectBox\">" +
148             " <option value=\"selectedOption\" selected>option1</option>" +
149             " <option value=\"nonselectedOption\">option2</option>" +
150             " </select>" +
151             " <input type=\"submit\" name=\"test\" value=\"Go\">" +
152             "</form>";
153         this.extractor.extract(curi,cs);
154         curi.getOutLinks();
155         assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
156             public boolean evaluate(Object JavaDoc object) {
157                 return ((Link) object).getDestination().toString().indexOf(
158                         "/redirect_me?form=true&checked[]=1&unchecked[]=&selectBox=selectedOption&test=Go")>=0;
159             }
160         }));
161     }
162 }
163
Popular Tags