1 23 package org.archive.crawler.extractor; 24 25 import java.io.File ; 26 import java.io.FileOutputStream ; 27 import java.io.IOException ; 28 import java.net.URL ; 29 import java.util.Collection ; 30 import java.util.Iterator ; 31 32 import javax.management.AttributeNotFoundException ; 33 import javax.management.InvalidAttributeValueException ; 34 import javax.management.MBeanException ; 35 import javax.management.ReflectionException ; 36 37 import org.apache.commons.collections.CollectionUtils; 38 import org.apache.commons.collections.Predicate; 39 import org.apache.commons.httpclient.URIException; 40 import org.archive.crawler.datamodel.CoreAttributeConstants; 41 import org.archive.crawler.datamodel.CrawlOrder; 42 import org.archive.crawler.datamodel.CrawlURI; 43 import org.archive.crawler.settings.MapType; 44 import org.archive.crawler.settings.SettingsHandler; 45 import org.archive.crawler.settings.XMLSettingsHandler; 46 import org.archive.net.UURI; 47 import org.archive.net.UURIFactory; 48 import org.archive.util.HttpRecorder; 49 50 51 57 public class JerichoExtractorHTMLTest 58 extends ExtractorHTMLTest 59 implements CoreAttributeConstants { 60 private final String ARCHIVE_DOT_ORG = "archive.org"; 61 private final String LINK_TO_FIND = "http://www.hewlett.org/"; 62 private HttpRecorder recorder = null; 63 private JerichoExtractorHTML extractor = null; 64 65 protected JerichoExtractorHTML createExtractor() 66 throws InvalidAttributeValueException , AttributeNotFoundException , 67 MBeanException , ReflectionException { 68 final String name = this.getClass().getName(); 75 SettingsHandler handler = new XMLSettingsHandler( 76 new File (getTmpDir(), name + ".order.xml")); 77 handler.initialize(); 78 return (JerichoExtractorHTML)((MapType)handler.getOrder(). 79 getAttribute(CrawlOrder.ATTR_RULES)).addElement(handler. 80 getSettingsObject(null), new JerichoExtractorHTML(name)); 81 } 82 83 protected void setUp() throws Exception { 84 super.setUp(); 85 this.extractor = createExtractor(); 86 final boolean USE_NET = false; 87 URL url = null; 88 if (USE_NET) { 89 url = new URL ("http://" + this.ARCHIVE_DOT_ORG); 90 } else { 91 File f = new File (getTmpDir(), this.ARCHIVE_DOT_ORG + ".html"); 92 url = new URL ("file://" + f.getAbsolutePath()); 93 FileOutputStream fos = new FileOutputStream (f); 94 fos.write(("<html><head><title>test</title><body>" + 95 "<a HREF=" + this.LINK_TO_FIND + ">Hewlett Foundation</a>" + 96 "</body></html>").getBytes()); 97 fos.flush(); 98 fos.close(); 99 } 100 this.recorder = HttpRecorder.wrapInputStreamWithHttpRecord(getTmpDir(), 101 this.getClass().getName(), url.openStream(), null); 102 } 103 104 105 public void testInnerProcess() throws IOException { 106 UURI uuri = UURIFactory.getInstance("http://" + this.ARCHIVE_DOT_ORG); 107 CrawlURI curi = setupCrawlURI(this.recorder, uuri.toString()); 108 this.extractor.innerProcess(curi); 109 Collection links = curi.getOutLinks(); 110 boolean foundLinkToHewlettFoundation = false; 111 for (Iterator i = links.iterator(); i.hasNext();) { 112 Link link = (Link)i.next(); 113 if (link.getDestination().toString().equals(this.LINK_TO_FIND)) { 114 foundLinkToHewlettFoundation = true; 115 break; 116 } 117 } 118 assertTrue("Did not find gif url", foundLinkToHewlettFoundation); 119 } 120 121 private CrawlURI setupCrawlURI(HttpRecorder rec, String url) 122 throws URIException { 123 CrawlURI curi = new CrawlURI(UURIFactory.getInstance(url)); 124 curi.setContentSize(this.recorder.getRecordedInput().getSize()); 125 curi.setContentType("text/html"); 126 curi.setFetchStatus(200); 127 curi.setHttpRecorder(rec); 128 curi.putObject(CoreAttributeConstants.A_HTTP_TRANSACTION, 130 new Object ()); 131 return curi; 132 } 133 134 135 140 public void testFormsLink() throws URIException { 141 CrawlURI curi = 142 new CrawlURI(UURIFactory.getInstance("http://www.example.org")); 143 CharSequence cs = 144 "<form name=\"testform\" method=\"POST\" action=\"redirect_me?form=true\"> " + 145 " <INPUT TYPE=CHECKBOX NAME=\"checked[]\" VALUE=\"1\" CHECKED> "+ 146 " <INPUT TYPE=CHECKBOX NAME=\"unchecked[]\" VALUE=\"1\"> " + 147 " <select name=\"selectBox\">" + 148 " <option value=\"selectedOption\" selected>option1</option>" + 149 " <option value=\"nonselectedOption\">option2</option>" + 150 " </select>" + 151 " <input type=\"submit\" name=\"test\" value=\"Go\">" + 152 "</form>"; 153 this.extractor.extract(curi,cs); 154 curi.getOutLinks(); 155 assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() { 156 public boolean evaluate(Object object) { 157 return ((Link) object).getDestination().toString().indexOf( 158 "/redirect_me?form=true&checked[]=1&unchecked[]=&selectBox=selectedOption&test=Go")>=0; 159 } 160 })); 161 } 162 } 163 | Popular Tags |