1 23 package org.archive.crawler.extractor; 24 25 import java.io.File ; 26 import java.io.FileOutputStream ; 27 import java.io.IOException ; 28 import java.net.URL ; 29 import java.util.Collection ; 30 import java.util.Iterator ; 31 32 import javax.management.AttributeNotFoundException ; 33 import javax.management.InvalidAttributeValueException ; 34 import javax.management.MBeanException ; 35 import javax.management.ReflectionException ; 36 37 import org.apache.commons.collections.CollectionUtils; 38 import org.apache.commons.collections.Predicate; 39 import org.apache.commons.httpclient.URIException; 40 import org.archive.crawler.datamodel.CoreAttributeConstants; 41 import org.archive.crawler.datamodel.CrawlOrder; 42 import org.archive.crawler.datamodel.CrawlURI; 43 import org.archive.crawler.settings.MapType; 44 import org.archive.crawler.settings.SettingsHandler; 45 import org.archive.crawler.settings.XMLSettingsHandler; 46 import org.archive.net.UURI; 47 import org.archive.net.UURIFactory; 48 import org.archive.util.HttpRecorder; 49 import org.archive.util.TmpDirTestCase; 50 51 52 58 public class ExtractorHTMLTest 59 extends TmpDirTestCase 60 implements CoreAttributeConstants { 61 private final String ARCHIVE_DOT_ORG = "archive.org"; 62 private final String LINK_TO_FIND = "http://www.hewlett.org/"; 63 private HttpRecorder recorder = null; 64 private ExtractorHTML extractor = null; 65 66 protected ExtractorHTML createExtractor() 67 throws InvalidAttributeValueException , AttributeNotFoundException , 68 MBeanException , ReflectionException { 69 final String name = this.getClass().getName(); 76 SettingsHandler handler = new XMLSettingsHandler( 77 new File (getTmpDir(), name + ".order.xml")); 78 handler.initialize(); 79 return (ExtractorHTML)((MapType)handler.getOrder(). 80 getAttribute(CrawlOrder.ATTR_RULES)).addElement(handler. 81 getSettingsObject(null), new ExtractorHTML(name)); 82 } 83 84 protected void setUp() throws Exception { 85 super.setUp(); 86 this.extractor = createExtractor(); 87 final boolean USE_NET = false; 88 URL url = null; 89 if (USE_NET) { 90 url = new URL ("http://" + this.ARCHIVE_DOT_ORG); 91 } else { 92 File f = new File (getTmpDir(), this.ARCHIVE_DOT_ORG + ".html"); 93 url = new URL ("file://" + f.getAbsolutePath()); 94 FileOutputStream fos = new FileOutputStream (f); 95 fos.write(("<html><head><title>test</title><body>" + 96 "<a HREF=" + this.LINK_TO_FIND + ">Hewlett Foundation</a>" + 97 "</body></html>").getBytes()); 98 fos.flush(); 99 fos.close(); 100 } 101 this.recorder = HttpRecorder.wrapInputStreamWithHttpRecord(getTmpDir(), 102 this.getClass().getName(), url.openStream(), null); 103 } 104 105 108 protected void tearDown() throws Exception { 109 super.tearDown(); 110 } 111 112 public void testInnerProcess() throws IOException { 113 UURI uuri = UURIFactory.getInstance("http://" + this.ARCHIVE_DOT_ORG); 114 CrawlURI curi = setupCrawlURI(this.recorder, uuri.toString()); 115 this.extractor.innerProcess(curi); 116 Collection links = curi.getOutLinks(); 117 boolean foundLinkToHewlettFoundation = false; 118 for (Iterator i = links.iterator(); i.hasNext();) { 119 Link link = (Link)i.next(); 120 if (link.getDestination().toString().equals(this.LINK_TO_FIND)) { 121 foundLinkToHewlettFoundation = true; 122 break; 123 } 124 } 125 assertTrue("Did not find gif url", foundLinkToHewlettFoundation); 126 } 127 128 private CrawlURI setupCrawlURI(HttpRecorder rec, String url) 129 throws URIException { 130 CrawlURI curi = new CrawlURI(UURIFactory.getInstance(url)); 131 curi.setContentSize(this.recorder.getRecordedInput().getSize()); 132 curi.setContentType("text/html"); 133 curi.setFetchStatus(200); 134 curi.setHttpRecorder(rec); 135 curi.putObject(CoreAttributeConstants.A_HTTP_TRANSACTION, 137 new Object ()); 138 return curi; 139 } 140 141 151 public void testPageParse() 152 throws InvalidAttributeValueException , AttributeNotFoundException , 153 MBeanException , ReflectionException , IOException { 154 UURI uuri = null; 155 156 if (uuri != null) { 172 runExtractor(uuri); 173 } 174 } 175 176 protected UURI getUURI(String url) throws URIException { 177 url = (url.indexOf("://") > 0)? url: "file://" + url; 178 return UURIFactory.getInstance(url); 179 } 180 181 protected void runExtractor(UURI baseUURI) 182 throws InvalidAttributeValueException , AttributeNotFoundException , 183 MBeanException , ReflectionException , IOException { 184 runExtractor(baseUURI, null); 185 } 186 187 protected void runExtractor(UURI baseUURI, String encoding) 188 throws IOException , InvalidAttributeValueException , 189 AttributeNotFoundException , MBeanException , ReflectionException { 190 if (baseUURI == null) { 191 return; 192 } 193 this.extractor = createExtractor(); 194 URL url = new URL (baseUURI.toString()); 195 this.recorder = HttpRecorder. 196 wrapInputStreamWithHttpRecord(getTmpDir(), 197 this.getClass().getName(), url.openStream(), encoding); 198 CrawlURI curi = setupCrawlURI(this.recorder, url.toString()); 199 this.extractor.innerProcess(curi); 200 201 System.out.println("+" + this.extractor.report()); 202 int count = 0; 203 Collection links = curi.getOutLinks(); 204 System.out.println("+HTML Links (hopType="+Link.NAVLINK_HOP+"):"); 205 if (links != null) { 206 for (Iterator i = links.iterator(); i.hasNext();) { 207 Link link = (Link)i.next(); 208 if (link.getHopType()==Link.NAVLINK_HOP) { 209 count++; 210 System.out.println(link.getDestination()); 211 } 212 } 213 } 214 System.out.println("+HTML Embeds (hopType="+Link.EMBED_HOP+"):"); 215 if (links != null) { 216 for (Iterator i = links.iterator(); i.hasNext();) { 217 Link link = (Link)i.next(); 218 if (link.getHopType()==Link.EMBED_HOP) { 219 count++; 220 System.out.println(link.getDestination()); 221 } 222 } 223 } 224 System.out. 225 println("+HTML Speculative Embeds (hopType="+Link.SPECULATIVE_HOP+"):"); 226 if (links != null) { 227 for (Iterator i = links.iterator(); i.hasNext();) { 228 Link link = (Link)i.next(); 229 if (link.getHopType()==Link.SPECULATIVE_HOP) { 230 count++; 231 System.out.println(link.getDestination()); 232 } 233 } 234 } 235 System.out. 236 println("+HTML Other (all other hopTypes):"); 237 if (links != null) { 238 for (Iterator i = links.iterator(); i.hasNext();) { 239 Link link = (Link) i.next(); 240 if (link.getHopType() != Link.SPECULATIVE_HOP 241 && link.getHopType() != Link.NAVLINK_HOP 242 && link.getHopType() != Link.EMBED_HOP) { 243 count++; 244 System.out.println(link.getHopType() + " " 245 + link.getDestination()); 246 } 247 } 248 } 249 System.out.println("TOTAL URIS EXTRACTED: "+count); 250 } 251 252 258 public void testEmbedSrc() throws URIException { 259 CrawlURI curi= 260 new CrawlURI(UURIFactory.getInstance("http://www.example.org")); 261 CharSequence cs = "<embed SRC=\"/documents/prem/18/1/graphics/qtvr/" + 263 "hall.mov\" width=\"320\" height=\"212\" controller=\"true\" " + 264 "CORRECTION=\"FULL\" pluginspage=\"http://www.apple.com/" + 265 "quicktime/download/\" /> "; 266 this.extractor.extract(curi,cs); 267 assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() { 268 public boolean evaluate(Object object) { 269 return ((Link) object).getDestination().toString().indexOf( 270 "/documents/prem/18/1/graphics/qtvr/hall.mov")>=0; 271 } 272 })); 273 } 274 275 283 public void testHrefWhitespace() throws URIException { 284 CrawlURI curi = 285 new CrawlURI(UURIFactory.getInstance("http://www.carsound.dk")); 286 CharSequence cs = "<a HREF=\"http://www.carsound.dk\n\n\n" + 287 "\"\ntarget=\"_blank\">C.A.R. Sound\n\n\n\n</a>"; 288 this.extractor.extract(curi,cs); 289 curi.getOutLinks(); 290 assertTrue("Not stripping new lines", CollectionUtils.exists(curi 291 .getOutLinks(), new Predicate() { 292 public boolean evaluate(Object object) { 293 return ((Link) object).getDestination().toString().indexOf( 294 "http://www.carsound.dk/")>=0; 295 } 296 })); 297 } 298 299 public static void main(String [] args) throws Exception { 300 if (args.length != 1 && args.length != 2) { 301 System.err.println("Usage: " + ExtractorHTMLTest.class.getName() + 302 " URL|PATH [ENCODING]"); 303 System.exit(1); 304 } 305 ExtractorHTMLTest testCase = new ExtractorHTMLTest(); 306 testCase.setUp(); 307 try { 308 testCase.runExtractor(testCase.getUURI(args[0]), 309 (args.length == 2)? args[1]: null); 310 } finally { 311 testCase.tearDown(); 312 } 313 } 314 } 315 | Popular Tags |