1 23 package org.archive.crawler.filter; 24 25 import org.archive.crawler.datamodel.CrawlURI; 26 import org.archive.util.TextUtils; 27 28 36 public class ContentTypeRegExpFilter extends URIRegExpFilter { 37 38 private static final long serialVersionUID = 206378978342655106L; 39 40 private static final String DESCRIPTION = "ContentType regexp filter" + 41 "*Deprecated* To be replaced by an equivalent DecideRule. " + 42 "Cannot be used until after fetcher processors. Only then is the" + 43 " Content-Type known. A good place for this filter is at" + 44 " the writer step processing. If the content-type is null," + 45 " 301s usually have no content-type, the filter returns true."; 46 47 50 public ContentTypeRegExpFilter(String name) { 51 super 52 (name, DESCRIPTION, ""); 53 } 54 55 public ContentTypeRegExpFilter(String name, String regexp) { 56 super(name, DESCRIPTION, regexp); 57 } 58 59 protected boolean innerAccepts(Object o) { 60 if (!(o instanceof CrawlURI)) { 62 return false; 63 } 64 String content_type = ((CrawlURI)o).getContentType(); 65 String regexp = getRegexp(o); 66 return (regexp == null)? false: 67 (content_type == null)? true: 68 TextUtils.matches(getRegexp(o), content_type); 69 } 70 } 71 | Popular Tags |