1 26 27 package org.archive.crawler.filter; 28 29 import java.util.logging.Level ; 30 import java.util.logging.Logger ; 31 32 import javax.management.AttributeNotFoundException ; 33 34 import org.archive.crawler.datamodel.CrawlURI; 35 import org.archive.crawler.settings.ComplexType; 36 import org.archive.crawler.settings.MapType; 37 import org.archive.crawler.settings.SimpleType; 38 39 47 public class FilePatternFilter extends URIRegExpFilter { 48 49 private static final long serialVersionUID = -4019256104085004651L; 50 51 private static final Logger logger = 52 Logger.getLogger(FilePatternFilter.class.getName()); 53 public static final String ATTR_USE_DEFAULT = "use-default-patterns"; 54 public static final String IMAGES_PATTERNS = ".*(?i)(\\.(bmp|gif|jpe?g" + 55 "|png|tiff?))$"; 56 public static final String AUDIO_PATTERNS = ".*(?i)(\\.(mid|mp2|mp3|mp4" + 57 "|wav))$"; 58 public static final String VIDEO_PATTERNS = ".*(?i)(\\.(avi|mov|mpeg|ram" + 59 "|rm|smil|wmv))$"; 60 public static final String MISC_PATTERNS = ".*(?i)(\\.(doc|pdf|ppt|swf))$"; 61 public static final String ALL_DEFAULT_PATTERNS = ".*(?i)(\\.(bmp|gif" + 62 "|jpe?g|png|tiff?|mid|mp2|mp3|mp4|wav|avi|mov|mpeg|ram|rm|smil|wmv" + 63 "|doc|pdf|ppt|swf))$"; 64 65 public static final String ALL = "All"; 66 public static final String IMAGES = "Images"; 67 public static final String AUDIO = "Audio"; 68 public static final String VIDEO = "Video"; 69 public static final String MISC = "Miscellaneous"; 70 public static final String CUSTOM = "Custom"; 71 72 75 public FilePatternFilter(String name) { 76 super(name); 77 setDescription("A URI path suffix filter *Deprecated* Use" + 78 "DecidingFilter and MatchesFilePatternDecideRule instead. " + 79 "All URLs that end with the specified pattern(s) will be added " + 80 "to the scope's focus. Default file patterns are:\n.avi, .bmp, " + 81 ".doc, .gif, .jp(e)g, .mid, .mov, .mp2, .mp3, .mp4, .mpeg, " + 82 ".pdf, .png, .ppt, .ram, .rm,.smil, .swf, .tif(f), .wav, .wmv\n" + 83 "It is also possible to specifiy custom regular expressions " + 84 "for this filter, turning it into (effectively) a generic " + 85 "regular expression filter."); 86 87 String [] options = new String [] {ALL, IMAGES, AUDIO, VIDEO, MISC, 88 CUSTOM}; 89 90 addElementToDefinition( 91 new SimpleType(ATTR_USE_DEFAULT, "URLs that match selected file " + 92 "patterns will be crawled. Default file patterns are:\n" + 93 "Images: .bmp, .gif, .jp(e)g, .png, .tif(f)\nAudio: .mid, " + 94 ".mp2, .mp3, .mp4, .wav\nVideo: .avi, .mov, .mpeg, .ram, " + 95 ".rm, .smil, .wmv\nMiscellaneous: .doc, .pdf, .ppt, .swf\n" + 96 "All: All above patterns\nChoose 'Custom' to specify your own" + 97 " pattern. These default patterns are case insensitive.", 98 "All", options)); 99 100 addElementToDefinition( 101 new SimpleType(ATTR_REGEXP, "Custom java regular expression.+n " + 102 "This regular expression will be used instead of the " + 103 "supplied pattern groups for matching.\nAn example " + 104 "of such a regular expression (Miscellaneous):\n" + 105 ".*(?i)(\\.(doc|pdf|ppt|swf))$\n" + 106 "Any arbitrary reg.expr. is valid though and will be " + 107 "applied to the URI.", "")); 108 109 110 } 111 112 115 protected String getRegexp(Object o) { 116 try { 117 String patternType = (String )getAttribute(o, ATTR_USE_DEFAULT); 118 119 if (patternType.equals(ALL)) { 120 return ALL_DEFAULT_PATTERNS; 121 } else if (patternType.equals(IMAGES)) { 122 return IMAGES_PATTERNS; 123 }else if (patternType.equals(AUDIO)) { 124 return AUDIO_PATTERNS; 125 }else if(patternType.equals(VIDEO)) { 126 return VIDEO_PATTERNS; 127 }else if(patternType.equals(MISC)) { 128 return MISC_PATTERNS; 129 }else if(patternType.equals(CUSTOM)) { 130 return (String ) getAttribute(o, ATTR_REGEXP); 131 }else { 132 assert false : "Unrecognized pattern type " + patternType + 133 ". Should never happened!"; 134 } 135 136 } catch (AttributeNotFoundException e) { 137 logger.log(Level.SEVERE,"necessary setting missing",e); 138 } 139 return null; 142 } 143 144 147 public boolean accepts(Object o) { 148 CrawlURI curi = (o instanceof CrawlURI) ? (CrawlURI) o : null; 149 150 try { 155 if (!((Boolean ) getAttribute(ATTR_ENABLED, curi)).booleanValue()) { 156 return false; 157 } 158 } catch (AttributeNotFoundException e) { 159 logger.severe(e.getMessage()); 160 } 161 162 boolean accept = returnTrueIfMatches(curi) == innerAccepts(o); 163 164 if (accept && logger.isLoggable(Level.FINEST)) { 165 ComplexType p = this.getParent(); 167 if (p instanceof MapType) { 168 p = p.getParent(); 169 } 170 String msg = this.toString() + " belonging to " + p.toString() 171 + " accepted " + o.toString(); 172 logger.finest(msg); 173 } 174 175 return accept; 176 } 177 178 } 179 | Popular Tags |