KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > filter > ContentTypeRegExpFilter


1 /* ContentTypeRegExpFilter.java
2  *
3  * Created on Sep 13, 2004
4  *
5  * Copyright (C) 2004 Tom Emerson.
6  *
7  * This file is part of the Heritrix web crawler (crawler.archive.org).
8  *
9  * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22  */

23 package org.archive.crawler.filter;
24
25 import org.archive.crawler.datamodel.CrawlURI;
26 import org.archive.util.TextUtils;
27
28 /**
29  * Compares the content-type of the passed CrawlURI to a regular expression.
30  *
31  * @author Tom Emerson
32  * @version $Date: 2007/01/13 01:31:17 $, $Revision: 1.5.2.1 $
33  * @deprecated As of release 1.10.0. To be replaced by an equivalent
34  * {@link DecideRule}.
35  */

36 public class ContentTypeRegExpFilter extends URIRegExpFilter {
37
38     private static final long serialVersionUID = 206378978342655106L;
39
40     private static final String JavaDoc DESCRIPTION = "ContentType regexp filter" +
41             "*Deprecated* To be replaced by an equivalent DecideRule. " +
42         "Cannot be used until after fetcher processors. Only then is the" +
43         " Content-Type known. A good place for this filter is at" +
44         " the writer step processing. If the content-type is null," +
45         " 301s usually have no content-type, the filter returns true.";
46
47     /**
48      * @param name Filter name.
49      */

50     public ContentTypeRegExpFilter(String JavaDoc name) {
51         super
52         (name, DESCRIPTION, "");
53     }
54
55     public ContentTypeRegExpFilter(String JavaDoc name, String JavaDoc regexp) {
56         super(name, DESCRIPTION, regexp);
57     }
58     
59     protected boolean innerAccepts(Object JavaDoc o) {
60         // FIXME: can o ever be anything but a CrawlURI?
61
if (!(o instanceof CrawlURI)) {
62             return false;
63         }
64         String JavaDoc content_type = ((CrawlURI)o).getContentType();
65         String JavaDoc regexp = getRegexp(o);
66         return (regexp == null)? false:
67             (content_type == null)? true:
68                 TextUtils.matches(getRegexp(o), content_type);
69     }
70 }
71
Popular Tags