// org.archive.crawler.frontier.DomainSensitiveFrontier -- Heritrix web crawler (crawler.archive.org)

/* DomainSensitiveFrontier
 *
 * $Id: DomainSensitiveFrontier.java,v 1.13.2.1 2007/01/13 01:31:23 stack-sf Exp $
 *
 * Created on 2004-may-06
 *
 * Copyright (C) 2004 Royal Library of Sweden.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.archive.crawler.frontier;

import java.io.IOException;
import java.util.Hashtable;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;
import javax.management.MBeanException;
import javax.management.ReflectionException;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.event.CrawlURIDispositionListener;
import org.archive.crawler.filter.OrFilter;
import org.archive.crawler.filter.URIRegExpFilter;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.prefetch.QuotaEnforcer;
import org.archive.crawler.scope.ClassicScope;
import org.archive.crawler.settings.CrawlerSettings;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
47 /**
48  * Behaves like {@link BdbFrontier} (i.e., a basic mostly breadth-first
49  * frontier), but with the addition that you can set the number of documents
50  * to download on a per site basis.
51  *
52  * Useful for case of frequent revisits of a site of frequent changes.
53  *
54  * <p>Choose the number of docs you want to download and specify
55  * the count in <code>max-docs</code>. If <code>count-per-host</code> is
56  * true, the default, then the crawler will download <code>max-docs</code>
57  * per host. If you create an override, the overridden <code>max-docs</code>
58  * count will be downloaded instead, whether it is higher or lower.
59  * <p>If <code>count-per-host</code> is false, then <code>max-docs</code>
60  * acts like the the crawl order <code>max-docs</code> and the crawler will
61  * download this total amount of docs only. Overrides will
62  * download <code>max-docs</code> total in the overridden domain.
63  *
64  * @author Oskar Grenholm <oskar dot grenholm at kb dot se>
65  * @deprecated As of release 1.10.0. Replaced by {@link BdbFrontier} and
66  * {@link QuotaEnforcer}.
67  */

68 public class DomainSensitiveFrontier extends BdbFrontier
69 implements CrawlURIDispositionListener {
70
71     private static final long serialVersionUID = -3330190056282726202L;
72
73     private static final Logger JavaDoc logger =
74         Logger.getLogger(DomainSensitiveFrontier.class.getName());
75     
76     public static final String JavaDoc ATTR_MAX_DOCS = "max-docs";
77     public static final String JavaDoc ATTR_COUNTER_MODE = "counter-mode";
78     public static final String JavaDoc COUNT_OVERRIDE = "count-per-override";
79     public static final String JavaDoc COUNT_HOST = "count-per-host";
80     public static final String JavaDoc COUNT_DOMAIN = "count-per-domain";
81     public static final String JavaDoc[] ATTR_AVAILABLE_MODES = new String JavaDoc[] {
82         COUNT_OVERRIDE, COUNT_HOST, COUNT_DOMAIN };
83     public static final String JavaDoc DEFAULT_MODE = COUNT_OVERRIDE;
84         
85     // TODO: Make this a BigMap.
86
private Hashtable JavaDoc<String JavaDoc,Long JavaDoc> hostCounters = new Hashtable JavaDoc<String JavaDoc,Long JavaDoc>();
87     private boolean countPerOverride = true;
88     private String JavaDoc counterMode;
89
90     public DomainSensitiveFrontier(String JavaDoc name) {
91         super(ATTR_NAME, "DomainSensitiveFrontier. *Deprecated* Use " +
92             "BdbFrontier+QuotaEnforcer instead. " +
93             "Overrides BdbFrontier to add specification of number of " +
94             "documents to download (Expects 'exclude-filter' " +
95             "to be part of CrawlScope).");
96         Type e = addElementToDefinition(new SimpleType(ATTR_MAX_DOCS,
97             "Maximum number of documents to download for host or domain" +
98             " (Zero means no limit).", new Long JavaDoc(0)));
99         e.setOverrideable(true);
100         e = addElementToDefinition(new SimpleType(ATTR_COUNTER_MODE,
101                "If " + COUNT_OVERRIDE + ", acts like the crawl " +
102                "order maximum download count and the crawler will download " +
103                "this total amount of docs only. Override to change the max " +
104                "count for the overridden domain or host. " +
105                "Else if " + COUNT_HOST + " the crawler will download " +
106                ATTR_MAX_DOCS + " per host. Add an override to change " +
107                "max count on a per-domain or a per-host basis.For " +
108                "example, if you set " + ATTR_MAX_DOCS + " to 30 in " +
109                "this mode, the crawler will download 30 docs from " +
110                "each host in scope. If you override for kb.se setting " +
111                ATTR_MAX_DOCS +
112                " to 20, it will instead download only 20 docs from each " +
113                "host of kb.se. (It can be a larger as well as a smaller " +
114                "value here.). " +
115                "Finally " + COUNT_DOMAIN + " behaves similar to " +
116                COUNT_HOST +
117                ", but instead sets max on a per-domain basis." +
118                "Here you can do overrides on the domain-level, but " +
119                "not on the host-level. So if you here set " +
120                ATTR_MAX_DOCS +
121                " to 30 the crawler will download 30 docs from each " +
122                "domain in scope. If you override for kb.se setting " +
123                ATTR_MAX_DOCS + " to 20, it will instead download only " +
124                "20 docs in total from the whole kb.se domain. (It can be " +
125                "a larger as well as a smaller value here.)",
126                DEFAULT_MODE, ATTR_AVAILABLE_MODES));
127          e.setOverrideable(false);
128     }
129
130     public void initialize(CrawlController c)
131     throws FatalConfigurationException, IOException JavaDoc {
132         super.initialize(c);
133         this.controller.addCrawlURIDispositionListener(this);
134         try {
135             counterMode = ((String JavaDoc)getAttribute(ATTR_COUNTER_MODE));
136             if(counterMode.equalsIgnoreCase(COUNT_DOMAIN) ||
137                     counterMode.equalsIgnoreCase(COUNT_HOST))
138                 countPerOverride = false;
139             else
140                 countPerOverride = true;
141         } catch (AttributeNotFoundException JavaDoc e) {
142             e.printStackTrace();
143         } catch (MBeanException JavaDoc e) {
144             e.printStackTrace();
145         } catch (ReflectionException JavaDoc e) {
146             e.printStackTrace();
147         }
148     }
149     
150     /**
151      * Check if the max document download limit for this host or domain has
152      * been reached.
153      *
154      * If so, delete the rest of the URIs for this host or domain waiting in
155      * the queue. Then add an URIRegExpFilter for this host or domain, so
156      * we won't get any more URIs from this one later on.
157      * @param curi CrawlURI.
158      * @return True if discarded queue.
159      */

160     private synchronized boolean checkDownloadLimits(CrawlURI curi) {
161         long thisMaxDocs = 0;
162         long thisCounter = 0;
163         boolean discarded = false;
164         boolean retVal = false;
165         if (curi.getUURI().getScheme().equals("dns")) {
166             return false;
167         }
168         try {
169             String JavaDoc host = curi.getUURI().getHost();
170             CrawlerSettings cs = controller.getSettingsHandler().
171                 getSettings(host);
172             do {
173                 String JavaDoc scope;
174                 if(counterMode.equalsIgnoreCase(COUNT_OVERRIDE))
175                     scope = cs.getScope() != null ? cs.getScope() : "root";
176                 else if(counterMode.equalsIgnoreCase(COUNT_HOST))
177                     scope = host;
178                 else{ //Get domain part of host
179
int i = host.lastIndexOf(".");
180                     i = host.lastIndexOf(".", i-1);
181                     scope = host.substring(i+1, host.length());
182                 }
183                 thisMaxDocs =
184                     ((Long JavaDoc) getAttribute(cs, ATTR_MAX_DOCS)).longValue();
185                 thisCounter = this.hostCounters.get(scope) != null ?
186                     ((Long JavaDoc) this.hostCounters.get(scope)).longValue(): 0;
187                 // Have we hit the max document download limit for this host
188
// or domain?
189
if ((thisMaxDocs > 0 && thisCounter >= thisMaxDocs)) {
190                     logger.fine("Discarding Queue: " + host + " ");
191                     curi.addAnnotation("dsfLimit");
192                    if (!discarded) {
193                         long count = 0;
194                         WorkQueue wq = getQueueFor(curi);
195                         wq.unpeek();
196                         count += wq.deleteMatching(this, ".*");
197                         decrementQueuedCount(count);
198                         discarded = true;
199                         // I tried adding annotation but we're past log time
200
// for Curi so it doesn't work.
201
// curi.addAnnotation("maxDocsForHost");
202
}
203                     // Adding an exclude filter for this host or domain
204
OrFilter or = (OrFilter) this.controller.getScope()
205                             .getAttribute(ClassicScope.ATTR_EXCLUDE_FILTER);
206                     // If we have hit max for root, block everything. Else
207
// just the scope.
208
String JavaDoc filter = scope.equalsIgnoreCase("root") ?
209                         ".*" : "^((https?://)?[a-zA-Z0-9\\.]*)" + scope +
210                             "($|/.*)";
211                     logger.fine("Adding filter: [" + filter + "].");
212                     URIRegExpFilter urf =
213                         new URIRegExpFilter(curi.toString(), filter);
214                     or.addFilter(this.controller.getSettingsHandler().
215                         getSettings(null), urf);
216                     thisMaxDocs = 0;
217                     thisCounter = 0;
218                     retVal = true;
219                 }
220             } while ((cs = cs.getParent()) != null && countPerOverride);
221         } catch (Exception JavaDoc e) {
222             logger.severe("ERROR: checkDownloadLimits(), "
223                     + "while processing {" + curi.toString() + "}"
224                     + e.getClass()
225                     + "message: " + e.getMessage() + ". Stack trace:");
226             e.printStackTrace();
227         }
228         return retVal;
229     }
230     
231     protected synchronized void incrementHostCounters(CrawlURI curi) {
232         if (!curi.getUURI().toString().startsWith("dns:")) {
233             try {
234                 String JavaDoc host = curi.getUURI().getHost();
235                 CrawlerSettings cs =
236                     controller.getSettingsHandler().getSettings(host);
237                 do {
238                     String JavaDoc scope;
239                     if(counterMode.equalsIgnoreCase(COUNT_OVERRIDE))
240                         scope = cs.getScope() != null? cs.getScope() : "root";
241                     else if(counterMode.equalsIgnoreCase(COUNT_HOST))
242                         scope = host;
243                     else{ //Get only domain part of host
244
int i = host.lastIndexOf(".");
245                         i = host.lastIndexOf(".", i-1);
246                         scope = host.substring(i+1, host.length());
247                     }
248                     long counter = this.hostCounters.get(scope) != null ?
249                         ((Long JavaDoc)this.hostCounters.get(scope)).longValue(): 0;
250                     this.hostCounters.put(scope, new Long JavaDoc(++counter));
251                 } while ((cs = cs.getParent()) != null && countPerOverride);
252             } catch (Exception JavaDoc e) {
253                 logger.severe("ERROR: incrementHostCounters() " +
254                     e.getMessage());
255             }
256         }
257     }
258     
259     public void crawledURISuccessful(CrawlURI curi) {
260         incrementHostCounters(curi);
261         checkDownloadLimits(curi);
262     }
263
264     public void crawledURINeedRetry(CrawlURI curi) {
265     }
266
267     public void crawledURIDisregard(CrawlURI curi) {
268     }
269
270     public void crawledURIFailure(CrawlURI curi) {
271     }
272 }
273