package org.archive.crawler.frontier;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Serializable;
import java.io.StringWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.apache.commons.httpclient.HttpStatus;
import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlServer;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.datamodel.UriUniqFilter;
import org.archive.crawler.datamodel.UriUniqFilter.HasUriReceiver;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.Frontier;
import org.archive.crawler.framework.FrontierMarker;
import org.archive.crawler.framework.exceptions.EndedException;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;
import org.archive.crawler.settings.ModuleType;
import org.archive.crawler.settings.RegularExpressionConstraint;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.crawler.url.Canonicalizer;
import org.archive.crawler.util.BdbUriUniqFilter;
import org.archive.net.UURI;
import org.archive.queue.MemQueue;
import org.archive.queue.Queue;
import org.archive.util.ArchiveUtils;

/**
 * An experimental Frontier that repeatedly revisits every URI it encounters.
 * <p>
 * URIs are kept in per-host queues ({@link AdaptiveRevisitHostQueue}); the
 * wait time between visits of a URI is determined by separate WaitEvaluator
 * processors, which store it on the CrawlURI under
 * {@code A_WAIT_INTERVAL}. Politeness (snooze) between fetches to the same
 * host is governed by the delay-factor/min-delay/max-delay settings, mirroring
 * the classic frontier.
 * <p>
 * Thread-safety: the mutating entry points ({@code next()},
 * {@code finished()}, {@code initialize()}, pause/unpause/terminate) are
 * synchronized on this instance, and {@code next()} blocks via
 * {@code wait()} until a queue becomes ready or the frontier is unpaused.
 */
public class AdaptiveRevisitFrontier extends ModuleType
implements Frontier, FetchStatusCodes, CoreAttributeConstants,
        AdaptiveRevisitAttributeConstants, CrawlStatusListener,
        HasUriReceiver {

    private static final long serialVersionUID = -8666872690438543671L;

    private static final Logger logger =
        Logger.getLogger(AdaptiveRevisitFrontier.class.getName());

    /** How many multiples of last fetch elapsed time to wait before
     * recontacting the same server. */
    public final static String ATTR_DELAY_FACTOR = "delay-factor";
    private final static Float DEFAULT_DELAY_FACTOR = new Float(5);

    /** Lower bound (ms) on the politeness snooze between fetches. */
    public final static String ATTR_MIN_DELAY = "min-delay-ms";
    private final static Integer DEFAULT_MIN_DELAY = new Integer(2000);

    /** Upper bound (ms) on the politeness snooze between fetches. */
    public final static String ATTR_MAX_DELAY = "max-delay-ms";
    private final static Integer DEFAULT_MAX_DELAY = new Integer(30000);

    /** Maximum fetch attempts before a URI is given up on. */
    public final static String ATTR_MAX_RETRIES = "max-retries";
    private final static Integer DEFAULT_MAX_RETRIES = new Integer(30);

    /** Default wait (seconds) before retrying a failed fetch. */
    public final static String ATTR_RETRY_DELAY = "retry-delay-seconds";
    private final static Long DEFAULT_RETRY_DELAY = new Long(900);

    /** Maximum simultaneous requests to a single host. */
    public final static String ATTR_HOST_VALENCE = "host-valence";
    private final static Integer DEFAULT_HOST_VALENCE = new Integer(1);

    /** Number of embedded/redirect hops that get preferred scheduling. */
    public final static String ATTR_PREFERENCE_EMBED_HOPS =
        "preference-embed-hops";
    private final static Integer DEFAULT_PREFERENCE_EMBED_HOPS =
        new Integer(0);

    /** Per-domain/host override forcing URIs into a named queue. */
    public final static String ATTR_FORCE_QUEUE = "force-queue-assignment";
    protected final static String DEFAULT_FORCE_QUEUE = "";
    /** Pattern a forced queue name must match (word chars, period, dash,
     * comma, colon). */
    protected final static String ACCEPTABLE_FORCE_QUEUE = "[-\\w\\.,:]*";

    /** If true, x.com, www.x.com and www[0-9]+.x.com share one queue. */
    public final static String ATTR_QUEUE_IGNORE_WWW = "queue-ignore-www";
    protected final static Boolean DEFAULT_QUEUE_IGNORE_WWW =
        new Boolean(false);

    /** If true, a UriUniqFilter is used for duplicate elimination (required
     * for canonicalization rules to take effect). */
    public final static String ATTR_USE_URI_UNIQ_FILTER =
        "use-uri-uniq-filter";
    protected final static Boolean DEFAULT_USE_URI_UNIQ_FILTER =
        new Boolean(false);

    private CrawlController controller;

    /** All per-host queues, ordered by next-ready time. */
    private AdaptiveRevisitQueueList hostQueues;

    /** Duplicate filter; null when use-uri-uniq-filter is false. */
    private UriUniqFilter alreadyIncluded;

    /** Per-thread staging queue for batchSchedule/batchFlush. */
    private ThreadLocalQueue threadWaiting = new ThreadLocalQueue();

    /** Maps a CandidateURI to its queue (class) key. */
    private QueueAssignmentPolicy queueAssignmentPolicy = null;

    // Crawl statistics. Guarded by the synchronized disposition path.
    private long succeededFetchCount = 0;
    private long failedFetchCount = 0;
    private long disregardedUriCount = 0;
    private long totalProcessedBytes = 0;

    // Pause/terminate flags, checked inside next().
    private boolean shouldPause = false;
    private boolean shouldTerminate = false;

    public AdaptiveRevisitFrontier(String name) {
        this(name, "AdaptiveRevisitFrontier. EXPERIMENTAL Frontier that " +
                "will repeatedly visit all " +
                "encountered URIs. Wait time between visits is configurable" +
                " and is determined by seperate Processor(s). See " +
                "WaitEvaluators " +
                "See documentation for ARFrontier limitations.");
    }

    public AdaptiveRevisitFrontier(String name, String description) {
        super(Frontier.ATTR_NAME, description);
        addElementToDefinition(new SimpleType(ATTR_DELAY_FACTOR,
            "How many multiples of last fetch elapsed time to wait before " +
            "recontacting same server", DEFAULT_DELAY_FACTOR));
        addElementToDefinition(new SimpleType(ATTR_MAX_DELAY,
            "Never wait more than this long, regardless of multiple",
            DEFAULT_MAX_DELAY));
        addElementToDefinition(new SimpleType(ATTR_MIN_DELAY,
            "Always wait this long after one completion before recontacting " +
            "same server, regardless of multiple", DEFAULT_MIN_DELAY));
        addElementToDefinition(new SimpleType(ATTR_MAX_RETRIES,
            "How often to retry fetching a URI that failed to be retrieved.\n" +
            "If zero, the crawler will get the robots.txt only.",
            DEFAULT_MAX_RETRIES));
        addElementToDefinition(new SimpleType(ATTR_RETRY_DELAY,
            "How long to wait by default until we retry fetching a" +
            " URI that failed to be retrieved (seconds). ",
            DEFAULT_RETRY_DELAY));
        addElementToDefinition(new SimpleType(ATTR_PREFERENCE_EMBED_HOPS,
            "Number of embedded (or redirected) hops up to which " +
            "a URI has higher priority scheduling. For example, if set " +
            "to 1 (the default), items such as inline images (1-hop " +
            "embedded resources) will be scheduled ahead of all regular " +
            "links (or many-hop resources, like nested frames). If set to " +
            "zero, no preferencing will occur, and embeds/redirects are " +
            "scheduled the same as regular links.",
            DEFAULT_PREFERENCE_EMBED_HOPS));
        Type t;
        t = addElementToDefinition(new SimpleType(ATTR_HOST_VALENCE,
            "Maximum number of simultaneous requests to a single" +
            " host.",
            DEFAULT_HOST_VALENCE));
        t.setExpertSetting(true);
        t = addElementToDefinition(new SimpleType(ATTR_QUEUE_IGNORE_WWW,
            "If true then documents from x.com, www.x.com and any " +
            "www[0-9]+.x.com will be assigned to the same queue.",
            DEFAULT_QUEUE_IGNORE_WWW));
        t.setExpertSetting(true);
        t = addElementToDefinition(new SimpleType(
            ATTR_FORCE_QUEUE,
            "The queue name into which to force URIs. Should "
            + "be left blank at global level. Specify a "
            + "per-domain/per-host override to force URIs into "
            + "a particular named queue, regardless of the assignment "
            + "policy in effect (domain or ip-based politeness). "
            + "This could be used on domains known to all be from "
            + "the same small set of IPs (eg blogspot, dailykos, etc.) "
            + "to simulate IP-based politeness, or it could be used if "
            + "you wanted to enforce politeness over a whole domain, even "
            + "though the subdomains are split across many IPs.",
            DEFAULT_FORCE_QUEUE));
        t.setOverrideable(true);
        t.setExpertSetting(true);
        t.addConstraint(new RegularExpressionConstraint(
            ACCEPTABLE_FORCE_QUEUE,
            Level.WARNING, "This field must contain only alphanumeric "
            + "characters plus period, dash, comma, colon, or underscore."));
        t = addElementToDefinition(new SimpleType(ATTR_USE_URI_UNIQ_FILTER,
            "If true then the Frontier will use a seperate " +
            "datastructure to detect and eliminate duplicates.\n" +
            "This is required for Canonicalization rules to work.",
            DEFAULT_USE_URI_UNIQ_FILTER));
        t.setExpertSetting(true);
        t.setOverrideable(false);

        // These alist keys must survive CrawlURI persistence between visits.
        CrawlURI.addAlistPersistentMember(A_CONTENT_STATE_KEY);
        CrawlURI.addAlistPersistentMember(A_TIME_OF_NEXT_PROCESSING);
    }

    /**
     * Wires this frontier to the controller, creates the host-queue list and
     * (optionally) the duplicate filter, then schedules all seeds.
     *
     * @param c the crawl controller
     * @throws FatalConfigurationException on configuration failure
     * @throws IOException on errors creating the BDB-backed structures
     */
    public synchronized void initialize(CrawlController c)
            throws FatalConfigurationException, IOException {
        controller = c;
        controller.addCrawlStatusListener(this);

        queueAssignmentPolicy = new HostnameQueueAssignmentPolicy();

        hostQueues = new AdaptiveRevisitQueueList(c.getBdbEnvironment(),
            c.getClassCatalog());

        if (((Boolean)getUncheckedAttribute(
                null, ATTR_USE_URI_UNIQ_FILTER)).booleanValue()) {
            alreadyIncluded = createAlreadyIncluded();
        } else {
            alreadyIncluded = null;
        }

        loadSeeds();
    }

    /**
     * Creates the BDB-backed duplicate filter, with this frontier as the
     * destination for URIs that pass it.
     *
     * @return the configured UriUniqFilter
     * @throws IOException on errors opening the BDB environment
     */
    protected UriUniqFilter createAlreadyIncluded() throws IOException {
        UriUniqFilter uuf = new BdbUriUniqFilter(
            this.controller.getBdbEnvironment());
        uuf.setDestination(this);
        return uuf;
    }

    /**
     * Schedules every seed from the scope at MEDIUM priority and flushes the
     * batch. Items the scope ignored are persisted via
     * {@link AbstractFrontier#saveIgnoredItems(String, File)}.
     */
    public void loadSeeds() {
        Writer ignoredWriter = new StringWriter();
        Iterator iter =
            this.controller.getScope().seedsIterator(ignoredWriter);
        while (iter.hasNext()) {
            CandidateURI caUri =
                CandidateURI.createSeedCandidateURI((UURI)iter.next());
            caUri.setSchedulingDirective(CandidateURI.MEDIUM);
            schedule(caUri);
        }
        batchFlush();
        AbstractFrontier.saveIgnoredItems(
            ignoredWriter.toString(),
            controller.getDisk());
    }

    /**
     * Resolves the queue (class) key for a URI: the forced queue name if one
     * is configured, otherwise the assignment policy's key, optionally with a
     * leading "www[0-9]*." stripped.
     *
     * @param cauri the URI being classified
     * @return the queue key, never null
     */
    public String getClassKey(CandidateURI cauri) {
        String queueKey = (String)getUncheckedAttribute(cauri,
            ATTR_FORCE_QUEUE);
        if ("".equals(queueKey)) {
            // No forced queue; derive from policy (typically hostname).
            queueKey =
                queueAssignmentPolicy.getClassKey(controller, cauri);
            if (((Boolean)getUncheckedAttribute(
                    cauri, ATTR_QUEUE_IGNORE_WWW)).booleanValue()) {
                queueKey = queueKey.replaceAll("^www[0-9]{0,}\\.", "");
            }
        }
        return queueKey;
    }

    /**
     * Canonicalizes a UURI per the crawl order's rules.
     *
     * @param uuri the URI to canonicalize
     * @return the canonical string form
     */
    protected String canonicalize(UURI uuri) {
        return Canonicalizer.canonicalize(uuri, this.controller.getOrder());
    }

    /**
     * Canonicalizes a CandidateURI. If the URI is a redirect whose canonical
     * form collapses onto its via (but whose raw form differs), forceFetch is
     * set so the alreadyIncluded filter does not suppress it.
     *
     * @param cauri the candidate to canonicalize
     * @return the canonical string form
     */
    protected String canonicalize(CandidateURI cauri) {
        String canon = canonicalize(cauri.getUURI());
        if (cauri.isLocation()) {
            if (!cauri.toString().equals(cauri.getVia().toString()) &&
                    canonicalize(cauri.getVia()).equals(canon)) {
                cauri.setForceFetch(true);
            }
        }
        return canon;
    }

    /**
     * Places a URI on its host queue: wraps it as a CrawlURI if needed,
     * assigns a class key, re-adds rediscovered seeds to the scope, applies
     * embed-hop preferencing, and stamps the time of next processing as now.
     *
     * @param caUri the URI to schedule
     */
    protected void innerSchedule(CandidateURI caUri) {
        CrawlURI curi;
        if (caUri instanceof CrawlURI) {
            curi = (CrawlURI)caUri;
        } else {
            curi = CrawlURI.from(caUri, System.currentTimeMillis());
            curi.putLong(A_TIME_OF_NEXT_PROCESSING,
                System.currentTimeMillis());
        }

        if (curi.getClassKey() == null) {
            curi.setClassKey(getClassKey(curi));
        }

        // A seed rediscovered via a link/redirect: register it with the
        // scope and give it MEDIUM priority.
        if (curi.isSeed() && curi.getVia() != null
                && curi.flattenVia().length() > 0) {
            this.controller.getScope().addSeed(curi);
            curi.setSchedulingDirective(CandidateURI.MEDIUM);
        }

        int prefHops = ((Integer)getUncheckedAttribute(curi,
            ATTR_PREFERENCE_EMBED_HOPS)).intValue();
        boolean prefEmbed = false;
        if (prefHops > 0) {
            int embedHops = curi.getTransHops();
            if (embedHops > 0 && embedHops <= prefHops
                    && curi.getSchedulingDirective() == CandidateURI.NORMAL) {
                curi.setSchedulingDirective(CandidateURI.MEDIUM);
                prefEmbed = true;
            }
        }

        // Newly scheduled URIs are due for processing immediately.
        curi.putLong(A_TIME_OF_NEXT_PROCESSING,
            System.currentTimeMillis());

        try {
            logger.finest("scheduling " + curi.toString());
            AdaptiveRevisitHostQueue hq = getHQ(curi);
            hq.add(curi, prefEmbed);
        } catch (IOException e) {
            logger.log(Level.SEVERE,
                "IOException scheduling " + curi.toString(), e);
        }
    }

    /**
     * Returns the host queue for a URI's class key, creating it (with the
     * configured valence) if it does not yet exist.
     *
     * @param curi the URI whose queue is wanted
     * @return the (possibly new) host queue
     * @throws IOException on errors creating the queue
     */
    protected AdaptiveRevisitHostQueue getHQ(CrawlURI curi)
            throws IOException {
        AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());
        if (hq == null) {
            int valence = DEFAULT_HOST_VALENCE.intValue();
            try {
                valence = ((Integer)getAttribute(
                    curi, ATTR_HOST_VALENCE)).intValue();
            } catch (AttributeNotFoundException e2) {
                logger.severe("Unable to load valence.");
            }
            hq = hostQueues.createHQ(curi.getClassKey(), valence);
        }
        return hq;
    }

    /** Stages a URI on the calling thread's local batch queue. */
    protected void batchSchedule(CandidateURI caUri) {
        threadWaiting.getQueue().enqueue(caUri);
    }

    /** Flushes the calling thread's staged URIs into the frontier. */
    protected void batchFlush() {
        innerBatchFlush();
    }

    /**
     * Drains the thread-local queue. With a duplicate filter active, URIs
     * pass through it (forceFetch bypasses the dup check); otherwise they go
     * straight to the host queues.
     */
    private void innerBatchFlush() {
        Queue q = threadWaiting.getQueue();
        while (!q.isEmpty()) {
            CandidateURI caUri = (CandidateURI)q.dequeue();
            if (alreadyIncluded != null) {
                String cannon = canonicalize(caUri);
                logger.finest("Cannon of " + caUri + " is " + cannon);
                if (caUri.forceFetch()) {
                    alreadyIncluded.addForce(cannon, caUri);
                } else {
                    alreadyIncluded.add(cannon, caUri);
                }
            } else {
                innerSchedule(caUri);
            }
        }
    }

    /**
     * @param curi the URI whose server is wanted
     * @return the CrawlServer for the URI, from the server cache
     */
    protected CrawlServer getServer(CrawlURI curi) {
        return this.controller.getServerCache().getServerFor(curi);
    }

    /**
     * Hands the next ready URI to a worker thread, blocking while paused or
     * until the top host queue becomes ready.
     *
     * @return the next CrawlURI, or null if an IOException prevented issuing
     * @throws InterruptedException if the wait is interrupted
     * @throws EndedException if the frontier has been terminated
     */
    public synchronized CrawlURI next()
            throws InterruptedException, EndedException {
        controller.checkFinish();

        while (shouldPause) {
            controller.toePaused();
            wait();
        }

        if (shouldTerminate) {
            throw new EndedException("terminated");
        }

        AdaptiveRevisitHostQueue hq = hostQueues.getTopHQ();

        while (hq.getState() != AdaptiveRevisitHostQueue.HQSTATE_READY) {
            // Snooze until the earliest queue is due, then re-check: another
            // thread's activity may have changed which queue is on top.
            long waitTime = hq.getNextReadyTime() - System.currentTimeMillis();
            if (waitTime > 0) {
                wait(waitTime);
            }
            hq = hostQueues.getTopHQ();
        }

        if (shouldTerminate) {
            throw new EndedException("terminated");
        }

        try {
            CrawlURI curi = hq.next();
            logger.fine("Issuing " + curi.toString());
            long temp = curi.getLong(A_TIME_OF_NEXT_PROCESSING);
            long currT = System.currentTimeMillis();
            long overdue = (currT - temp);
            if (logger.isLoggable(Level.FINER)) {
                String waitI = "not set";
                if (curi.containsKey(A_WAIT_INTERVAL)) {
                    waitI = ArchiveUtils.formatMillisecondsToConventional(
                        curi.getLong(A_WAIT_INTERVAL));
                }
                logger.finer("Wait interval: " + waitI +
                    ", Time of next proc: " + temp +
                    ", Current time: " + currT +
                    ", Overdue by: " + overdue + "ms");
            }
            if (overdue < 0) {
                // Should not happen: queue released a URI before its time.
                logger.severe("Time overdue for " + curi.toString() +
                    " is negative (" + overdue + ")!");
            }
            curi.putLong(A_FETCH_OVERDUE, overdue);
            return curi;
        } catch (IOException e) {
            logger.log(Level.SEVERE,
                "IOException getting next URI from host queue", e);
        }

        return null;
    }

    /**
     * @return true if no URIs are queued (this frontier never truly empties
     * while URIs exist, since finished URIs are re-queued)
     */
    public boolean isEmpty() {
        return hostQueues.getSize() == 0;
    }

    /** Schedules a URI via the calling thread's batch queue. */
    public void schedule(CandidateURI caURI) {
        batchSchedule(caURI);
    }

    /**
     * Callback from a worker when processing of a URI completes; routes the
     * URI to the appropriate disposition.
     *
     * @param curi the finished URI
     */
    public synchronized void finished(CrawlURI curi) {
        logger.fine(curi.toString() + " " +
            CrawlURI.fetchStatusCodesToString(curi.getFetchStatus()));
        curi.incrementFetchAttempts();
        logLocalizedErrors(curi);

        innerFinished(curi);
    }

    /**
     * Disposition dispatch: success, prompt retry, delayed retry, disregard
     * or failure. Wakes any threads blocked in {@code next()} afterwards.
     * Runtime exceptions are converted into a failure disposition.
     */
    protected synchronized void innerFinished(CrawlURI curi) {
        try {
            innerBatchFlush();

            if (curi.isSuccess()) {
                successDisposition(curi);
            } else if (needsPromptRetry(curi)) {
                // Put near top of queue, no extra politeness wait.
                reschedule(curi, false);
            } else if (needsRetrying(curi)) {
                // Requeue with the configured retry delay.
                reschedule(curi, true);
                controller.fireCrawledURINeedRetryEvent(curi);
            } else if (isDisregarded(curi)) {
                disregardDisposition(curi);
            } else {
                failureDisposition(curi);
            }

            // Queue state changed; let waiting worker threads re-evaluate.
            notifyAll();
        } catch (RuntimeException e) {
            curi.setFetchStatus(S_RUNTIME_EXCEPTION);
            logger.log(Level.WARNING,
                "RTE in innerFinished() " + e.getMessage(), e);
            curi.putObject(A_RUNTIME_EXCEPTION, e);
            failureDisposition(curi);
        } catch (AttributeNotFoundException e) {
            logger.severe(e.getMessage());
        }
    }

    /**
     * Writes any localized errors accumulated on the URI to the controller's
     * local-errors log, then clears them from the URI.
     */
    private void logLocalizedErrors(CrawlURI curi) {
        if (curi.containsKey(A_LOCALIZED_ERRORS)) {
            List localErrors = (List)curi.getObject(A_LOCALIZED_ERRORS);
            Iterator iter = localErrors.iterator();
            while (iter.hasNext()) {
                Object array[] = {curi, iter.next()};
                controller.localErrors.log(Level.WARNING,
                    curi.getUURI().toString(), array);
            }
            // Errors are logged; drop them so they are not persisted.
            curi.remove(A_LOCALIZED_ERRORS);
        }
    }

    /**
     * Handles a successful fetch: logs and annotates the URI, updates
     * statistics, schedules the next visit after the evaluated wait interval,
     * and snoozes the host queue for politeness.
     *
     * @param curi the successfully fetched URI
     */
    protected void successDisposition(CrawlURI curi) {
        curi.aboutToLog();

        long waitInterval = 0;

        if (curi.containsKey(A_WAIT_INTERVAL)) {
            waitInterval = curi.getLong(A_WAIT_INTERVAL);
            curi.addAnnotation("wt:" +
                ArchiveUtils.formatMillisecondsToConventional(
                    waitInterval));
        } else {
            logger.severe("Missing wait interval for " + curi.toString() +
                " WaitEvaluator may be missing.");
        }
        if (curi.containsKey(A_NUMBER_OF_VISITS)) {
            curi.addAnnotation(curi.getInt(A_NUMBER_OF_VISITS) + "vis");
        }
        if (curi.containsKey(A_NUMBER_OF_VERSIONS)) {
            curi.addAnnotation(curi.getInt(A_NUMBER_OF_VERSIONS) + "ver");
        }
        if (curi.containsKey(A_FETCH_OVERDUE)) {
            curi.addAnnotation("ov:" +
                ArchiveUtils.formatMillisecondsToConventional(
                    (curi.getLong(A_FETCH_OVERDUE))));
        }

        Object array[] = { curi };
        controller.uriProcessing.log(
            Level.INFO,
            curi.getUURI().toString(),
            array);

        succeededFetchCount++;
        totalProcessedBytes += curi.getContentSize();

        controller.fireCrawledURISuccessfulEvent(curi);

        // Reset any elevated priority; the next visit is a regular one.
        curi.setSchedulingDirective(CandidateURI.NORMAL);

        curi.putLong(A_TIME_OF_NEXT_PROCESSING,
            System.currentTimeMillis() + waitInterval);

        AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());

        // Politeness: snooze the host from the fetch-completed time (or now
        // if that is unavailable).
        long wakeupTime = (curi.containsKey(A_FETCH_COMPLETED_TIME) ?
                curi.getLong(A_FETCH_COMPLETED_TIME) :
                (new Date()).getTime()) + calculateSnoozeTime(curi);

        curi.processingCleanup();
        curi.resetDeferrals();
        curi.resetFetchAttempts();
        try {
            hq.update(curi, true, wakeupTime);
        } catch (IOException e) {
            logger.log(Level.SEVERE, "An IOException occured when updating " +
                curi.toString() + "\n" + e.getMessage(), e);
        }
    }

    /**
     * Requeues a URI for another attempt.
     *
     * @param curi the URI to requeue
     * @param errorWait true to apply the retry delay (and snooze the host);
     *   false for a prompt retry at the head of the queue
     * @throws AttributeNotFoundException if the retry-delay setting is absent
     */
    protected void reschedule(CrawlURI curi, boolean errorWait)
            throws AttributeNotFoundException {
        long delay = 0;
        if (errorWait) {
            if (curi.containsKey(A_RETRY_DELAY)) {
                delay = curi.getLong(A_RETRY_DELAY);
            } else {
                delay = ((Long)getAttribute(
                    ATTR_RETRY_DELAY, curi)).longValue();
            }
        }

        long retryTime = (curi.containsKey(A_FETCH_COMPLETED_TIME) ?
                curi.getLong(A_FETCH_COMPLETED_TIME) :
                (new Date()).getTime()) + delay;

        AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());
        curi.processingCleanup();
        if (errorWait) {
            curi.resetDeferrals();
        }
        try {
            hq.update(curi, errorWait, retryTime);
        } catch (IOException e) {
            logger.log(Level.SEVERE,
                "IOException rescheduling " + curi.toString(), e);
        }
    }

    /**
     * Handles a URI that failed and will not be retried: logs it, updates
     * statistics, pushes its next-processing time to the far future and
     * possibly forgets it entirely.
     *
     * @param curi the failed URI
     */
    protected void failureDisposition(CrawlURI curi) {
        this.controller.fireCrawledURIFailureEvent(curi);

        curi.aboutToLog();
        Object array[] = { curi };
        this.controller.uriProcessing.log(
            Level.INFO,
            curi.getUURI().toString(),
            array);

        if (curi.getFetchStatus() == S_RUNTIME_EXCEPTION) {
            this.controller.runtimeErrors.log(
                Level.WARNING,
                curi.getUURI().toString(),
                array);
        }
        failedFetchCount++;

        curi.setSchedulingDirective(CandidateURI.NORMAL);
        // Effectively never revisit.
        curi.putLong(A_TIME_OF_NEXT_PROCESSING, Long.MAX_VALUE);

        AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());
        curi.processingCleanup();
        curi.resetDeferrals();
        curi.resetFetchAttempts();
        try {
            boolean shouldForget = shouldBeForgotten(curi);
            if (shouldForget && alreadyIncluded != null) {
                // Allow the URI to be rediscovered later.
                alreadyIncluded.forget(canonicalize(curi.getUURI()), curi);
            }
            hq.update(curi, false, 0, shouldForget);
        } catch (IOException e) {
            logger.log(Level.SEVERE,
                "IOException on failure disposition of " + curi.toString(),
                e);
        }
    }

    /**
     * Handles a disregarded URI (out of scope, precluded by robots, etc.):
     * logs it, updates statistics and parks or forgets it.
     */
    protected void disregardDisposition(CrawlURI curi) {
        controller.fireCrawledURIDisregardEvent(curi);

        curi.aboutToLog();
        Object array[] = { curi };
        controller.uriProcessing.log(
            Level.INFO,
            curi.getUURI().toString(),
            array);

        disregardedUriCount++;

        // Effectively never revisit.
        curi.putLong(A_TIME_OF_NEXT_PROCESSING, Long.MAX_VALUE);
        curi.setSchedulingDirective(CandidateURI.NORMAL);

        AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());
        curi.processingCleanup();
        curi.resetDeferrals();
        curi.resetFetchAttempts();
        try {
            hq.update(curi, false, 0, shouldBeForgotten(curi));
        } catch (IOException e) {
            logger.log(Level.SEVERE,
                "IOException on disregard disposition of " + curi.toString(),
                e);
        }
    }

    /**
     * @param curi the URI to check
     * @return true if the URI should be dropped entirely (so it may be
     *   rediscovered and re-evaluated later), based on its fetch status
     */
    protected boolean shouldBeForgotten(CrawlURI curi) {
        switch (curi.getFetchStatus()) {
            case S_OUT_OF_SCOPE:
            case S_TOO_MANY_EMBED_HOPS:
            case S_TOO_MANY_LINK_HOPS:
                return true;
            default:
                return false;
        }
    }

    /**
     * @param curi the URI to check
     * @return true if the URI should be retried immediately (deferred
     *   prerequisites, or a 401 with credentials now loaded), provided the
     *   retry limit is not exhausted
     * @throws AttributeNotFoundException if max-retries is unresolvable
     */
    protected boolean needsPromptRetry(CrawlURI curi)
            throws AttributeNotFoundException {
        if (curi.getFetchAttempts() >=
                ((Integer)getAttribute(ATTR_MAX_RETRIES, curi)).intValue()) {
            return false;
        }

        switch (curi.getFetchStatus()) {
            case S_DEFERRED:
                return true;

            case HttpStatus.SC_UNAUTHORIZED:
                // Only retry a 401 if credentials have since been loaded.
                boolean loaded = curi.hasRfc2617CredentialAvatar();
                if (!loaded) {
                    logger.severe("Have 401 but no creds loaded " + curi);
                }
                return loaded;

            default:
                return false;
        }
    }

    /**
     * @param curi the URI to check
     * @return true if the URI should be retried after the retry delay
     *   (transient connect/DNS failures), provided the retry limit is not
     *   exhausted
     * @throws AttributeNotFoundException if max-retries is unresolvable
     */
    protected boolean needsRetrying(CrawlURI curi)
            throws AttributeNotFoundException {
        if (curi.getFetchAttempts() >=
                ((Integer)getAttribute(ATTR_MAX_RETRIES, curi)).intValue()) {
            return false;
        } else {
            switch (curi.getFetchStatus()) {
                case S_CONNECT_FAILED:
                case S_CONNECT_LOST:
                case S_DOMAIN_UNRESOLVABLE:
                    return true;
                default:
                    return false;
            }
        }
    }

    /**
     * @param curi the URI to check
     * @return true if the URI's fetch status marks it as disregarded rather
     *   than failed
     */
    protected boolean isDisregarded(CrawlURI curi) {
        switch (curi.getFetchStatus()) {
            case S_ROBOTS_PRECLUDED:
            case S_OUT_OF_SCOPE:
            case S_BLOCKED_BY_CUSTOM_PROCESSOR:
            case S_BLOCKED_BY_USER:
            case S_TOO_MANY_EMBED_HOPS:
            case S_TOO_MANY_LINK_HOPS:
            case S_DELETED_BY_USER:
                return true;
            default:
                return false;
        }
    }

    /**
     * Computes the politeness snooze for a host after a fetch: the fetch
     * duration times the delay factor, clamped to [min-delay, max-delay],
     * and never less than the built-in default minimum.
     *
     * @param curi the just-fetched URI
     * @return snooze time in milliseconds
     */
    protected long calculateSnoozeTime(CrawlURI curi) {
        long durationToWait = 0;
        if (curi.containsKey(A_FETCH_BEGAN_TIME)
                && curi.containsKey(A_FETCH_COMPLETED_TIME)) {

            try {
                long completeTime = curi.getLong(A_FETCH_COMPLETED_TIME);
                long durationTaken =
                    (completeTime - curi.getLong(A_FETCH_BEGAN_TIME));

                durationToWait = (long)(
                    ((Float)getAttribute(ATTR_DELAY_FACTOR, curi))
                        .floatValue() * durationTaken);

                long minDelay =
                    ((Integer)getAttribute(ATTR_MIN_DELAY, curi)).longValue();
                if (minDelay > durationToWait) {
                    durationToWait = minDelay;
                }

                long maxDelay =
                    ((Integer)getAttribute(ATTR_MAX_DELAY, curi)).longValue();
                if (durationToWait > maxDelay) {
                    durationToWait = maxDelay;
                }
            } catch (AttributeNotFoundException e) {
                logger.severe("Unable to find attribute. " +
                    curi.toString());
                // Fall back to the most conservative wait.
                durationToWait = DEFAULT_MAX_DELAY.longValue();
            }
        }
        long ret = durationToWait > DEFAULT_MIN_DELAY.longValue() ?
            durationToWait : DEFAULT_MIN_DELAY.longValue();
        logger.finest("Snooze time for " + curi.toString() + " = " + ret);
        return ret;
    }

    /**
     * @return number of distinct URIs discovered (the dup filter's count if
     *   one is in use, otherwise the queued count)
     */
    public synchronized long discoveredUriCount() {
        return (this.alreadyIncluded != null) ?
            this.alreadyIncluded.count() : hostQueues.getSize();
    }

    /** @return number of URIs currently queued */
    public synchronized long queuedUriCount() {
        return hostQueues.getSize();
    }

    /** @return total URIs whose processing has finished at least once */
    public long finishedUriCount() {
        return succeededFetchCount + failedFetchCount + disregardedUriCount;
    }

    /** @return number of successful fetches */
    public long succeededFetchCount() {
        return succeededFetchCount;
    }

    /** @return number of failed fetches */
    public long failedFetchCount() {
        return failedFetchCount;
    }

    /** @return number of disregarded URIs */
    public long disregardedUriCount() {
        // NOTE: previously returned disregardedUriCount++, silently
        // incrementing the statistic on every read. Getters must not mutate.
        return disregardedUriCount;
    }

    /** @return total content bytes processed by successful fetches */
    public long totalBytesWritten() {
        return totalProcessedBytes;
    }

    /**
     * Recovery logs are not supported by this frontier.
     *
     * @throws IOException always
     */
    public void importRecoverLog(String pathToLog) throws IOException {
        throw new IOException("Unsupported by this frontier.");
    }

    /** Frontier markers are not supported; always returns null. */
    public synchronized FrontierMarker getInitialMarker(String regexpr,
            boolean inCacheOnly) {
        return null;
    }

    /** URI listing is not supported; always returns null. */
    public synchronized ArrayList getURIsList(FrontierMarker marker,
            int numberOfMatches, boolean verbose)
            throws InvalidFrontierMarkerException {
        return null;
    }

    /** URI deletion is not supported; always returns 0. */
    public synchronized long deleteURIs(String match) {
        return 0;
    }

    /** No-op: external deletion notifications are ignored. */
    public synchronized void deleted(CrawlURI curi) {
    }

    /** Schedules a URI that is considered already included. */
    public void considerIncluded(UURI u) {
        CrawlURI curi = new CrawlURI(u);
        innerSchedule(curi);
    }

    /** Re-reads the seeds (e.g. after the operator edits the seed list). */
    public void kickUpdate() {
        loadSeeds();
    }

    /** Starts (unpauses) the frontier. */
    public void start() {
        unpause();
    }

    /** Requests a pause; threads in next() will park on their next pass. */
    synchronized public void pause() {
        shouldPause = true;
        notifyAll();
    }

    /** Clears the pause flag and wakes any parked worker threads. */
    synchronized public void unpause() {
        shouldPause = false;
        notifyAll();
    }

    /** Marks the frontier terminated; next() will throw EndedException. */
    synchronized public void terminate() {
        shouldTerminate = true;
    }

    /** @return null; this frontier keeps no journal */
    public FrontierJournal getFrontierJournal() {
        return null;
    }

    /**
     * Per-thread in-memory queue used to stage scheduled URIs until
     * batchFlush() moves them into the frontier proper.
     */
    private static class ThreadLocalQueue
    extends ThreadLocal<Queue<CandidateURI>> implements Serializable {

        private static final long serialVersionUID = 8268977225156462059L;

        protected Queue<CandidateURI> initialValue() {
            return new MemQueue<CandidateURI>();
        }

        /** @return the calling thread's staging queue */
        public Queue<CandidateURI> getQueue() {
            return get();
        }
    }

    /**
     * Recovery logs are not supported by this frontier.
     *
     * @throws IOException always
     */
    public void importRecoverLog(String pathToLog, boolean retainFailures)
            throws IOException {
        throw new IOException("Unsupported");
    }

    /** @return no extra named reports beyond the default */
    public String[] getReports() {
        return new String[] {};
    }

    /** @return a one-line summary of this frontier */
    public String singleLineReport() {
        return ArchiveUtils.singleLineReport(this);
    }

    /** Writes the default (unnamed) report. */
    public void reportTo(PrintWriter writer) throws IOException {
        reportTo(null, writer);
    }

    /** Writes a one-line report of the host-queue state. */
    public synchronized void singleLineReportTo(PrintWriter w)
            throws IOException {
        hostQueues.singleLineReportTo(w);
    }

    /** @return legend for the single-line report */
    public String singleLineLegend() {
        return hostQueues.singleLineLegend();
    }

    /** Writes the named report of the host-queue state. */
    public synchronized void reportTo(String name, PrintWriter writer) {
        hostQueues.reportTo(name, writer);
    }

    /** No-op CrawlStatusListener callback. */
    public void crawlStarted(String message) {
    }

    /** No-op CrawlStatusListener callback. */
    public void crawlEnding(String sExitMessage) {
    }

    /** Releases BDB-backed resources when the crawl ends. */
    public void crawlEnded(String sExitMessage) {
        if (this.alreadyIncluded != null) {
            this.alreadyIncluded.close();
            this.alreadyIncluded = null;
        }
        hostQueues.close();
    }

    /** No-op CrawlStatusListener callback. */
    public void crawlPausing(String statusMessage) {
    }

    /** No-op CrawlStatusListener callback. */
    public void crawlPaused(String statusMessage) {
    }

    /** No-op CrawlStatusListener callback. */
    public void crawlResuming(String statusMessage) {
    }

    /** No-op: this frontier does not participate in checkpointing. */
    public void crawlCheckpoint(File checkpointDir) throws Exception {
    }

    /**
     * Destination callback from the UriUniqFilter: a URI passed the
     * duplicate check and is scheduled onto its host queue.
     */
    public void receive(CandidateURI item) {
        logger.finest("Received " + item);
        innerSchedule(item);
    }

    /** @return the URI's host queue, as its frontier group */
    public FrontierGroup getGroup(CrawlURI curi) {
        try {
            return getHQ(curi);
        } catch (IOException ioe) {
            throw new RuntimeException(ioe);
        }
    }

    /** @return average depth across all host queues */
    public long averageDepth() {
        return hostQueues.getAverageDepth();
    }

    /** @return congestion ratio reported by the host-queue list */
    public float congestionRatio() {
        return hostQueues.getCongestionRatio();
    }

    /** @return size of the deepest host queue */
    public long deepestUri() {
        return hostQueues.getDeepestQueueSize();
    }
}