SimpleCocoonCrawlerImpl


1   /*
2    * Copyright 1999-2004 The Apache Software Foundation.
3    * 
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * 
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    * 
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  package org.apache.cocoon.components.crawler;
17  
18  import org.apache.avalon.excalibur.pool.Recyclable;
19  import org.apache.avalon.framework.activity.Disposable;
20  import org.apache.avalon.framework.configuration.Configurable;
21  import org.apache.avalon.framework.configuration.Configuration;
22  import org.apache.avalon.framework.configuration.ConfigurationException;
23  import org.apache.avalon.framework.logger.AbstractLogEnabled;
24  import org.apache.cocoon.Constants;
25  import org.apache.commons.lang.StringUtils;
26  import org.apache.regexp.RE;
27  import org.apache.regexp.RESyntaxException;
28  
29  import java.io.BufferedReader  ;
30  import java.io.IOException  ;
31  import java.io.InputStream  ;
32  import java.io.InputStreamReader  ;
33  import java.net.URL  ;
34  import java.net.URLConnection  ;
35  import java.util.ArrayList  ;
36  import java.util.HashSet  ;
37  import java.util.Iterator  ;
38  import java.util.List  ;
39  
40  /**
41   * A simple cocoon crawler.
42   *
43   * @author     <a HREF="mailto:berni_huber@a1.net">Bernhard Huber</a>
44   * @version CVS $Id: SimpleCocoonCrawlerImpl.java 123788 2004-12-31 12:43:00Z antonio $
45   */
46  public class SimpleCocoonCrawlerImpl extends AbstractLogEnabled
47  implements CocoonCrawler, Configurable, Disposable, Recyclable {
48      
49      /**
50       * Config element name specifying expected link content-typ.
51       * <p>
52       *   Its value is <code>link-content-type</code>.
53       * </p>
54       */
55      public final static String   LINK_CONTENT_TYPE_CONFIG = "link-content-type";
56      
57      /**
58       * Default value of <code>link-content-type</code> configuration value.
59       * <p>
60       *   Its value is <code>application/x-cocoon-links</code>.
61       * </p>
62       */
63      public final String   LINK_CONTENT_TYPE_DEFAULT = Constants.LINK_CONTENT_TYPE;
64      
65      /**
66       * Config element name specifying query-string appendend for requesting links
67       * of an URL.
68       * <p>
69       *  Its value is <code>link-view-query</code>.
70       * </p>
71       */
72      public final static String   LINK_VIEW_QUERY_CONFIG = "link-view-query";
73      
74      /**
75       * Default value of <code>link-view-query</code> configuration option.
76       * <p>
77       *   Its value is <code>?cocoon-view=links</code>.
78       * </p>
79       */
80      public final static String   LINK_VIEW_QUERY_DEFAULT = "cocoon-view=links";
81      
82      /**
83       * Config element name specifying excluding regular expression pattern.
84       * <p>
85       *  Its value is <code>exclude</code>.
86       * </p>
87       */
88      public final static String   EXCLUDE_CONFIG = "exclude";
89      
90      /**
91       * Config element name specifying including regular expression pattern.
92       * <p>
93       *  Its value is <code>include</code>.
94       * </p>
95       */
96      public final static String   INCLUDE_CONFIG = "include";
97      
98      /**
99       * Config element name specifying http header value for user-Agent.
100      * <p>
101      *  Its value is <code>user-agent</code>.
102      * </p>
103      */
104     public final static String   USER_AGENT_CONFIG = "user-agent";
105     
106     /**
107      * Default value of <code>user-agent</code> configuration option.
108      * @see Constants#COMPLETE_NAME
109      */
110     public final static String   USER_AGENT_DEFAULT = Constants.COMPLETE_NAME;
111     
112     /**
113      * Config element name specifying http header value for accept.
114      * <p>
115      *  Its value is <code>accept</code>.
116      * </p>
117      */
118     public final static String   ACCEPT_CONFIG = "accept";
119     
120     /**
121      * Default value of <code>accept</code> configuration option.
122      * <p>
123      *   Its value is <code>* / *</code>
124      * </p>
125      */
126     public final static String   ACCEPT_DEFAULT = "*/*";
127     
128     
129     private String   linkViewQuery = LINK_VIEW_QUERY_DEFAULT;
130     private String   linkContentType = LINK_CONTENT_TYPE_DEFAULT;
131     private HashSet   excludeCrawlingURL;
132     private HashSet   includeCrawlingURL;
133     private String   userAgent = USER_AGENT_DEFAULT;
134     private String   accept = ACCEPT_DEFAULT;
135     
136     private int depth;
137     
138     private HashSet   crawled;
139     private HashSet   urlsToProcess;
140     private HashSet   urlsNextDepth;
141     
142     
143     
144     /**
145      * Constructor for the SimpleCocoonCrawlerImpl object
146      */
147     public SimpleCocoonCrawlerImpl() {
148         // by default include everything
149         includeCrawlingURL = null;
150         // by default exclude common image patterns
151         excludeCrawlingURL = null;
152     }
153     
154     
155     /**
156      * Configure the crawler component.
157      * <p>
158      *  Configure can specify which URI to include, and which URI to exclude
159      *  from crawling. You specify the patterns as regular expressions.
160      * </p>
161      * <p>
162      *  Morover you can configure
163      *  the required content-type of crawling request, and the
164      *  query-string appended to each crawling request.
165      * </p>
166      * <pre><tt>
167      * &lt;include&gt;.*\.html?&lt;/exclude&gt; or &lt;exclude&gt;.*\.html?, .*\.xsp&lt;/exclude&gt;
168      * &lt;exclude&gt;.*\.gif&lt;/exclude&gt; or &lt;exclude&gt;.*\.gif, .*\.jpe?g&lt;/exclude&gt;
169      * &lt;link-content-type&gt; application/x-cocoon-links &lt;/link-content-type&gt;
170      * &lt;link-view-query&gt; ?cocoon-view=links &lt;/link-view-query&gt;
171      * </tt></pre>
172      *
173      * @param  configuration               XML configuration of this avalon component.
174      * @exception  ConfigurationException  is throwing if configuration is invalid.
175      */
176     public void configure(Configuration configuration)
177     throws ConfigurationException {
178         
179         Configuration[] children;
180         children = configuration.getChildren(INCLUDE_CONFIG);
181         if (children.length > 0) {
182             includeCrawlingURL = new HashSet  ();
183             for (int i = 0; i < children.length; i++) {
184                 String   pattern = children[i].getValue();
185                 try {
186                     String   params[] = StringUtils.split(pattern, ", ");
187                     for (int index = 0; index < params.length; index++) {
188                         String   tokenized_pattern = params[index];
189                         this.includeCrawlingURL.add(new RE(tokenized_pattern));
190                     }
191                 } catch (RESyntaxException rese) {
192                     getLogger().error("Cannot create including regular-expression for " +
193                     pattern, rese);
194                 }
195             }
196         } else {
197             if (getLogger().isDebugEnabled()) {
198                 getLogger().debug("Include all URLs");
199             }
200         }
201         
202         children = configuration.getChildren(EXCLUDE_CONFIG);
203         if (children.length > 0) {
204             excludeCrawlingURL = new HashSet  ();
205             for (int i = 0; i < children.length; i++) {
206                 String   pattern = children[i].getValue();
207                 try {
208                     String   params[] = StringUtils.split(pattern, ", ");
209                     for (int index = 0; index < params.length; index++) {
210                         String   tokenized_pattern = params[index];
211                         this.excludeCrawlingURL.add(new RE(tokenized_pattern));
212                     }
213                 } catch (RESyntaxException rese) {
214                     getLogger().error("Cannot create excluding regular-expression for " +
215                     pattern, rese);
216                 }
217             }
218         } else {
219             excludeCrawlingURL = new HashSet  ();
220             setDefaultExcludeFromCrawling();
221             if (getLogger().isDebugEnabled()) {
222                 getLogger().debug("Exclude default URLs only");
223             }
224         }
225         
226         Configuration child;
227         String   value;
228         child = configuration.getChild(LINK_CONTENT_TYPE_CONFIG, false);
229         if (child != null) {
230             value = child.getValue();
231             if (value != null && value.length() > 0) {
232                 this.linkContentType = value.trim();
233             }
234         }
235         child = configuration.getChild(LINK_VIEW_QUERY_CONFIG, false);
236         if (child != null) {
237             value = child.getValue();
238             if (value != null && value.length() > 0) {
239                 this.linkViewQuery = value.trim();
240             }
241         }
242         
243         child = configuration.getChild(USER_AGENT_CONFIG, false);
244         if (child != null) {
245             value = child.getValue();
246             if (value != null && value.length() > 0) {
247                 this.userAgent = value;
248             }
249         }
250         
251         child = configuration.getChild(ACCEPT_CONFIG, false);
252         if (child != null) {
253             value = child.getValue();
254             if (value != null && value.length() > 0) {
255                 this.accept = value;
256             }
257         }
258         
259     }
260     
261     
262     /**
263      * dispose at end of life cycle, releasing all resources.
264      */
265     public void dispose() {
266         crawled = null;
267         urlsToProcess = null;
268         urlsNextDepth = null;
269         excludeCrawlingURL = null;
270         includeCrawlingURL = null;
271     }
272     
273     
274     /**
275      * recylcle this object, relasing resources
276      */
277     public void recycle() {
278         crawled = null;
279         urlsToProcess = null;
280         urlsNextDepth = null;
281         depth = -1;
282     }
283     
284     
285     /**
286      * The same as calling crawl(url,-1);
287      *
288      * @param  url  Crawl this URL, getting all links from this URL.
289      */
290     public void crawl(URL   url) {
291         crawl(url, -1);
292     }
293     
294     /**
295      * Start crawling a URL.
296      *
297      * <p>
298      *   Use this method to start crawling.
299      *   Get the this url, and all its children  by using <code>iterator()</code>.
300      *   The Iterator object will return URL objects.
301      * </p>
302      * <p>
303      *  You may use the crawl(), and iterator() methods the following way:
304      * </p>
305      * <pre><tt>
306      *   SimpleCocoonCrawlerImpl scci = ....;
307      *   scci.crawl( "http://foo/bar" );
308      *   Iterator i = scci.iterator();
309      *   while (i.hasNext()) {
310      *     URL url = (URL)i.next();
311      *     ...
312      *   }
313      * </tt></pre>
314      * <p>
315      *   The i.next() method returns a URL, and calculates the links of the
316      *   URL before return it.
317      * </p>
318      *
319      * @param  url  Crawl this URL, getting all links from this URL.
320      * @param  maxDepth  maximum depth to crawl to. -1 for no maximum.
321      */
322     public void crawl(URL   url, int maxDepth) {
323         crawled = new HashSet  ();
324         urlsToProcess = new HashSet  ();
325         urlsNextDepth = new HashSet  ();
326         depth = maxDepth;
327         
328         if (getLogger().isDebugEnabled()) {
329             getLogger().debug("crawl URL " + url + " to depth " + maxDepth);
330         }
331         
332         urlsToProcess.add(url);
333     }
334     
335     
336     /**
337      * Return iterator, iterating over all links of the currently crawled URL.
338      * <p>
339      *   The Iterator object will return URL objects at its <code>next()</code>
340      *   method.
341      * </p>
342      *
343      * @return    Iterator iterator of all links from the crawl URL.
344      * @since
345      */
346     public Iterator   iterator() {
347         return new CocoonCrawlerIterator(this);
348     }
349     
350     
351     /**
352      * Default exclude patterns.
353      * <p>
354      *   By default URLs matching following patterns are excluded:
355      * </p>
356      * <ul>
357      *   <li>.*\\.gif(\\?.*)?$ - exclude gif images</li>
358      *   <li>.*\\.png(\\?.*)?$ - exclude png images</li>
359      *   <li>.*\\.jpe?g(\\?.*)?$ - exclude jpeg images</li>
360      *   <li>.*\\.js(\\?.*)?$ - exclude javascript </li>
361      *   <li>.*\\.css(\\?.*)?$ - exclude cascaded stylesheets</li>
362      * </ul>
363      *
364      * @since
365      */
366     private void setDefaultExcludeFromCrawling() {
367         String  [] EXCLUDE_FROM_CRAWLING_DEFAULT = {
368             ".*\\.gif(\\?.*)?$",
369             ".*\\.png(\\?.*)?$",
370             ".*\\.jpe?g(\\?.*)?$",
371             ".*\\.js(\\?.*)?$",
372             ".*\\.css(\\?.*)?$"
373         };
374         
375         for (int i = 0; i < EXCLUDE_FROM_CRAWLING_DEFAULT.length; i++) {
376             String   pattern = EXCLUDE_FROM_CRAWLING_DEFAULT[i];
377             try {
378                 excludeCrawlingURL.add(new RE(pattern));
379             } catch (RESyntaxException rese) {
380                 getLogger().error("Cannot create excluding regular-expression for " +
381                 pattern, rese);
382             }
383         }
384     }
385     
386     
387     /**
388      * Compute list of links from the url.
389      * <p>
390      *   Check for include, exclude pattern, content-type, and if url
391      *   has been craweled already.
392      * </p>
393      *
394      * @param  url  Crawl this URL
395      * @return      List of URLs, which are links from url, asserting the conditions.
396      * @since
397      */
398     private List   getLinks(URL   url) {
399         ArrayList   url_links = null;
400         String   sURL = url.toString();
401         
402         if (!isIncludedURL(sURL) || isExcludedURL(sURL)) {
403             return null;
404         }
405         
406         // don't try to get links for url which has been crawled already
407         if (crawled.contains(sURL)) {
408             return null;
409         }
410         
411         // mark it as crawled
412         crawled.add(sURL);
413         
414         // get links of url
415         if (getLogger().isDebugEnabled()) {
416             getLogger().debug("Getting links of URL " + sURL);
417         }
418         BufferedReader   br = null;
419         try {
420             sURL = url.getFile();
421             URL   links = new URL  (url, sURL
422             + ((sURL.indexOf("?") == -1) ? "?" : "&")
423             + linkViewQuery);
424             URLConnection   links_url_connection = links.openConnection();
425             links_url_connection.setRequestProperty("Accept", accept);
426             links_url_connection.setRequestProperty("User-Agent", userAgent);
427             links_url_connection.connect();
428             InputStream   is = links_url_connection.getInputStream();
429             br = new BufferedReader  (new InputStreamReader  (is));
430             
431             String   contentType = links_url_connection.getContentType();
432             if (contentType == null) {
433                 if (getLogger().isDebugEnabled()) {
434                     getLogger().debug("Ignoring " + sURL + " (no content type)");
435                 }
436                 // there is a check on null in the calling method
437                 return null;
438             }
439             
440             int index = contentType.indexOf(';');
441             if (index != -1) {
442                 contentType = contentType.substring(0, index);
443             }
444             
445             if (getLogger().isDebugEnabled()) {
446                 getLogger().debug("Content-type: " + contentType);
447             }
448             
449             if (contentType.equals(linkContentType)) {
450                 url_links = new ArrayList  ();
451                 
452                 // content is supposed to be a list of links,
453                 // relative to current URL
454                 String   line;
455                 while ((line = br.readLine()) != null) {
456                     final URL   newUrl = new URL  (url, line);
457                     final String   sNewUrl = newUrl.toString();
458                     
459                     boolean add_url = true;
460                     // don't add new_url twice
461                     if (add_url) {
462                         add_url &= !url_links.contains(sNewUrl);
463                     }
464                     
465                     // don't add new_url if it has been crawled already
466                     if (add_url) {
467                         add_url &= !crawled.contains(sNewUrl);
468                     }
469                     
470                     // don't add if is not matched by existing include definition
471                     if (add_url) {
472                         add_url &= isIncludedURL(sNewUrl);
473                     }
474                     
475                     // don't add if is matched by existing exclude definition
476                     if (add_url) {
477                         add_url &= !isExcludedURL(sNewUrl);
478                     }
479                     if (add_url) {
480                         if (getLogger().isDebugEnabled()) {
481                             getLogger().debug("Add URL: " + sNewUrl);
482                         }
483                         url_links.add(newUrl);
484                     }
485                 }
486                 // now we have a list of URL which should be examined
487             }
488         } catch (IOException   ioe) {
489             getLogger().warn("Problems get links of " + url, ioe);
490         } finally {
491             if (br != null) {
492                 try {
493                     br.close();
494                     br = null;
495                 } catch (IOException   ignored) {
496                 }
497             }
498         }
499         return url_links;
500     }
501     
502     
503     /**
504      * check if URL is a candidate for indexing
505      *
506      * @param  url  the URL to check
507      * @return      The excludedURL value
508      */
509     private boolean isExcludedURL(String   url) {
510         // by default do not exclude URL for crawling
511         if (excludeCrawlingURL == null) {
512             return false;
513         }
514 
515         final String   s = url;
516         Iterator   i = excludeCrawlingURL.iterator();
517         while (i.hasNext()) {
518             RE pattern = (RE) i.next();
519             if (pattern.match(s)) {
520                 if (getLogger().isDebugEnabled()) {
521                     getLogger().debug("Excluded URL " + url);
522                 }
523                 return true;
524             }
525         }
526         if (getLogger().isDebugEnabled()) {
527             getLogger().debug("Not excluded URL " + url);
528         }
529         return false;
530     }
531     
532     
533     /**
534      * check if URL is a candidate for indexing
535      *
536      * @param  url  Description of Parameter
537      * @return      The includedURL value
538      */
539     private boolean isIncludedURL(String   url) {
540         // by default include URL for crawling
541         if (includeCrawlingURL == null) {
542             return true;
543         }
544 
545         final String   s = url;
546         Iterator   i = includeCrawlingURL.iterator();
547         while (i.hasNext()) {
548             RE pattern = (RE) i.next();
549             if (pattern.match(s)) {
550                 if (getLogger().isDebugEnabled()) {
551                     getLogger().debug("Included URL " + url);
552                 }
553                 return true;
554             }
555         }
556         if (getLogger().isDebugEnabled()) {
557             getLogger().debug("Not included URL " + url);
558         }
559         return false;
560     }
561     
562     
563     /**
564      * Helper class implementing an Iterator
565      * <p>
566      *   This Iterator implementation calculates the links of an URL
567      *   before returning in the next() method.
568      * </p>
569      *
570      * @author     <a HREF="mailto:berni_huber@a1.net">Bernhard Huber</a>
571      * @version    $Id: SimpleCocoonCrawlerImpl.java 123788 2004-12-31 12:43:00Z antonio $
572      */
573     public static class CocoonCrawlerIterator implements Iterator   {
574         private SimpleCocoonCrawlerImpl cocoonCrawler;
575         
576         
577         /**
578          * Constructor for the CocoonCrawlerIterator object
579          *
580          * @param  cocoonCrawler  the containing CocoonCrawler instance.
581          */
582         CocoonCrawlerIterator(SimpleCocoonCrawlerImpl cocoonCrawler) {
583             this.cocoonCrawler = cocoonCrawler;
584         }
585         
586         
587         /**
588          * check if crawling is finished.
589          *
590          * @return    <code>true</code> if crawling has finished,
591          * else <code>false</code>.
592          */
593         public boolean hasNext() {
594             return cocoonCrawler.urlsToProcess.size() > 0
595             || cocoonCrawler.urlsNextDepth.size() > 0;
596         }
597         
598         
599         /**
600          * @return    the next URL
601          */
602         public Object   next() {
603             if (cocoonCrawler.urlsToProcess.size() == 0
604             && cocoonCrawler.urlsNextDepth.size() > 0) {
605                 // process queued urls belonging to the next depth level
606                 cocoonCrawler.urlsToProcess = cocoonCrawler.urlsNextDepth;
607                 cocoonCrawler.urlsNextDepth = new HashSet  ();
608                 // fix Bugzilla Bug 25270
609                 // only decrease if depth > 0, excluding decreasing
610                 // if depth is already equal to -1
611                 if (cocoonCrawler.depth > 0) {
612                     cocoonCrawler.depth--;
613                 }
614             }
615             URL   theNextUrl = null;
616             // fix Bugzilla Bug 25270
617             // return NextUrl != null only if getLinks() returns non-null
618             // list
619             for (Iterator   i = cocoonCrawler.urlsToProcess.iterator(); 
620               i.hasNext() && theNextUrl == null;) {
621                 // fetch a URL
622                 URL   url = (URL  ) i.next();
623 
624                 // remove it from the to-do list
625                 i.remove();
626 
627                 if (cocoonCrawler.depth == -1 || cocoonCrawler.depth > 0) {
628                     // calc all links from this url
629                     List   url_links = cocoonCrawler.getLinks(url);
630                     if (url_links != null) {
631                         // add links of this url to the to-do list
632                         cocoonCrawler.urlsNextDepth.addAll(url_links);
633                         theNextUrl = url;
634                     }
635                 }
636             }
637             // finally return url
638             return theNextUrl;
639         }
640         
641         
642         /**
643          * remove is not implemented
644          */
645         public void remove() {
646             throw new UnsupportedOperationException  ("remove is not implemented");
647         }
648     }
649 }
650 
651
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags