KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > extractor > ExtractorImpliedURI


1 /*
2  * ExtractorImpliedURI
3  *
4  * $Id: ExtractorImpliedURI.java,v 1.1.2.1 2007/01/13 01:31:16 stack-sf Exp $
5  *
6  * Created on July 20, 2006
7  *
8  * Copyright (C) 2006 Internet Archive.
9  *
10  * This file is part of the Heritrix web crawler (crawler.archive.org).
11  *
12  * Heritrix is free software; you can redistribute it and/or modify
13  * it under the terms of the GNU Lesser Public License as published by
14  * the Free Software Foundation; either version 2.1 of the License, or
15  * any later version.
16  *
17  * Heritrix is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20  * GNU Lesser Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser Public License
23  * along with Heritrix; if not, write to the Free Software
24  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25  */

26
27 package org.archive.crawler.extractor;
28
29 import java.util.Collection JavaDoc;
30 import java.util.logging.Level JavaDoc;
31 import java.util.logging.Logger JavaDoc;
32 import java.util.regex.Matcher JavaDoc;
33
34 import org.apache.commons.httpclient.URIException;
35 import org.archive.crawler.datamodel.CoreAttributeConstants;
36 import org.archive.crawler.datamodel.CrawlURI;
37 import org.archive.crawler.settings.SimpleType;
38 import org.archive.util.TextUtils;
39
40 /**
41  * An extractor for finding 'implied' URIs inside other URIs. If the
42  * 'trigger' regex is matched, a new URI will be constructed from the
43  * 'build' replacement pattern.
44  *
45  * Unlike most other extractors, this works on URIs discovered by
46  * previous extractors. Thus it should appear near the end of any
47  * set of extractors.
48  *
49  * Initially, only finds absolute HTTP(S) URIs in query-string or its
50  * parameters.
51  *
52  * TODO: extend to find URIs in path-info
53  *
54  * @author Gordon Mohr
55  *
56  **/

57
58 public class ExtractorImpliedURI extends Extractor implements CoreAttributeConstants {
59
60     private static final long serialVersionUID = 8579045413127769497L;
61
62     private static Logger JavaDoc LOGGER =
63         Logger.getLogger(ExtractorImpliedURI.class.getName());
64    
65     /** regex which when matched triggers addition of 'implied' URI */
66     public static final String JavaDoc ATTR_TRIGGER_REGEXP = "trigger-regexp";
67     /** replacement pattern used to build 'implied' URI */
68     public static final String JavaDoc ATTR_BUILD_PATTERN = "build-pattern";
69
70     // FIXME: these counters are not incremented atomically; totals may not
71
// be correct
72
private long numberOfCURIsHandled = 0;
73     private long numberOfLinksExtracted = 0;
74
75     /**
76      * Constructor
77      *
78      * @param name
79      */

80     public ExtractorImpliedURI(String JavaDoc name) {
81         super(name, "Implied URI Extractor. Finds URIs implied by other " +
82                 "URIs according to regex/replacement patterns. Should " +
83                 "appear after most other extractors.");
84
85         addElementToDefinition(
86             new SimpleType(ATTR_TRIGGER_REGEXP,
87                     "Triggering regular expression. When a discovered URI " +
88                     "matches this pattern, the 'implied' URI will be " +
89                     "built. The capturing groups of this expression are " +
90                     "available for the build replacement pattern.", ""));
91         addElementToDefinition(
92                 new SimpleType(ATTR_BUILD_PATTERN,
93                     "Replacement pattern to build 'implied' URI, using " +
94                     "captured groups of trigger expression.", ""));
95     }
96
97     /**
98      * Perform usual extraction on a CrawlURI
99      *
100      * @param curi Crawl URI to process.
101      */

102     public void extract(CrawlURI curi) {
103
104         this.numberOfCURIsHandled++;
105         // use array copy because discoveriess will add to outlinks
106
Collection JavaDoc<Link> links = curi.getOutLinks();
107         Link[] sourceLinks = links.toArray(new Link[links.size()]);
108         for (Link wref: sourceLinks) {
109             String JavaDoc implied = extractImplied(
110                     wref.getDestination(),
111                     (String JavaDoc)getUncheckedAttribute(curi,ATTR_TRIGGER_REGEXP),
112                     (String JavaDoc)getUncheckedAttribute(curi,ATTR_BUILD_PATTERN));
113             if (implied!=null) {
114                 try {
115                     curi.createAndAddLink(
116                             implied,
117                             Link.SPECULATIVE_MISC,
118                             Link.SPECULATIVE_HOP);
119                     numberOfLinksExtracted++;
120                 } catch (URIException e) {
121                     LOGGER.log(Level.FINE, "bad URI", e);
122                 }
123             }
124         }
125     }
126     
127     /**
128      * Utility method for extracting 'implied' URI given a source uri,
129      * trigger pattern, and build pattern.
130      *
131      * @param uri source to check for implied URI
132      * @param trigger regex pattern which if matched implies another URI
133      * @param build replacement pattern to build the implied URI
134      * @return implied URI, or null if none
135      */

136     protected static String JavaDoc extractImplied(CharSequence JavaDoc uri, String JavaDoc trigger, String JavaDoc build) {
137         if(trigger.length()==0) {
138             // short-circuit empty-string trigger
139
return null;
140         }
141         Matcher JavaDoc m = TextUtils.getMatcher(trigger, uri);
142         if(m.matches()) {
143             String JavaDoc result = m.replaceFirst(build);
144             TextUtils.recycleMatcher(m);
145             return result;
146         }
147         return null;
148     }
149
150     public String JavaDoc report() {
151         StringBuffer JavaDoc ret = new StringBuffer JavaDoc();
152         ret.append("Processor: "+ExtractorImpliedURI.class.getName()+"\n");
153         ret.append(" Function: Extracts links inside other URIs\n");
154         ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
155         ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");
156
157         return ret.toString();
158     }
159 }
160
Popular Tags