KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > extractor > ExtractorURI


/*
 * ExtractorURI
 *
 * $Id: ExtractorURI.java,v 1.1.2.1 2007/01/13 01:31:16 stack-sf Exp $
 *
 * Created on July 20, 2006
 *
 * Copyright (C) 2006 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

26
27 package org.archive.crawler.extractor;
28
29 import java.util.ArrayList JavaDoc;
30 import java.util.Collection JavaDoc;
31 import java.util.List JavaDoc;
32 import java.util.logging.Level JavaDoc;
33 import java.util.logging.Logger JavaDoc;
34 import java.util.regex.Matcher JavaDoc;
35
36 import org.apache.commons.codec.DecoderException;
37 import org.apache.commons.httpclient.URIException;
38 import org.archive.crawler.datamodel.CoreAttributeConstants;
39 import org.archive.crawler.datamodel.CrawlURI;
40 import org.archive.net.LaxURLCodec;
41 import org.archive.net.UURI;
42 import org.archive.util.TextUtils;
43
44 /**
45  * An extractor for finding URIs inside other URIs. Unlike most other
46  * extractors, this works on URIs discovered by previous extractors. Thus
47  * it should appear near the end of any set of extractors.
48  *
49  * Initially, only finds absolute HTTP(S) URIs in query-string or its
50  * parameters.
51  *
52  * TODO: extend to find URIs in path-info
53  *
54  * @author Gordon Mohr
55  *
56  **/

57
58 public class ExtractorURI extends Extractor implements CoreAttributeConstants {
59
60     private static final long serialVersionUID = -6273897743240970822L;
61
62     private static Logger JavaDoc LOGGER =
63         Logger.getLogger(ExtractorURI.class.getName());
64
65     static final String JavaDoc ABS_HTTP_URI_PATTERN = "^https?://[^\\s<>]*$";
66     
67     // FIXME: these counters are not incremented atomically; totals may not
68
// be correct
69
private long numberOfCURIsHandled = 0;
70     private long numberOfLinksExtracted = 0;
71
72     /**
73      * Constructor
74      *
75      * @param name
76      */

77     public ExtractorURI(String JavaDoc name) {
78         super(name, "URI Extractor. Extracts links inside other " +
79                 "discovered URIs. Should appear last among extractors.");
80     }
81
82     /**
83      * Perform usual extraction on a CrawlURI
84      *
85      * @param curi Crawl URI to process.
86      */

87     public void extract(CrawlURI curi) {
88
89         this.numberOfCURIsHandled++;
90         // use array copy because discoveriess will add to outlinks
91
Collection JavaDoc<Link> links = curi.getOutLinks();
92         Link[] sourceLinks = links.toArray(new Link[links.size()]);
93         for (Link wref: sourceLinks) {
94             extractLink(curi,wref);
95         }
96     }
97
98     /**
99      * Consider a single Link for internal URIs
100      *
101      * @param curi CrawlURI to add discoveries to
102      * @param wref Link to examine for internal URIs
103      */

104     protected void extractLink(CrawlURI curi, Link wref) {
105         UURI source = UURI.from(wref.getDestination());
106         if(source == null) {
107             // shouldn't happen
108
return;
109         }
110         List JavaDoc<String JavaDoc> found = extractQueryStringLinks(source);
111         for (String JavaDoc uri : found) {
112             try {
113                 curi.createAndAddLink(
114                         uri,
115                         Link.SPECULATIVE_MISC,
116                         Link.SPECULATIVE_HOP);
117                 numberOfLinksExtracted++;
118             } catch (URIException e) {
119                 LOGGER.log(Level.FINE, "bad URI", e);
120             }
121         }
122         // TODO: consider path URIs too
123

124     }
125
126     /**
127      * Look for URIs inside the supplied UURI.
128      *
129      * Static for ease of testing or outside use.
130      *
131      * @param source UURI to example
132      * @return List of discovered String URIs.
133      */

134     protected static List JavaDoc<String JavaDoc> extractQueryStringLinks(UURI source) {
135         List JavaDoc<String JavaDoc> results = new ArrayList JavaDoc<String JavaDoc>();
136         String JavaDoc decodedQuery;
137         try {
138             decodedQuery = source.getQuery();
139         } catch (URIException e1) {
140             // shouldn't happen
141
return results;
142         }
143         if(decodedQuery==null) {
144             return results;
145         }
146         // check if full query-string appears to be http(s) URI
147
Matcher JavaDoc m = TextUtils.getMatcher(ABS_HTTP_URI_PATTERN,decodedQuery);
148         if(m.matches()) {
149             TextUtils.recycleMatcher(m);
150             results.add(decodedQuery);
151         }
152         // split into params, see if any param value is http(s) URI
153
String JavaDoc rawQuery = new String JavaDoc(source.getRawQuery());
154         String JavaDoc[] params = rawQuery.split("&");
155         for (String JavaDoc param : params) {
156             String JavaDoc[] keyVal = param.split("=");
157             if(keyVal.length==2) {
158                 String JavaDoc candidate;
159                 try {
160                     candidate = LaxURLCodec.DEFAULT.decode(keyVal[1]);
161                 } catch (DecoderException e) {
162                     continue;
163                 }
164                 // TODO: use other non-UTF8 codecs when appropriate
165
m.reset(candidate);
166                 if(m.matches()) {
167                     results.add(candidate);
168                 }
169             }
170         }
171         return results;
172     }
173
174     public String JavaDoc report() {
175         StringBuffer JavaDoc ret = new StringBuffer JavaDoc();
176         ret.append("Processor: "+ExtractorURI.class.getName()+"\n");
177         ret.append(" Function: Extracts links inside other URIs\n");
178         ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
179         ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");
180
181         return ret.toString();
182     }
183 }
184
Popular Tags