// Listing retrieved from KickJava ("Java API By Example, From Geeks To Geeks"):
// Java > Open Source Codes > org > archive > crawler > extractor > ExtractorJS


/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Created on Nov 17, 2003
 *
 * To change the template for this generated file go to
 * Window>Preferences>Java>Code Generation>Code and Comments
 */

package org.archive.crawler.extractor;

import java.io.IOException;
import java.util.Locale;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.CrawlController;
import org.archive.io.ReplayCharSequence;
import org.archive.net.UURI;
import org.archive.util.DevUtils;
import org.archive.util.TextUtils;

39 /**
40  * Processes Javascript files for strings that are likely to be
41  * crawlable URIs.
42  *
43  * @author gojomo
44  *
45  */

46 public class ExtractorJS extends Extractor implements CoreAttributeConstants {
47
48     private static final long serialVersionUID = -2231962381454717720L;
49
50     private static Logger JavaDoc LOGGER =
51         Logger.getLogger("org.archive.crawler.extractor.ExtractorJS");
52
53     static final String JavaDoc AMP = "&";
54     static final String JavaDoc ESCAPED_AMP = "&";
55     static final String JavaDoc WHITESPACE = "\\s";
56
57     // finds whitespace-free strings in Javascript
58
// (areas between paired ' or " characters, possibly backslash-quoted
59
// on the ends, but not in the middle)
60
static final String JavaDoc JAVASCRIPT_STRING_EXTRACTOR =
61         "(\\\\{0,8}+(?:\"|\'))(\\S{0,"+UURI.MAX_URL_LENGTH+"}?)(?:\\1)";
62     // GROUPS:
63
// (G1) ' or " with optional leading backslashes
64
// (G2) whitespace-free string delimited on boths ends by G1
65

66     // determines whether a string is likely URI
67
// (no whitespace or '<' '>', has an internal dot or some slash,
68
// begins and ends with either '/' or a word-char)
69
static final String JavaDoc STRING_URI_DETECTOR =
70         "(?:\\w|[\\.]{0,2}/)[\\S&&[^<>]]*(?:\\.|/)[\\S&&[^<>]]*(?:\\w|/)";
71
72     protected long numberOfCURIsHandled = 0;
73     protected static long numberOfLinksExtracted = 0;
74
75     /**
76      * @param name
77      */

78     public ExtractorJS(String JavaDoc name) {
79         super(name, "JavaScript extractor. Link extraction on JavaScript" +
80                 " files (.js).");
81     }
82
83     /* (non-Javadoc)
84      * @see org.archive.crawler.framework.Processor#process(org.archive.crawler.datamodel.CrawlURI)
85      */

86     public void extract(CrawlURI curi) {
87         if (!isHttpTransactionContentToProcess(curi)) {
88             return;
89         }
90         String JavaDoc contentType = curi.getContentType();
91         if ((contentType == null)) {
92             return;
93         }
94         // If content type is not js and if the viaContext
95
// does not begin with 'script', return.
96
if((contentType.indexOf("javascript") < 0) &&
97             (contentType.indexOf("jscript") < 0) &&
98             (contentType.indexOf("ecmascript") < 0) &&
99             (!curi.toString().toLowerCase().endsWith(".js")) &&
100             (curi.getViaContext() == null || !curi.getViaContext().
101                 toString().toLowerCase().startsWith("script"))) {
102             return;
103         }
104
105         this.numberOfCURIsHandled++;
106
107         ReplayCharSequence cs = null;
108         try {
109             cs = curi.getHttpRecorder().getReplayCharSequence();
110         } catch (IOException JavaDoc e) {
111             curi.addLocalizedError(this.getName(), e,
112                 "Failed get of replay char sequence.");
113         }
114         if (cs == null) {
115             LOGGER.warning("Failed getting ReplayCharSequence: " +
116                 curi.toString());
117             return;
118         }
119
120         try {
121             try {
122                 numberOfLinksExtracted += considerStrings(curi, cs,
123                         getController(), true);
124             } catch (StackOverflowError JavaDoc e) {
125                 DevUtils.warnHandle(e, "ExtractorJS StackOverflowError");
126             }
127             // Set flag to indicate that link extraction is completed.
128
curi.linkExtractorFinished();
129         } finally {
130             // Done w/ the ReplayCharSequence. Close it.
131
if (cs != null) {
132                 try {
133                     cs.close();
134                 } catch (IOException JavaDoc ioe) {
135                     LOGGER.warning(TextUtils.exceptionToString(
136                         "Failed close of ReplayCharSequence.", ioe));
137                 }
138             }
139         }
140     }
141
142     public static long considerStrings(CrawlURI curi, CharSequence JavaDoc cs,
143             CrawlController controller, boolean handlingJSFile) {
144         long foundLinks = 0;
145         Matcher JavaDoc strings =
146             TextUtils.getMatcher(JAVASCRIPT_STRING_EXTRACTOR, cs);
147         while(strings.find()) {
148             CharSequence JavaDoc subsequence =
149                 cs.subSequence(strings.start(2), strings.end(2));
150             Matcher JavaDoc uri =
151                 TextUtils.getMatcher(STRING_URI_DETECTOR, subsequence);
152             if(uri.matches()) {
153                 String JavaDoc string = uri.group();
154                 string = TextUtils.replaceAll(ESCAPED_AMP, string, AMP);
155                 foundLinks++;
156                 try {
157                     if (handlingJSFile) {
158                         curi.createAndAddLinkRelativeToVia(string,
159                             Link.JS_MISC, Link.SPECULATIVE_HOP);
160                     } else {
161                         curi.createAndAddLinkRelativeToBase(string,
162                             Link.JS_MISC, Link.SPECULATIVE_HOP);
163                     }
164                 } catch (URIException e) {
165                     // There may not be a controller (e.g. If we're being run
166
// by the extractor tool).
167
if (controller != null) {
168                         controller.logUriError(e, curi.getUURI(), string);
169                     } else {
170                         LOGGER.info(curi + ", " + string + ": " +
171                             e.getMessage());
172                     }
173                 }
174             } else {
175                foundLinks += considerStrings(curi, subsequence,
176                    controller, handlingJSFile);
177             }
178             TextUtils.recycleMatcher(uri);
179         }
180         TextUtils.recycleMatcher(strings);
181         return foundLinks;
182     }
183
184     /* (non-Javadoc)
185      * @see org.archive.crawler.framework.Processor#report()
186      */

187     public String JavaDoc report() {
188         StringBuffer JavaDoc ret = new StringBuffer JavaDoc();
189         ret.append("Processor: org.archive.crawler.extractor.ExtractorJS\n");
190         ret.append(" Function: Link extraction on JavaScript code\n");
191         ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
192         ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");
193
194         return ret.toString();
195     }
196 }
// Popular Tags (navigation text from the hosting web page; not part of the source)