KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > extractor > RegexpJSLinkExtractor


1 /* Copyright (C) 2003 Internet Archive.
2  *
3  * This file is part of the Heritrix web crawler (crawler.archive.org).
4  *
5  * Heritrix is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser Public License as published by
7  * the Free Software Foundation; either version 2.1 of the License, or
8  * any later version.
9  *
10  * Heritrix is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser Public License
16  * along with Heritrix; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18  *
19  * Created on Nov 17, 2003
20  *
21  * To change the template for this generated file go to
22  * Window>Preferences>Java>Code Generation>Code and Comments
23  */

24 package org.archive.extractor;
25
26 import java.util.LinkedList JavaDoc;
27 import java.util.regex.Matcher JavaDoc;
28 import java.util.regex.Pattern JavaDoc;
29
30 import org.apache.commons.httpclient.URIException;
31 import org.archive.crawler.extractor.Link;
32 import org.archive.net.UURI;
33 import org.archive.net.UURIFactory;
34 import org.archive.util.TextUtils;
35
36 /**
37  * Uses regular expressions to find likely URIs inside Javascript.
38  *
39  * ROUGH DRAFT IN PROGRESS / incomplete... untested...
40  *
41  * @author gojomo
42  */

43 public class RegexpJSLinkExtractor extends CharSequenceLinkExtractor {
44 // private static Logger logger =
45
// Logger.getLogger(RegexpJSLinkExtractor.class.getName());
46

47     static final String JavaDoc AMP = "&";
48     static final String JavaDoc ESCAPED_AMP = "&";
49     static final String JavaDoc WHITESPACE = "\\s";
50
51     // finds whitespace-free strings in Javascript
52
// (areas between paired ' or " characters, possibly backslash-quoted
53
// on the ends, but not in the middle)
54
static final Pattern JavaDoc JAVASCRIPT_STRING_EXTRACTOR = Pattern.compile(
55         "(\\\\{0,8}+(?:\"|\'))(.+?)(?:\\1)");
56
57     // determines whether a string is likely URI
58
// (no whitespace or '<' '>', has an internal dot or some slash,
59
// begins and ends with either '/' or a word-char)
60
static final Pattern JavaDoc STRING_URI_DETECTOR = Pattern.compile(
61         "(?:\\w|[\\.]{0,2}/)[\\S&&[^<>]]*(?:\\.|/)[\\S&&[^<>]]*(?:\\w|/)");
62
63     Matcher JavaDoc strings;
64     LinkedList JavaDoc<Matcher JavaDoc> matcherStack = new LinkedList JavaDoc<Matcher JavaDoc>();
65
66     protected boolean findNextLink() {
67         if(strings==null) {
68              strings = JAVASCRIPT_STRING_EXTRACTOR.matcher(sourceContent);
69         }
70         while(strings!=null) {
71             while(strings.find()) {
72                 CharSequence JavaDoc subsequence =
73                     sourceContent.subSequence(strings.start(2), strings.end(2));
74                 Matcher JavaDoc uri = STRING_URI_DETECTOR.matcher(subsequence);
75                 if ((subsequence.length() <= UURI.MAX_URL_LENGTH) && uri.matches()) {
76                     String JavaDoc string = uri.group();
77                     string = TextUtils.replaceAll(ESCAPED_AMP, string, AMP);
78                     try {
79                         Link link = new Link(source, UURIFactory.getInstance(
80                                 source, string), Link.JS_MISC, Link.SPECULATIVE_HOP);
81                         next.add(link);
82                         return true;
83                     } catch (URIException e) {
84                         extractErrorListener.noteExtractError(e,source,string);
85                     }
86                 } else {
87                    // push current range
88
matcherStack.addFirst(strings);
89                    // start looking inside string
90
strings = JAVASCRIPT_STRING_EXTRACTOR.matcher(subsequence);
91                 }
92             }
93             // continue at enclosing range, if available
94
strings = (Matcher JavaDoc) (matcherStack.isEmpty() ? null : matcherStack.removeFirst());
95         }
96         return false;
97     }
98
99
100     /* (non-Javadoc)
101      * @see org.archive.extractor.LinkExtractor#reset()
102      */

103     public void reset() {
104         super.reset();
105         matcherStack.clear();
106         strings = null;
107     }
108
109     protected static CharSequenceLinkExtractor newDefaultInstance() {
110         return new RegexpJSLinkExtractor();
111     }
112 }
113
Popular Tags