KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > extractor > RegexpCSSLinkExtractor


1 /*
2  * ExtractorCSS
3  *
4  * $Id: RegexpCSSLinkExtractor.java,v 1.5.12.1 2007/01/13 01:31:31 stack-sf Exp $
5  *
6  * Created on Mar 29, 2005
7  *
8  * Copyright (C) 2005 Internet Archive.
9  *
10  * This file is part of the Heritrix web crawler (crawler.archive.org).
11  *
12  * Heritrix is free software; you can redistribute it and/or modify
13  * it under the terms of the GNU Lesser Public License as published by
14  * the Free Software Foundation; either version 2.1 of the License, or
15  * any later version.
16  *
17  * Heritrix is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20  * GNU Lesser Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser Public License
23  * along with Heritrix; if not, write to the Free Software
24  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25  */

26
27 package org.archive.extractor;
28
29 import java.util.regex.Matcher JavaDoc;
30
31 import org.apache.commons.httpclient.URIException;
32 import org.archive.crawler.extractor.Link;
33 import org.archive.net.UURIFactory;
34 import org.archive.util.DevUtils;
35 import org.archive.util.TextUtils;
36
37 /**
38  * This extractor is parsing URIs from CSS type files.
39  * The format of a CSS URL value is 'url(' followed by optional white space
40  * followed by an optional single quote (') or double quote (") character
41  * followed by the URL itself followed by an optional single quote (') or
42  * double quote (") character followed by optional white space followed by ')'.
43  * Parentheses, commas, white space characters, single quotes (') and double
44  * quotes (") appearing in a URL must be escaped with a backslash:
45  * '\(', '\)', '\,'. Partial URLs are interpreted relative to the source of
46  * the style sheet, not relative to the document. <a HREF="http://www.w3.org/TR/REC-CSS1#url">
47  * Source: www.w3.org</a>
48  *
49  * ROUGH DRAFT IN PROGRESS / incomplete... untested... major changes likely
50  *
51  * @author igor gojomo
52  *
53  **/

54
55 public class RegexpCSSLinkExtractor extends CharSequenceLinkExtractor {
56
57     // private static Logger logger =
58
// Logger.getLogger(RegexpCSSLinkExtractor.class.getName());
59

60     private static String JavaDoc ESCAPED_AMP = "&amp";
61     // CSS escapes: "Parentheses, commas, whitespace characters, single
62
// quotes (') and double quotes (") appearing in a URL must be
63
// escaped with a backslash"
64
static final String JavaDoc CSS_BACKSLASH_ESCAPE = "\\\\([,'\"\\(\\)\\s])";
65
66     protected Matcher JavaDoc uris;
67
68     /**
69      * CSS URL extractor pattern.
70      *
71      * This pattern extracts URIs for CSS files
72      **/

73     static final String JavaDoc CSS_URI_EXTRACTOR =
74     "(?:@import (?:url[(]|)|url[(])\\s*([\\\"\']?)([^\\\"\'].*?)\\1\\s*[);]";
75
76     protected boolean findNextLink() {
77         if (uris == null) {
78             uris = TextUtils.getMatcher(CSS_URI_EXTRACTOR, sourceContent);
79             // NOTE: this matcher can't be recycled in this method because
80
// it is reused on rentry
81
}
82         String JavaDoc cssUri;
83         try {
84             while (uris.find()) {
85                 cssUri = uris.group(2);
86                 // TODO: Escape more HTML Entities.
87
cssUri = TextUtils.replaceAll(ESCAPED_AMP, cssUri, "&");
88                 // Remove backslashes when used as escape character in CSS URL
89
cssUri = TextUtils.replaceAll(CSS_BACKSLASH_ESCAPE, cssUri, "$1");
90                 // TODO: handle relative URIs?
91
try {
92                     Link link = new Link(source, UURIFactory.getInstance(base,
93                             cssUri), Link.EMBED_MISC, Link.EMBED_HOP);
94                     next.addLast(link);
95                 } catch (URIException e) {
96                     extractErrorListener.noteExtractError(e, source, cssUri);
97                 }
98                 return true;
99             }
100         } catch (StackOverflowError JavaDoc e) {
101             DevUtils.warnHandle(e, "RegexpCSSLinkExtractor StackOverflowError");
102         }
103         return false;
104     }
105
106     public void reset() {
107         super.reset();
108         TextUtils.recycleMatcher(uris);
109         uris = null;
110     }
111     
112     protected static CharSequenceLinkExtractor newDefaultInstance() {
113         return new RegexpCSSLinkExtractor();
114     }
115 }
116
Popular Tags