KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > url > canonicalize > RegexRule


1 /* RegexRule
2  *
3  * Created on Oct 6, 2004
4  *
5  * Copyright (C) 2004 Internet Archive.
6  *
7  * This file is part of the Heritrix web crawler (crawler.archive.org).
8  *
9  * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22  */

23 package org.archive.crawler.url.canonicalize;
24
25 import java.util.logging.Logger JavaDoc;
26 import java.util.regex.Matcher JavaDoc;
27
28 import org.archive.crawler.settings.SimpleType;
29 import org.archive.util.TextUtils;
30
31 /**
32  * General conversion rule.
33  * @author stack
34  * @version $Date: 2007/01/13 01:31:28 $, $Revision: 1.7.12.1 $
35  */

36 public class RegexRule
37 extends BaseRule {
38
39     private static final long serialVersionUID = -2658094415450237847L;
40
41     protected static Logger JavaDoc logger =
42         Logger.getLogger(BaseRule.class.getName());
43     private static final String JavaDoc DESCRIPTION = "General regex rule. " +
44         "Specify a matching regex and a format string used outputting" +
45         " result if a match was found. If problem compiling regex or" +
46         " interpreting format, problem is logged, and this rule does" +
47         " nothing. See User Manual for example usage.";
48     private static final String JavaDoc ATTR_REGEX = "matching-regex";
49     private static final String JavaDoc ATTR_FORMAT = "format";
50     private static final String JavaDoc ATTR_COMMENT = "comment";
51     
52     public RegexRule(String JavaDoc name) {
53         this(name, "(.*)", "${1}");
54     }
55     
56     protected RegexRule(String JavaDoc name, String JavaDoc defaultRegex,
57             String JavaDoc defaultFormat) {
58         super(name, DESCRIPTION);
59         addElementToDefinition(new SimpleType(ATTR_REGEX,
60             "Java regular expression. If the regex matches, we'll rewrite" +
61             " the passed url using the specified format pattern.",
62             defaultRegex));
63         addElementToDefinition(
64             new SimpleType(ATTR_FORMAT, "Pattern to use rewriting matched" +
65                 "url. Use '${1}' to match first regex group, '${2}' for" +
66                 "next group, etc.", defaultFormat));
67         addElementToDefinition(new SimpleType(ATTR_COMMENT,
68             "Free-text comment on why this rule was added.", ""));
69     }
70
71     public String JavaDoc canonicalize(String JavaDoc url, Object JavaDoc context) {
72         String JavaDoc regex = getNullOrAttribute(ATTR_REGEX, context);
73         if (regex == null) {
74             return url;
75         }
76         String JavaDoc format = getNullOrAttribute(ATTR_FORMAT, context);
77         if (format == null) {
78             return url;
79         }
80         Matcher JavaDoc matcher = TextUtils.getMatcher(regex, url);
81         String JavaDoc retVal;
82         if (matcher == null || !matcher.matches()) {
83             retVal = url;
84         } else {
85             StringBuffer JavaDoc buffer = new StringBuffer JavaDoc(url.length() * 2);
86             format(matcher, format, buffer);
87             retVal = buffer.toString();
88         }
89         TextUtils.recycleMatcher(matcher);
90         return retVal;
91     }
92     
93     /**
94      * @param matcher Matched matcher.
95      * @param format Output format specifier.
96      * @param buffer Buffer to append output to.
97      */

98     protected void format(Matcher JavaDoc matcher, String JavaDoc format,
99             StringBuffer JavaDoc buffer) {
100         for (int i = 0; i < format.length(); i++) {
101             switch(format.charAt(i)) {
102                 case '\\':
103                     if ((i + 1) < format.length() &&
104                             format.charAt(i + 1) == '$') {
105                         // Don't write the escape character in output.
106
continue;
107                     }
108                     
109                 case '$':
110                     // Check to see if its not been escaped.
111
if (i == 0 || (i > 0 && (format.charAt(i - 1) != '\\'))) {
112                         // Looks like we have a matching group specifier in
113
// our format string, something like '$2' or '${2}'.
114
int start = i + 1;
115                         boolean curlyBraceStart = false;
116                         if (format.charAt(start) == '{') {
117                             start++;
118                             curlyBraceStart = true;
119                         }
120                         int j = start;
121                         for (; j < format.length() &&
122                                 Character.isDigit(format.charAt(j)); j++) {
123                             // While a digit, increment.
124
}
125                         if (j > start) {
126                             int groupIndex = Integer.
127                                 parseInt(format.substring(start, j));
128                             if (groupIndex >= 0 && groupIndex < 256) {
129                                 String JavaDoc g = null;
130                                 try {
131                                     g = matcher.group(groupIndex);
132                                 } catch (IndexOutOfBoundsException JavaDoc e) {
133                                     logger.warning("IndexOutOfBoundsException" +
134                                         " getting group " + groupIndex +
135                                         " from " + matcher.group(0) +
136                                         " with format of " + format);
137                                 }
138                                 if (g != null) {
139                                     buffer.append(g);
140                                 }
141                                 // Skip closing curly bracket if one.
142
if (curlyBraceStart &&
143                                         format.charAt(j) == '}') {
144                                     j++;
145                                 }
146                                 // Update the loop index so that we skip over
147
// the ${x} group item.
148
i = (j - 1);
149                                 // Don't fall through to the default.
150
continue;
151                             }
152                         }
153                         
154                     }
155                     // Let fall through to default rule. The '$' was escaped.
156

157                 default:
158                     buffer.append(format.charAt(i));
159             }
160         }
161     }
162
163     protected String JavaDoc getNullOrAttribute(String JavaDoc name, Object JavaDoc context) {
164         try {
165             return (String JavaDoc)getAttribute(context, name);
166         } catch (Exception JavaDoc e) {
167             logger.severe(e.getMessage());
168             return null;
169         }
170     }
171 }
172
Popular Tags