package org.archive.crawler.url.canonicalize;

import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.archive.crawler.settings.SimpleType;
import org.archive.util.TextUtils;

/**
 * General-purpose canonicalization rule driven by a regular expression and
 * a format string.
 *
 * <p>If the configured regex matches the whole of the passed URL, the URL is
 * rewritten using the configured format, in which <code>$N</code> or
 * <code>${N}</code> is replaced by the text of regex group <code>N</code>.
 * A literal dollar sign is written as <code>\$</code>.  If either attribute
 * cannot be read, or the regex does not match, the URL is returned
 * unchanged.
 */
public class RegexRule
extends BaseRule {

    private static final long serialVersionUID = -2658094415450237847L;

    // NOTE(review): the logger is named after BaseRule, not RegexRule.  Left
    // as-is because renaming it could alter existing logging configuration.
    protected static Logger logger =
        Logger.getLogger(BaseRule.class.getName());
    private static final String DESCRIPTION = "General regex rule. " +
        "Specify a matching regex and a format string used outputting" +
        " result if a match was found.  If problem compiling regex or" +
        " interpreting format, problem is logged, and this rule does" +
        " nothing.  See User Manual for example usage.";
    private static final String ATTR_REGEX = "matching-regex";
    private static final String ATTR_FORMAT = "format";
    private static final String ATTR_COMMENT = "comment";

    /** Group references at or above this index are treated as literal text. */
    private static final int MAX_GROUP_INDEX = 256;

    /**
     * Creates a rule whose defaults match the whole URL and echo it back
     * unchanged (regex <code>(.*)</code>, format <code>${1}</code>).
     *
     * @param name Name of this rule.
     */
    public RegexRule(String name) {
        this(name, "(.*)", "${1}");
    }

    /**
     * Creates a rule with the given default regex and format and registers
     * the regex, format and comment attributes in the rule definition.
     *
     * @param name Name of this rule.
     * @param defaultRegex Default value for the matching-regex attribute.
     * @param defaultFormat Default value for the format attribute.
     */
    protected RegexRule(String name, String defaultRegex,
            String defaultFormat) {
        super(name, DESCRIPTION);
        addElementToDefinition(new SimpleType(ATTR_REGEX,
            "Java regular expression. If the regex matches, we'll rewrite" +
            " the passed url using the specified format pattern.",
            defaultRegex));
        // The original literals were missing the spaces at the seams,
        // producing "matchedurl" and "fornext" in the displayed description.
        addElementToDefinition(
            new SimpleType(ATTR_FORMAT, "Pattern to use rewriting matched " +
                "url. Use '${1}' to match first regex group, '${2}' for " +
                "next group, etc.", defaultFormat));
        addElementToDefinition(new SimpleType(ATTR_COMMENT,
            "Free-text comment on why this rule was added.", ""));
    }

    /**
     * Rewrites <code>url</code> using the configured regex and format.
     *
     * @param url URL to canonicalize.
     * @param context Context passed through to the attribute lookup.
     * @return The rewritten URL, or <code>url</code> itself when the regex
     * or format attribute is unavailable, no matcher could be obtained, or
     * the regex does not match the whole URL.
     */
    public String canonicalize(String url, Object context) {
        String regex = getNullOrAttribute(ATTR_REGEX, context);
        if (regex == null) {
            return url;
        }
        String format = getNullOrAttribute(ATTR_FORMAT, context);
        if (format == null) {
            return url;
        }
        Matcher matcher = TextUtils.getMatcher(regex, url);
        if (matcher == null) {
            // Nothing was borrowed, so there is nothing to hand back.
            return url;
        }
        try {
            if (!matcher.matches()) {
                return url;
            }
            StringBuffer buffer = new StringBuffer(url.length() * 2);
            format(matcher, format, buffer);
            return buffer.toString();
        } finally {
            // Hand the matcher back even if formatting throws.
            TextUtils.recycleMatcher(matcher);
        }
    }

    /**
     * Expands <code>format</code> into <code>buffer</code>, substituting
     * group references with the corresponding group of <code>matcher</code>.
     *
     * <p>Rules applied, left to right:
     * <ul>
     * <li><code>\$</code> emits a literal <code>$</code>; the backslash is
     * dropped.</li>
     * <li><code>$N</code> or <code>${N}</code>, with <code>N</code> a run of
     * digits below {@value #MAX_GROUP_INDEX}, emits group <code>N</code>
     * (nothing if the group is null or does not exist; the latter is
     * logged).</li>
     * <li>Everything else, including a lone or trailing <code>$</code>, an
     * unparseable group number, and a backslash not followed by
     * <code>$</code>, is copied through literally.</li>
     * </ul>
     *
     * @param matcher Matcher that has already matched successfully.
     * @param format Format string to expand.
     * @param buffer Buffer the expansion is appended to.
     */
    protected void format(Matcher matcher, String format,
            StringBuffer buffer) {
        final int length = format.length();
        for (int i = 0; i < length; i++) {
            final char c = format.charAt(i);
            if (c == '\\' && (i + 1) < length
                    && format.charAt(i + 1) == '$') {
                // Don't write the escape character; the '$' that follows is
                // emitted literally on the next iteration.
                continue;
            }
            // Only an unescaped '$' may start a group reference.
            if (c == '$' && (i == 0 || format.charAt(i - 1) != '\\')) {
                int end = appendGroup(matcher, format, i, buffer);
                if (end > i) {
                    // Skip over the whole $N / ${N} item.
                    i = end - 1;
                    continue;
                }
            }
            buffer.append(c);
        }
    }

    /**
     * Tries to expand the group reference starting at the <code>$</code>
     * found at index <code>dollar</code>.
     *
     * @return Index just past the consumed reference, or -1 if the text at
     * <code>dollar</code> is not a usable group reference (in which case
     * nothing was appended).
     */
    private int appendGroup(Matcher matcher, String format, int dollar,
            StringBuffer buffer) {
        final int length = format.length();
        int start = dollar + 1;
        boolean curlyBraceStart = false;
        // Bounds check: '$' may be the last character of the format.
        if (start < length && format.charAt(start) == '{') {
            start++;
            curlyBraceStart = true;
        }
        int end = start;
        while (end < length && Character.isDigit(format.charAt(end))) {
            end++;
        }
        if (end == start) {
            // No digits: not a group reference.
            return -1;
        }
        int groupIndex;
        try {
            groupIndex = Integer.parseInt(format.substring(start, end));
        } catch (NumberFormatException e) {
            // Digit run too long for an int: log and treat as literal text.
            logger.warning("Unparseable group index in format of " + format);
            return -1;
        }
        if (groupIndex < 0 || groupIndex >= MAX_GROUP_INDEX) {
            return -1;
        }
        String g = null;
        try {
            g = matcher.group(groupIndex);
        } catch (IndexOutOfBoundsException e) {
            logger.warning("IndexOutOfBoundsException" +
                " getting group " + groupIndex +
                " from " + matcher.group(0) +
                " with format of " + format);
        }
        if (g != null) {
            buffer.append(g);
        }
        // Skip closing curly bracket if one; the format may end before it.
        if (curlyBraceStart && end < length && format.charAt(end) == '}') {
            end++;
        }
        return end;
    }

    /**
     * Reads a String attribute, converting any failure into a logged
     * <code>null</code>.
     *
     * @param name Name of the attribute to read.
     * @param context Context passed to the attribute lookup.
     * @return The attribute value, or null if the lookup threw.
     */
    protected String getNullOrAttribute(String name, Object context) {
        try {
            return (String) getAttribute(context, name);
        } catch (Exception e) {
            // Log the exception itself: getMessage() may be null and loses
            // the stack trace.
            logger.log(Level.SEVERE, "Failed getting attribute " + name +
                ": " + e.getMessage(), e);
            return null;
        }
    }
}