|                                                                                                              1
 23  package org.archive.crawler.url.canonicalize;
 24
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.PatternSyntaxException;

import org.archive.crawler.settings.SimpleType;
import org.archive.util.TextUtils;
 30
 31
 36  public class RegexRule
 37  extends BaseRule {
 38
 39      private static final long serialVersionUID = -2658094415450237847L;
 40
 41      protected static Logger
  logger = 42          Logger.getLogger(BaseRule.class.getName());
 43      private static final String
  DESCRIPTION = "General regex rule. " + 44          "Specify a matching regex and a format string used outputting" +
 45          " result if a match was found.  If problem compiling regex or" +
 46          " interpreting format, problem is logged, and this rule does" +
 47          " nothing.  See User Manual for example usage.";
 48      private static final String
  ATTR_REGEX = "matching-regex"; 49      private static final String
  ATTR_FORMAT = "format"; 50      private static final String
  ATTR_COMMENT = "comment"; 51
 52      public RegexRule(String
  name) { 53          this(name, "(.*)", "${1}");
 54      }
 55
 56      protected RegexRule(String
  name, String  defaultRegex, 57              String
  defaultFormat) { 58          super(name, DESCRIPTION);
 59          addElementToDefinition(new SimpleType(ATTR_REGEX,
 60              "Java regular expression. If the regex matches, we'll rewrite" +
 61              " the passed url using the specified format pattern.",
 62              defaultRegex));
 63          addElementToDefinition(
 64              new SimpleType(ATTR_FORMAT, "Pattern to use rewriting matched" +
 65                  "url. Use '${1}' to match first regex group, '${2}' for" +
 66                  "next group, etc.", defaultFormat));
 67          addElementToDefinition(new SimpleType(ATTR_COMMENT,
 68              "Free-text comment on why this rule was added.", ""));
 69      }
 70
 71      public String
  canonicalize(String  url, Object  context) { 72          String
  regex = getNullOrAttribute(ATTR_REGEX, context); 73          if (regex == null) {
 74              return url;
 75          }
 76          String
  format = getNullOrAttribute(ATTR_FORMAT, context); 77          if (format == null) {
 78              return url;
 79          }
 80          Matcher
  matcher = TextUtils.getMatcher(regex, url); 81          String
  retVal; 82          if (matcher == null || !matcher.matches()) {
 83              retVal = url;
 84          } else {
 85              StringBuffer
  buffer = new StringBuffer  (url.length() * 2); 86              format(matcher, format, buffer);
 87              retVal = buffer.toString();
 88          }
 89          TextUtils.recycleMatcher(matcher);
 90          return retVal;
 91      }
 92
 93
 98      protected void format(Matcher
  matcher, String  format, 99              StringBuffer
  buffer) { 100         for (int i = 0; i < format.length(); i++) {
 101             switch(format.charAt(i)) {
 102                 case '\\':
 103                     if ((i + 1) < format.length() &&
 104                             format.charAt(i + 1) == '$') {
 105                                                 continue;
 107                     }
 108
 109                 case '$':
 110                                         if (i == 0 || (i > 0 && (format.charAt(i - 1) != '\\'))) {
 112                                                                         int start = i + 1;
 115                         boolean curlyBraceStart = false;
 116                         if (format.charAt(start) == '{') {
 117                             start++;
 118                             curlyBraceStart = true;
 119                         }
 120                         int j = start;
 121                         for (; j < format.length() &&
 122                                 Character.isDigit(format.charAt(j)); j++) {
 123                                                     }
 125                         if (j > start) {
 126                             int groupIndex = Integer.
 127                                 parseInt(format.substring(start, j));
 128                             if (groupIndex >= 0 && groupIndex < 256) {
 129                                 String
  g = null; 130                                 try {
 131                                     g = matcher.group(groupIndex);
 132                                 } catch (IndexOutOfBoundsException
  e) { 133                                     logger.warning("IndexOutOfBoundsException" +
 134                                         " getting group " + groupIndex +
 135                                         " from " + matcher.group(0) +
 136                                         " with format of " + format);
 137                                 }
 138                                 if (g != null) {
 139                                     buffer.append(g);
 140                                 }
 141                                                                 if (curlyBraceStart &&
 143                                         format.charAt(j) == '}') {
 144                                     j++;
 145                                 }
 146                                                                                                 i = (j - 1);
 149                                                                 continue;
 151                             }
 152                         }
 153
 154                     }
 155
 157                 default:
 158                     buffer.append(format.charAt(i));
 159             }
 160         }
 161     }
 162
 163     protected String
  getNullOrAttribute(String  name, Object  context) { 164         try {
 165             return (String
  )getAttribute(context, name); 166         } catch (Exception
  e) { 167             logger.severe(e.getMessage());
 168             return null;
 169         }
 170     }
 171 }
 172
                                                                                                                                                                                                             |                                                                       
 
 
 
 
 
                                                                                   Popular Tags                                                                                                                                                                                              |