1 package org.python.modules; 3 4 import org.python.core.*; 5 import org.apache.oro.text.regex.*; 6 7 8 public class RegexObject extends PyObject 9 { 10 private static Perl5Compiler compiler = new Perl5Compiler(); 11 12 private static synchronized Pattern compile(String pattern, int flags) { 13 try { 14 return compiler.compile(pattern, flags); 15 } 16 catch (MalformedPatternException e) { 17 throw re.ReError(e.getMessage()); 18 } 19 } 20 21 private static synchronized Perl5Matcher getMatcher() { 22 Perl5Matcher matcher = new Perl5Matcher(); 23 return matcher; 25 } 26 27 public String pattern; 28 public int flags; 29 public PyDictionary groupindex; 30 private Pattern code; 31 32 public RegexObject(String pattern, int flags) { 33 this.pattern = pattern; 34 this.flags = flags; 35 groupindex = new PyDictionary(); 36 code = compile(fixPattern(pattern), flags); 37 } 38 39 public MatchObject match(String string) { 40 MatchResult result = doMatch(string); 41 if (result == null) 42 return null; 43 return new MatchObject(this, string, 0, string.length(), result); 44 } 45 46 public MatchObject match(String s, int pos) { 47 return match(s, pos, s.length()); 48 } 49 50 public MatchObject match(String string, int pos, int endpos) { 51 if (endpos > string.length()) 52 endpos = string.length(); 53 if (endpos < pos) 54 endpos = pos; 55 56 MatchResult result = 57 doMatch(new PatternMatcherInput(string, pos, endpos-pos)); 58 if (result == null) 59 return null; 60 return new MatchObject(this, string, pos, endpos, result); 61 } 62 63 private MatchResult doMatch(Object input) { 64 Perl5Matcher matcher = getMatcher(); 65 if (input instanceof String ) { 66 if (!matcher.matchesPrefix((String )input, code)) 67 return null; 68 } 69 else { 70 if (!matcher.matchesPrefix((PatternMatcherInput)input, code)) 71 return null; 72 } 73 return matcher.getMatch(); 74 } 75 76 public MatchObject search(String string) { 77 MatchResult result = doSearch(string); 78 if (result == null) 79 return null; 80 return new MatchObject(this, string, 0, string.length(), result); 81 } 82 83 public MatchObject search(String s, int pos) { 84 return search(s, pos, s.length()); 85 } 86 87 public MatchObject search(String string, int pos, int endpos) { 88 if (endpos > string.length()) 89 endpos = string.length(); 90 if (endpos < pos) 91 endpos = pos; 92 93 MatchResult result = 94 doSearch(new PatternMatcherInput(string, pos, endpos-pos)); 95 if (result == null) 96 return null; 97 return new MatchObject(this, string, pos, endpos, result); 98 } 99 100 private MatchResult doSearch(Object input) { 101 Perl5Matcher matcher = getMatcher(); 102 103 if (input instanceof String ) { 104 if (!matcher.contains((String )input, code)) 105 return null; 106 } 107 else { 108 if (!matcher.contains((PatternMatcherInput)input, code)) 109 return null; 110 } 111 return matcher.getMatch(); 112 } 113 114 public PyString sub(PyObject repl, String string) { 115 return sub(repl, string, 0); 116 } 117 118 public PyString sub(PyObject repl, String string, int count) { 119 return (PyString)subn(repl, string, count).__getitem__(0); 120 } 121 122 public PyTuple subn(PyObject repl, String string) { 123 return subn(repl, string, 0); 124 } 125 126 public PyTuple subn(PyObject repl, String string, int count) { 127 String srepl = null; 129 boolean expand = false; 130 if (repl instanceof PyString) { 131 srepl = repl.toString(); 132 expand = (srepl.indexOf('\\') != -1); 133 } 134 if (count < 0) { 135 throw re.ReError("negative substitution count"); 136 } 137 if (count == 0) { 138 count = Integer.MAX_VALUE; 139 } 140 141 int n=0; 143 StringBuffer buf = new StringBuffer (); 144 Perl5Matcher matcher = getMatcher(); 145 PatternMatcherInput match = new PatternMatcherInput(string); 146 int lastmatch = 0; 147 148 while (n < count && !match.endOfInput()) { 149 if (!matcher.contains(match, code)) 150 break; 151 n++; 152 int offset = match.getMatchBeginOffset(); 153 if (offset > lastmatch) { 155 buf.append(match.substring(lastmatch, offset)); 156 } 157 if (srepl == null) { 158 MatchObject m = new MatchObject(this, string, lastmatch, 159 string.length(), 160 matcher.getMatch()); 161 PyObject ret = repl.__call__(m); 162 buf.append(ret.toString()); 163 } 164 else { 165 if (expand) 166 buf.append(expandMatch(matcher.getMatch(), srepl)); 167 else 168 buf.append(srepl); 169 } 170 lastmatch = match.getMatchEndOffset(); 171 } 172 if (lastmatch < match.getEndOffset()) { 173 buf.append(match.substring(lastmatch, match.getEndOffset())); 174 } 175 return new PyTuple( 176 new PyObject[] { 177 new PyString(buf.toString()), 178 new PyInteger(n) 179 }); 180 } 181 182 public PyList split(String string) { 183 return split(string, 0); 184 } 185 186 public PyList split(String string, int maxsplit) { 187 if (maxsplit < 0) { 188 throw re.ReError("maxsplit < 0"); 189 } 190 if (maxsplit == 0) { 191 maxsplit = Integer.MAX_VALUE; 192 } 193 194 int n=0; 195 Perl5Matcher matcher = getMatcher(); 196 PatternMatcherInput match = new PatternMatcherInput(string); 197 int lastmatch = 0; 198 PyList results = new PyList(); 199 200 while (n < maxsplit && !match.endOfInput()) { 201 if (!matcher.contains(match, code)) 202 break; 203 n++; 204 205 int begin = match.getMatchBeginOffset(); 206 int end = match.getMatchEndOffset(); 207 208 if (begin == end) { 209 continue; 211 } 212 213 results.append(new PyString(match.substring(lastmatch, begin))); 214 215 MatchResult m = matcher.getMatch(); 216 int ngroups = m.groups(); 217 if (ngroups > 1) { 218 for (int j=1; j<ngroups; j++) { 219 String tmp = m.group(j); 220 if (tmp == null) { 221 results.append(Py.None); 222 } 223 else { 224 results.append(new PyString(tmp)); 225 } 226 } 227 } 228 lastmatch = end; 229 } 230 results.append( 231 new PyString(match.substring(lastmatch, match.getEndOffset()))); 232 return results; 233 } 234 235 private int getindex(PyString s) { 236 PyInteger v = (PyInteger)groupindex.__finditem__(s); 237 if (v == null) { 238 try { 239 v = (PyInteger)s.__int__(); 240 } 241 catch (PyException exc) { 242 if (!isname(s.toString())) 243 throw re.ReError("illegal character in group name"); 244 else 245 throw Py.IndexError("group "+s.__repr__() + 246 " is undefined"); 247 } 248 } 249 return v.getValue(); 250 } 251 252 private boolean isdigit(char c) { 253 return '0' <= c && c <= '9'; 254 } 255 256 private boolean isident(char c) { 257 return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || (c == '_'); 258 } 259 260 private boolean isname(String name) { 261 int n = name.length(); 262 if (n <= 0 || !isident(name.charAt(0))) 263 return false; 264 for (int i = 1; i < n; i++) { 265 char c = name.charAt(i); 266 if (!isident(c) && !isdigit(c)) 267 return false; 268 } 269 return true; 270 } 271 272 private String fixPattern(String pattern) { 273 char[] chars = pattern.toCharArray(); 274 275 int index=0; 276 int group=1; 277 int lasti=0; 278 int n = chars.length; 279 280 StringBuffer buf = new StringBuffer (); 281 282 while (index < n) { 283 if (chars[index++] == '(') { 284 if (index > 2 && chars[index-2] == '\\') 286 continue; 287 288 if (index < n && chars[index] == '?') { 289 index++; 290 if (index < n && chars[index] == 'P') { 291 index++; 292 if (index == n) 293 break; 294 char c = chars[index++]; 295 int start = index; 296 if (c == '<') { 297 while (index < n && chars[index] != '>') 298 index++; 299 if (index == n) 300 throw re.ReError("unmatched <"); 301 String name = 302 new String (chars, start, index-start); 303 if (!isname(name)) 305 throw re.ReError("illegal character in " + 306 "group name"); 307 groupindex.__setitem__(new PyString(name), 308 new PyInteger(group)); 309 buf.append(chars, lasti, start-3-lasti); 310 index++; 311 lasti = index; 312 group++; 313 continue; 314 } 315 else { 316 if (c == '=') { 317 while (index < n && chars[index] != ')') { 318 c = chars[index]; 319 if (Character.isJavaIdentifierPart(c) && 320 c != '$') 321 { 322 index++; 323 } 324 else { 325 throw re.ReError( 326 "illegal character in symbol"); 327 } 328 } 329 if (index == n) 330 throw re.ReError("?P= not closed"); 331 if (!(Character.isJavaIdentifierStart( 332 chars[start]))) 333 { 334 throw re.ReError( 335 "illegal character starting symbol"); 336 } 337 String name = new String (chars, start, 338 index-start); 339 PyString pname = new PyString(name); 340 buf.append(chars, lasti, start-4-lasti); 341 buf.append('\\'); 342 buf.append(getindex(pname)); 343 index++; 344 lasti=index; 345 } 346 else { 347 throw re.ReError("invalid ?P grouping"); 348 } 349 } 350 } 351 else { 352 if (chars[index] == ':') 353 continue; 354 while (index < n && chars[index] != ')') 355 index++; 356 } 357 } 358 else { 359 group++; 360 } 361 } 362 } 363 if (lasti > 0) { 364 buf.append(chars, lasti, n-lasti); 365 return buf.toString(); 367 } 368 else { 369 return pattern; 371 } 372 } 373 374 public String expandMatch(MatchResult match, String repl) { 375 char[] chars = repl.toCharArray(); 376 377 int index=0; 378 int lasti=0; 379 int n = chars.length; 380 381 StringBuffer buf = new StringBuffer (); 382 try { 383 while (index<n) { 384 386 if (chars[index++] == '\\') { 387 char ch = 0; 388 switch (chars[index++]) { 389 case '\\': 390 ch = '\\'; break; 391 case 'E': 392 case 'G': 393 case 'L': 394 case 'Q': 395 case 'U': 396 case 'l': 397 case 'u': 398 throw re.ReError("\\"+chars[index-1]+ 399 " is not allowed"); 400 case 'n': 401 ch = '\n'; break; 402 case 't': 403 ch = '\t'; break; 404 case 'r': 405 ch = '\r'; break; 406 case 'v': 407 ch = '\013'; break; 408 case 'f': 409 ch = '\f'; break; 410 case 'a': 411 ch = '\007'; break; 412 case 'b': 413 ch = '\b'; break; 414 415 case 'g': 416 if (chars[index++] != '<') { 417 throw re.ReError( 418 "missing < in symbolic reference"); 419 } 420 int start = index; 421 while (index < n && chars[index] != '>') 422 index++; 423 if (index == n) { 424 throw re.ReError("unfinished symbolic reference"); 425 } 426 index++; 427 buf.append(chars, lasti, start-3-lasti); 428 PyString str = new PyString(new String (chars, start, 429 index-1-start)); 430 String tmp = match.group(getindex(str)); 431 if (tmp == null) { 432 throw re.ReError("group not in match: "+str); 433 } 434 buf.append(tmp); 435 lasti=index; 436 continue; 437 438 case '1': 439 case '2': 440 case '3': 441 case '4': 442 case '5': 443 case '6': 444 case '7': 445 case '8': 446 case '9': 447 start = index-2; 448 int v = chars[index-1]-'0'; 449 char ch1; 450 if (index<n) { 451 ch = chars[index]; 452 if (ch >= '0' && ch <= '9') { 453 index++; 454 if (index < n && ch <= '7') { 455 ch1 = chars[index]; 456 if (ch1 >= '0' && ch1 <= '7') { 457 v = v*64 + 458 (ch - '0')*8 + 459 (ch1 - '0'); 460 buf.append(chars, lasti, 461 index-2-lasti); 462 buf.append((char)v); 463 index++; 464 lasti=index; 465 } 466 } 467 v = v*10 + (ch - '0'); 468 } 469 } 470 buf.append(chars, lasti, start-lasti); 471 tmp = match.group(v); 472 if (tmp == null) { 473 throw re.ReError("group not in match: "+v); 474 } 475 buf.append(tmp); 476 lasti=index; 477 continue; 478 default: 479 continue; 480 } 481 buf.append(chars, lasti, index-2-lasti); 482 buf.append(ch); 483 lasti=index; 484 } 485 } 486 } 487 catch (ArrayIndexOutOfBoundsException exc) { 488 throw re.ReError("invalid expression"); 489 } 490 if (lasti > 0) { 491 buf.append(chars, lasti, n-lasti); 492 return buf.toString(); 493 } 494 else { 495 return repl; 496 } 497 } 498 499 public PyList findall(String string) { 500 Perl5Matcher matcher = getMatcher(); 501 PatternMatcherInput match = new PatternMatcherInput(string); 502 PyList ret = new PyList(); 503 504 while (matcher.contains(match, code)) { 505 MatchResult result = matcher.getMatch(); 506 int groups = result.groups(); 507 508 if (groups == 1) 509 ret.append(new PyString(result.group(0))); 511 else if (groups == 2) 512 ret.append(new PyString(result.group(1))); 515 else { 516 PyString[] submatches = new PyString[groups-1]; 518 for (int g = 1; g < groups; g++) 519 submatches[g-1] = new PyString(result.group(g)); 520 PyTuple tup = new PyTuple(submatches); 521 ret.append(tup); 522 } 523 } 524 return ret; 525 } 526 } 527 | Popular Tags |