1 64 65 package com.jcorporate.expresso.ext.regexp; 66 67 123 124 import java.util.Vector ; 125 126 127 416 public class RE { 417 418 421 public static final int MATCH_NORMAL = 0x0000; 422 423 426 public static final int MATCH_CASEINDEPENDENT = 0x0001; 427 428 431 public static final int MATCH_MULTILINE = 0x0002; 432 433 446 static final char OP_END = 'E'; static final char OP_BOL = '^'; static final char OP_EOL = '$'; static final char OP_ANY = '.'; static final char OP_ANYOF = '['; static final char OP_BRANCH = '|'; static final char OP_ATOM = 'A'; static final char OP_STAR = '*'; static final char OP_PLUS = '+'; static final char OP_MAYBE = '?'; static final char OP_ESCAPE = '\\'; static final char OP_OPEN = '('; static final char OP_CLOSE = ')'; static final char OP_BACKREF = '#'; static final char OP_GOTO = 'G'; static final char OP_NOTHING = 'N'; static final char OP_RELUCTANTSTAR = '8'; static final char OP_RELUCTANTPLUS = '='; static final char OP_RELUCTANTMAYBE = '/'; static final char OP_POSIXCLASS = 'P'; 469 static final char E_ALNUM = 'w'; static final char E_NALNUM = 'W'; static final char E_BOUND = 'b'; static final char E_NBOUND = 'B'; static final char E_SPACE = 's'; static final char E_NSPACE = 'S'; static final char E_DIGIT = 'd'; static final char E_NDIGIT = 'D'; 479 static final char POSIX_CLASS_ALNUM = 'w'; static final char POSIX_CLASS_ALPHA = 'a'; static final char POSIX_CLASS_BLANK = 'b'; static final char POSIX_CLASS_CNTRL = 'c'; static final char POSIX_CLASS_DIGIT = 'd'; static final char POSIX_CLASS_GRAPH = 'g'; static final char POSIX_CLASS_LOWER = 'l'; static final char POSIX_CLASS_PRINT = 'p'; static final char POSIX_CLASS_PUNCT = '!'; static final char POSIX_CLASS_SPACE = 's'; static final char POSIX_CLASS_UPPER = 'u'; static final char POSIX_CLASS_XDIGIT = 'x'; static final char POSIX_CLASS_JSTART = 'j'; static final char POSIX_CLASS_JPART = 'k'; 495 static final int maxNode = 65536; static final int maxParen = 16; 499 static final int offsetOpcode = 0; static final int offsetOpdata = 1; static final int offsetNext = 2; static final int nodeSize = 3; 505 508 static final String NEWLINE = System.getProperty("line.separator"); 509 510 REProgram program; CharacterIterator search; int idx; int matchFlags; 516 int parenCount; int start0; int end0; int start1; int end1; int start2; int end2; int[] startn; int[] endn; 527 int[] startBackref; int[] endBackref; 531 535 public static final int REPLACE_ALL = 0x0000; 536 537 541 public static final int REPLACE_FIRSTONLY = 0x0001; 542 543 547 public RE() { 548 this((REProgram) null, MATCH_NORMAL); 549 } 550 551 558 public RE(REProgram program) { 559 this(program, MATCH_NORMAL); 560 } 561 562 580 public RE(REProgram program, int matchFlags) { 581 setProgram(program); 582 setMatchFlags(matchFlags); 583 } 584 585 594 public RE(String pattern) 595 throws RESyntaxException { 596 this(pattern, MATCH_NORMAL); 597 } 598 599 609 public RE(String pattern, int matchFlags) 610 throws RESyntaxException { 611 this(new RECompiler().compile(pattern)); 612 setMatchFlags(matchFlags); 613 } 614 615 618 private final void allocParens() { 619 620 startn = new int[maxParen]; 622 endn = new int[maxParen]; 623 624 for (int i = 0; i < maxParen; i++) { 626 startn[i] = -1; 627 endn[i] = -1; 628 } 629 } 630 631 645 public int getMatchFlags() { 646 return matchFlags; 647 } 648 649 655 public String getParen(int which) { 656 if (which < parenCount) { 657 return search.substring(getParenStart(which), getParenEnd(which)); 658 } 659 660 return null; 661 } 662 663 668 public int getParenCount() { 669 return parenCount; 670 } 671 672 678 public final int getParenEnd(int which) { 679 if (which < parenCount) { 680 switch (which) { 681 case 0: 682 return end0; 683 684 case 1: 685 return end1; 686 687 case 2: 688 return end2; 689 690 default: 691 692 if (endn == null) { 693 allocParens(); 694 } 695 696 return endn[which]; 697 } 698 } 699 700 return -1; 701 } 702 703 709 public final int getParenLength(int which) { 710 if (which < parenCount) { 711 return getParenEnd(which) - getParenStart(which); 712 } 713 714 return -1; 715 } 716 717 723 public final int getParenStart(int which) { 724 if (which < parenCount) { 725 switch (which) { 726 case 0: 727 return start0; 728 729 case 1: 730 return start1; 731 732 case 2: 733 return start2; 734 735 default: 736 737 if (startn == null) { 738 allocParens(); 739 } 740 741 return startn[which]; 742 } 743 } 744 745 return -1; 746 } 747 748 754 public REProgram getProgram() { 755 return program; 756 } 757 758 767 public String [] grep(Object [] search) { 768 769 Vector v = new Vector (); 771 772 for (int i = 0; i < search.length; i++) { 774 775 String s = search[i].toString(); 777 778 if (match(s)) { 780 v.addElement(s); 781 } 782 } 783 784 String [] ret = new String [v.size()]; 786 v.copyInto(ret); 787 788 return ret; 789 } 790 791 798 protected void internalError(String s) 799 throws Error { 800 throw new Error ("RE internal error: " + s); 801 } 802 803 806 private boolean isNewline(int i) { 807 if (i < NEWLINE.length() - 1) { 808 return false; 809 } 810 if (search.charAt(i) == '\n') { 811 return true; 812 } 813 for (int j = NEWLINE.length() - 1; j >= 0; j--, i--) { 814 if (NEWLINE.charAt(j) != search.charAt(i)) { 815 return false; 816 } 817 } 818 819 return true; 820 } 821 822 830 public boolean match(CharacterIterator search, int i) { 831 832 if (program == null) { 834 835 internalError("No RE program to run!"); 838 } 839 840 this.search = search; 842 843 if (program.prefix == null) { 845 846 for (; !search.isEnd(i - 1); i++) { 848 849 if (matchAt(i)) { 851 return true; 852 } 853 } 854 855 return false; 856 } else { 857 858 char[] prefix = program.prefix; 860 861 for (; !search.isEnd(i + prefix.length - 1); i++) { 862 863 if (search.charAt(i) == prefix[0]) { 865 866 int firstChar = i++; 868 int k; 869 870 for (k = 1; k < prefix.length;) { 871 872 if (search.charAt(i++) != prefix[k++]) { 874 break; 875 } 876 } 877 if (k == prefix.length) { 879 880 if (matchAt(firstChar)) { 882 return true; 883 } 884 } 885 886 i = firstChar; 888 } 889 } 890 891 return false; 892 } 893 } 894 895 901 public boolean match(String search) { 902 return match(search, 0); 903 } 904 905 913 public boolean match(String search, int i) { 914 return match(new StringCharacterIterator(search), i); 915 } 916 917 925 protected boolean matchAt(int i) { 926 927 start0 = -1; 929 end0 = -1; 930 start1 = -1; 931 end1 = -1; 932 start2 = -1; 933 end2 = -1; 934 startn = null; 935 endn = null; 936 parenCount = 1; 937 setParenStart(0, i); 938 939 if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) { 941 startBackref = new int[maxParen]; 942 endBackref = new int[maxParen]; 943 } 944 945 int idx; 947 948 if ((idx = matchNodes(0, maxNode, i)) != -1) { 949 setParenEnd(0, idx); 950 951 return true; 952 } 953 954 parenCount = 0; 956 957 return false; 958 } 959 960 969 protected int matchNodes(int firstNode, int lastNode, int idxStart) { 970 971 int idx = idxStart; 973 974 int next; 976 int opcode; 977 int opdata; 978 int idxNew; 979 char[] instruction = program.instruction; 980 981 for (int node = firstNode; node < lastNode;) { 982 opcode = instruction[node + offsetOpcode]; 983 next = node + (short) instruction[node + offsetNext]; 984 opdata = instruction[node + offsetOpdata]; 985 986 switch (opcode) { 987 case OP_RELUCTANTMAYBE: 988 { 989 int once = 0; 990 991 do { 992 993 if ((idxNew = matchNodes(next, maxNode, idx)) != -1) { 995 return idxNew; 996 } 997 } while ((once++ == 0) && (idx = matchNodes(node + nodeSize, 998 next, idx)) != -1); 999 1000 return -1; 1001 } 1002 case OP_RELUCTANTPLUS: 1003 while ((idx = matchNodes(node + nodeSize, next, idx)) != -1) { 1004 1005 if ((idxNew = matchNodes(next, maxNode, idx)) != -1) { 1007 return idxNew; 1008 } 1009 } 1010 1011 return -1; 1012 1013 case OP_RELUCTANTSTAR: 1014 do { 1015 1016 if ((idxNew = matchNodes(next, maxNode, idx)) != -1) { 1018 return idxNew; 1019 } 1020 } while ((idx = matchNodes(node + nodeSize, next, idx)) != -1); 1021 1022 return -1; 1023 1024 case OP_OPEN: 1025 1026 if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) { 1028 startBackref[opdata] = idx; 1029 } 1030 if ((idxNew = matchNodes(next, maxNode, idx)) != -1) { 1031 1032 if ((opdata + 1) > parenCount) { 1034 parenCount = opdata + 1; 1035 } 1036 if (getParenStart(opdata) == -1) { 1038 setParenStart(opdata, idx); 1039 } 1040 } 1041 1042 return idxNew; 1043 1044 case OP_CLOSE: 1045 1046 if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) { 1048 endBackref[opdata] = idx; 1049 } 1050 if ((idxNew = matchNodes(next, maxNode, idx)) != -1) { 1051 1052 if ((opdata + 1) > parenCount) { 1054 parenCount = opdata + 1; 1055 } 1056 if (getParenEnd(opdata) == -1) { 1058 setParenEnd(opdata, idx); 1059 } 1060 } 1061 1062 return idxNew; 1063 1064 case OP_BACKREF: 1065 { 1066 1067 int s = startBackref[opdata]; 1069 int e = endBackref[opdata]; 1070 1071 if (s == -1 || e == -1) { 1073 return -1; 1074 } 1075 if (s == e) { 1077 break; 1078 } 1079 1080 int l = e - s; 1082 1083 if (search.isEnd(idx + l)) { 1085 return -1; 1086 } 1087 if ((matchFlags & MATCH_CASEINDEPENDENT) != 0) { 1089 1090 for (int i = 0; i < l; i++) { 1092 if (Character.toLowerCase(search.charAt(idx++)) != Character.toLowerCase(search.charAt( 1093 s + i))) { 1094 return -1; 1095 } 1096 } 1097 } else { 1098 1099 for (int i = 0; i < l; i++) { 1101 if (search.charAt(idx++) != search.charAt(s + i)) { 1102 return -1; 1103 } 1104 } 1105 } 1106 } 1107 1108 break; 1109 1110 case OP_BOL: 1111 1112 if (idx != 0) { 1114 1115 if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE) { 1117 1118 if (idx <= 0 || !isNewline(idx - 1)) { 1120 return -1; 1121 } else { 1122 break; 1123 } 1124 } 1125 1126 return -1; 1127 } 1128 1129 break; 1130 1131 case OP_EOL: 1132 1133 if (!search.isEnd(0) && !search.isEnd(idx)) { 1135 1136 if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE) { 1138 1139 if (!isNewline(idx)) { 1141 return -1; 1142 } else { 1143 break; 1144 } 1145 } 1146 1147 return -1; 1148 } 1149 1150 break; 1151 1152 case OP_ESCAPE: 1153 1154 switch (opdata) { 1156 1157 case E_NBOUND: 1159 case E_BOUND: 1160 { 1161 char cLast = ((idx == getParenStart(0)) 1162 ? '\n' : search.charAt(idx - 1)); 1163 char cNext = ((search.isEnd(idx)) 1164 ? '\n' : search.charAt(idx)); 1165 1166 if ((Character.isLetterOrDigit(cLast) == Character.isLetterOrDigit(cNext)) == (opdata == E_BOUND)) { 1167 return -1; 1168 } 1169 } 1170 1171 break; 1172 1173 case E_ALNUM: 1175 case E_NALNUM: 1176 case E_DIGIT: 1177 case E_NDIGIT: 1178 case E_SPACE: 1179 case E_NSPACE: 1180 1181 if (search.isEnd(idx)) { 1183 return -1; 1184 } 1185 switch (opdata) { 1187 case E_ALNUM: 1188 case E_NALNUM: 1189 if (!(Character.isLetterOrDigit(search.charAt(idx)) == (opdata == E_ALNUM))) { 1190 return -1; 1191 } 1192 1193 break; 1194 1195 case E_DIGIT: 1196 case E_NDIGIT: 1197 if (!(Character.isDigit(search.charAt(idx)) == (opdata == E_DIGIT))) { 1198 return -1; 1199 } 1200 1201 break; 1202 1203 case E_SPACE: 1204 case E_NSPACE: 1205 if (!(Character.isWhitespace(search.charAt(idx)) == (opdata == E_SPACE))) { 1206 return -1; 1207 } 1208 1209 break; 1210 } 1211 1212 idx++; 1213 break; 1214 1215 default: 1216 internalError("Unrecognized escape '" + opdata + 1217 "'"); 1218 } 1219 1220 break; 1221 1222 case OP_ANY: 1223 1224 if (search.isEnd(idx) || search.charAt(idx++) == '\n') { 1226 return -1; 1227 } 1228 1229 break; 1230 1231 case OP_ATOM: 1232 { 1233 1234 if (search.isEnd(idx)) { 1236 return -1; 1237 } 1238 1239 int lenAtom = opdata; 1241 int startAtom = node + nodeSize; 1242 1243 if (search.isEnd(lenAtom + idx - 1)) { 1245 return -1; 1246 } 1247 if ((matchFlags & MATCH_CASEINDEPENDENT) != 0) { 1249 for (int i = 0; i < lenAtom; i++) { 1250 if (Character.toLowerCase(search.charAt(idx++)) != Character.toLowerCase( 1251 instruction[startAtom + i])) { 1252 return -1; 1253 } 1254 } 1255 } else { 1256 for (int i = 0; i < lenAtom; i++) { 1257 if (search.charAt(idx++) != instruction[startAtom + i]) { 1258 return -1; 1259 } 1260 } 1261 } 1262 } 1263 1264 break; 1265 1266 case OP_POSIXCLASS: 1267 { 1268 1269 if (search.isEnd(idx)) { 1271 return -1; 1272 } 1273 switch (opdata) { 1274 case POSIX_CLASS_ALNUM: 1275 if (!Character.isLetterOrDigit(search.charAt(idx))) { 1276 return -1; 1277 } 1278 1279 break; 1280 1281 case POSIX_CLASS_ALPHA: 1282 if (!Character.isLetter(search.charAt(idx))) { 1283 return -1; 1284 } 1285 1286 break; 1287 1288 case POSIX_CLASS_DIGIT: 1289 if (!Character.isDigit(search.charAt(idx))) { 1290 return -1; 1291 } 1292 1293 break; 1294 1295 case POSIX_CLASS_BLANK: if (!Character.isSpaceChar(search.charAt(idx))) { 1297 return -1; 1298 } 1299 1300 break; 1301 1302 case POSIX_CLASS_SPACE: 1303 if (!Character.isWhitespace(search.charAt(idx))) { 1304 return -1; 1305 } 1306 1307 break; 1308 1309 case POSIX_CLASS_CNTRL: 1310 if (Character.getType(search.charAt(idx)) != Character.CONTROL) { 1311 return -1; 1312 } 1313 1314 break; 1315 1316 case POSIX_CLASS_GRAPH: switch (Character.getType(search.charAt(idx))) { 1318 case Character.MATH_SYMBOL: 1319 case Character.CURRENCY_SYMBOL: 1320 case Character.MODIFIER_SYMBOL: 1321 case Character.OTHER_SYMBOL: 1322 break; 1323 1324 default: 1325 return -1; 1326 } 1327 1328 break; 1329 1330 case POSIX_CLASS_LOWER: 1331 if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER) { 1332 return -1; 1333 } 1334 1335 break; 1336 1337 case POSIX_CLASS_UPPER: 1338 if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER) { 1339 return -1; 1340 } 1341 1342 break; 1343 1344 case POSIX_CLASS_PRINT: 1345 if (Character.getType(search.charAt(idx)) == Character.CONTROL) { 1346 return -1; 1347 } 1348 1349 break; 1350 1351 case POSIX_CLASS_PUNCT: 1352 { 1353 int type = Character.getType(search.charAt(idx)); 1354 1355 switch (type) { 1356 case Character.DASH_PUNCTUATION: 1357 case Character.START_PUNCTUATION: 1358 case Character.END_PUNCTUATION: 1359 case Character.CONNECTOR_PUNCTUATION: 1360 case Character.OTHER_PUNCTUATION: 1361 break; 1362 1363 default: 1364 return -1; 1365 } 1366 } 1367 1368 break; 1369 1370 case POSIX_CLASS_XDIGIT: { 1372 boolean isXDigit = ((search.charAt(idx) >= '0' && 1373 search.charAt(idx) <= '9') || 1374 (search.charAt(idx) >= 'a' && 1375 search.charAt(idx) <= 'f') || 1376 (search.charAt(idx) >= 'A' && 1377 search.charAt(idx) <= 'F')); 1378 1379 if (!isXDigit) { 1380 return -1; 1381 } 1382 } 1383 1384 break; 1385 1386 case POSIX_CLASS_JSTART: 1387 if (!Character.isJavaIdentifierStart(search.charAt(idx))) { 1388 return -1; 1389 } 1390 1391 break; 1392 1393 case POSIX_CLASS_JPART: 1394 if (!Character.isJavaIdentifierPart(search.charAt(idx))) { 1395 return -1; 1396 } 1397 1398 break; 1399 1400 default: 1401 internalError("Bad posix class"); 1402 break; 1403 } 1404 1405 idx++; 1407 } 1408 1409 break; 1410 1411 case OP_ANYOF: 1412 { 1413 1414 if (search.isEnd(idx)) { 1416 return -1; 1417 } 1418 1419 char c = search.charAt(idx); 1421 boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0; 1422 1423 if (caseFold) { 1424 c = Character.toLowerCase(c); 1425 } 1426 1427 int idxRange = node + nodeSize; 1429 int idxEnd = idxRange + (opdata * 2); 1430 boolean match = false; 1431 1432 for (int i = idxRange; i < idxEnd;) { 1433 1434 char s = instruction[i++]; 1436 char e = instruction[i++]; 1437 1438 if (caseFold) { 1440 s = Character.toLowerCase(s); 1441 e = Character.toLowerCase(e); 1442 } 1443 if (c >= s && c <= e) { 1445 match = true; 1446 break; 1447 } 1448 } 1449 if (!match) { 1451 return -1; 1452 } 1453 1454 idx++; 1455 } 1456 1457 break; 1458 1459 case OP_BRANCH: 1460 { 1461 1462 if (instruction[next + offsetOpcode] != OP_BRANCH) { 1464 1465 node += nodeSize; 1467 continue; 1468 } 1469 1470 short nextBranch; 1472 1473 do { 1474 1475 if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1) { 1477 return idxNew; 1478 } 1479 1480 nextBranch = (short) instruction[node + offsetNext]; 1482 node += nextBranch; 1483 } while (nextBranch != 0 && (instruction[node + offsetOpcode] == OP_BRANCH)); 1484 return -1; 1486 } 1487 case OP_NOTHING: 1488 case OP_GOTO: 1489 1490 break; 1492 1493 case OP_END: 1494 1495 setParenEnd(0, idx); 1497 1498 return idx; 1499 1500 default: 1501 1502 internalError("Invalid opcode '" + opcode + "'"); 1504 } 1505 1506 node = next; 1508 } 1509 1510 internalError("Corrupt program"); 1512 1513 return -1; 1514 } 1515 1516 1529 public void setMatchFlags(int matchFlags) { 1530 this.matchFlags = matchFlags; 1531 } 1532 1533 1539 protected final void setParenEnd(int which, int i) { 1540 if (which < parenCount) { 1541 switch (which) { 1542 case 0: 1543 end0 = i; 1544 break; 1545 1546 case 1: 1547 end1 = i; 1548 break; 1549 1550 case 2: 1551 end2 = i; 1552 break; 1553 1554 default: 1555 1556 if (endn == null) { 1557 allocParens(); 1558 } 1559 1560 endn[which] = i; 1561 break; 1562 } 1563 } 1564 } 1565 1566 1572 protected final void setParenStart(int which, int i) { 1573 if (which < parenCount) { 1574 switch (which) { 1575 case 0: 1576 start0 = i; 1577 break; 1578 1579 case 1: 1580 start1 = i; 1581 break; 1582 1583 case 2: 1584 start2 = i; 1585 break; 1586 1587 default: 1588 1589 if (startn == null) { 1590 allocParens(); 1591 } 1592 1593 startn[which] = i; 1594 break; 1595 } 1596 } 1597 } 1598 1599 1606 public void setProgram(REProgram program) { 1607 this.program = program; 1608 } 1609 1610 1616 public static String simplePatternToFullRegularExpression(String pattern) { 1617 StringBuffer buf = new StringBuffer (); 1618 1619 for (int i = 0; i < pattern.length(); i++) { 1620 char c = pattern.charAt(i); 1621 1622 switch (c) { 1623 case '*': 1624 buf.append(".*"); 1625 break; 1626 1627 case '.': 1628 case '[': 1629 case ']': 1630 case '\\': 1631 case '+': 1632 case '?': 1633 case '{': 1634 case '}': 1635 case '$': 1636 case '^': 1637 case '|': 1638 case '(': 1639 case ')': 1640 buf.append('\\'); 1641 1642 default: 1643 buf.append(c); 1644 break; 1645 } 1646 } 1647 1648 return buf.toString(); 1649 } 1650 1651 1661 public String [] split(String s) { 1662 1663 Vector v = new Vector (); 1665 1666 int pos = 0; 1668 int len = s.length(); 1669 1670 while (pos < len && match(s, pos)) { 1672 1673 int start = getParenStart(0); 1675 1676 int newpos = getParenEnd(0); 1678 1679 if (newpos == pos) { 1681 v.addElement(s.substring(pos, start + 1)); 1682 newpos++; 1683 } else { 1684 v.addElement(s.substring(pos, start)); 1685 } 1686 1687 pos = newpos; 1689 } 1690 1691 String remainder = s.substring(pos); 1693 1694 if (remainder.length() != 0) { 1695 v.addElement(remainder); 1696 } 1697 1698 String [] ret = new String [v.size()]; 1700 v.copyInto(ret); 1701 1702 return ret; 1703 } 1704 1705 1719 public String subst(String substituteIn, String substitution) { 1720 return subst(substituteIn, substitution, REPLACE_ALL); 1721 } 1722 1723 1741 public String subst(String substituteIn, String substitution, int flags) { 1742 1743 StringBuffer ret = new StringBuffer (); 1745 1746 int pos = 0; 1748 int len = substituteIn.length(); 1749 1750 while (pos < len && match(substituteIn, pos)) { 1752 1753 ret.append(substituteIn.substring(pos, getParenStart(0))); 1755 1756 ret.append(substitution); 1758 1759 int newpos = getParenEnd(0); 1761 1762 if (newpos == pos) { 1764 newpos++; 1765 } 1766 1767 pos = newpos; 1769 1770 if ((flags & REPLACE_FIRSTONLY) != 0) { 1772 break; 1773 } 1774 } 1775 if (pos < len) { 1777 ret.append(substituteIn.substring(pos)); 1778 } 1779 1780 return ret.toString(); 1782 } 1783} | Popular Tags |