1 package org.jruby.util.string; 2 import java.io.Serializable ; 3 import java.util.Hashtable ; 4 5 40 public class Ustr 41 implements Comparable , Serializable { 42 private static final long serialVersionUID = -7263880042540200296L; 43 44 45 private static final byte[] encLength = { 47 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 48 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 49 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 50 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 51 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 52 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 53 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 54 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 55 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 56 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 57 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 58 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 59 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 60 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 61 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 62 4, 4, 4, 4, 4, 4, 4, 4, -1, -1, -1, -1, -1, -1, -1, -1 63 }; 64 65 66 private static Hashtable interns = new Hashtable (); 67 68 73 public byte[] s; 75 79 public int base = 0; 81 86 public int offset = 0; 88 91 public Ustr() { 92 base = offset = 0; 93 } 94 99 public Ustr(int length) { 100 s = new byte[length]; 101 base = offset = 0; 102 s[0] = 0; 103 } 104 110 public Ustr(byte[] bytes) { 111 s = bytes; 112 base = offset = 0; 113 } 114 121 public Ustr(byte[] bytes, int start) { 122 s = bytes; 123 base = offset = start; 124 } 125 130 public Ustr(Ustr from) { 131 s = new byte[from.strlen() + 1]; 132 base = offset = 0; 133 strcpy(from); 134 } 135 141 public Ustr(char [] chars) { 142 143 int size = 0; 144 for (int i = 0; i < chars.length; i++) { 145 char utf16 = chars[i]; 146 size += bytesInChar(utf16); 149 } 150 s = new byte[size + 1]; 151 base = 0; 152 prepareAppend(); 153 int i = 0; 154 while (i < chars.length) { 155 int val = chars[i]; 156 if (val < 0xd800 || val > 0xdfff) 157 ; 159 else { 161 if (val > 0xdbff) 162 throw new UstrException("Mangled surrogate pair"); 163 164 i++; 165 if (i == chars.length) 166 throw new UstrException("Mangled surrogate pair"); 167 168 int val2 = chars[i]; 169 if (val2 < 0xdc00 || val2 > 0xdfff) 170 throw new UstrException("Mangled surrogate pair"); 171 172 val &= 0x3ff; 173 val <<= 10; 174 val2 &= 0x3ff; 175 val |= val2; 176 val += 0x10000; 177 } 178 i++; 179 appendChar(val); 180 } 181 s[s.length - 1] = 0; 182 } 183 184 193 public Ustr(int [] ints) { 194 int bufsiz = 0; 195 196 for (int j = 0; j < ints.length; j++) { 197 int i = ints[j]; 198 if (i < 0) 199 throw new UstrException("Negative character value"); 200 if (i > 0x10ffff) 201 throw new UstrException("Character out of Unicode range"); 202 203 bufsiz += bytesInChar(i); 204 205 } 206 s = new byte[bufsiz + 1]; 207 base = offset = 0; 208 209 for (int j = 0; j < ints.length; j++) { 210 int i = ints[j]; 211 appendChar(i); 212 } 213 } 214 215 224 public Ustr(Object o) { 225 byte[] inbytes; 226 227 base = offset = 0; 228 try { 229 inbytes = o.toString().getBytes("UTF8"); 230 } catch (java.io.UnsupportedEncodingException e) { 231 throw new UstrException("UTF8 not supported!?!?"); 232 } 233 234 s = new byte[inbytes.length + 1]; 236 for (int i = 0; i < inbytes.length; i++) 237 s[i] = inbytes[i]; 238 239 s[inbytes.length] = 0; 240 } 241 249 public Ustr(int space, Object o) { 250 s = new byte[space]; 251 base = offset = 0; 252 byte [] b; 253 254 try { 255 b = o.toString().getBytes("UTF8"); 256 } catch (java.io.UnsupportedEncodingException e) { 257 throw new RuntimeException ("UTF8 not supported!?!?"); 258 } 259 260 for (int i = 0; i < b.length; i++) 261 s[i] = b[i]; 262 263 s[b.length] = 0; 264 } 265 266 269 public void init() { 270 s[base] = 0; 271 offset = base; 272 } 273 274 282 public int compareTo(Object other) { 283 Ustr o = (other instanceof Ustr) ? (Ustr) other : new Ustr(other); 284 return strcmp(s, base, o.s, o.base); 285 } 286 287 294 public String toString() { 295 try { 296 return new String (s, base, strlen(), "UTF8"); 297 } catch (java.io.UnsupportedEncodingException e) { 298 throw new UstrException("UTF8 not supported!?!?"); 299 } 300 } 301 302 309 public int length() { 310 int saveOffset = offset; 311 int l = 0; 312 for (prepareNext(); nextChar() != 0; l++) 313 ; offset = saveOffset; 315 return l; 316 } 317 325 public static int length(byte [] b, int offset) { 326 return (new Ustr(b, offset)).length(); 327 } 328 335 public static int length(byte [] b) { 336 return length(b, 0); 337 } 338 339 348 public static int length(String str) { 349 return (new Ustr(str)).length(); 350 } 351 352 356 public void prepareAppend() { 357 offset = strlen(); 358 } 359 374 public void appendChar(int c) { 375 offset = appendChar(c, s, offset); 376 } 377 378 389 public static int appendChar(int c, byte [] s, int offset) { 390 if (c < 0) 391 throw new UstrException("Appended negative character"); 392 if (c < 128) 393 s[offset++] = (byte) c; 394 else if (c <= 0x7ff) { 395 s[offset++] = (byte) ( (c >> 6) | 0xc0); 396 s[offset++] = (byte) ((c & 0x3f) | 0x80); 397 } else if (c <= 0xffff) { 398 s[offset++] = (byte) ( (c >> 12) | 0xe0); 399 s[offset++] = (byte) (((c >> 6) & 0x3f) | 0x80); 400 s[offset++] = (byte) ( (c & 0x3f) | 0x80); 401 } else if (c <= 0x10ffff) { 402 s[offset++] = (byte) ( (c >> 18) | 0xf0); 403 s[offset++] = (byte) (((c >> 12) & 0x3f) | 0x80); 404 s[offset++] = (byte) ( ((c >> 6) & 0x3f) | 0x80); 405 s[offset++] = (byte) ( (c & 0x3f) | 0x80); 406 } else 407 throw new UstrException("Appended character > 0x10ffff"); 408 s[offset] = 0; 409 return offset; 410 } 411 412 416 public void prepareNext() { 417 offset = base; 418 } 419 425 public int nextChar() { 426 if (s[offset] == 0) 427 return 0; 428 if ((s[offset] & 0x80) == 0) 429 return (int) s[offset++]; 430 if ((s[offset] & 0xe0) == 0xc0) { 431 int c = (s[offset++] & 0x1f) << 6; 434 c |= s[offset++] & 0x3f; 435 return c; 436 } 437 if ((s[offset] & 0xf0) == 0xe0) { 438 int c = (s[offset++] & 0xf) << 12; 441 c |= (s[offset++] & 0x3f) << 6; 442 c |= s[offset++] & 0x3f; 443 return c; 444 } 445 int c = (s[offset++] & 0x7) << 18; 448 c |= (s[offset++] & 0x3f) << 12; 449 c |= (s[offset++] & 0x3f) << 6; 450 c |= s[offset++] & 0x3f; 451 return c; 452 } 453 454 462 public int strlen() { 463 return strlen(s, base); 464 } 465 471 public static int strlen(byte [] b) { 472 int i = 0; 473 while (b[i] != 0) 474 i++; 475 return i; 476 } 477 485 public static int strlen(byte [] b, int base) { 486 int i = base; 487 while (b[i] != 0) 488 i++; 489 return i - base; 490 } 491 492 501 public static byte [] strcpy(byte [] to, byte [] from) { 502 return strcpy(to, 0, from, 0); 503 } 504 513 public static byte [] strcpy(byte [] to, int tbase, byte [] from, int fbase) { 514 while (from[fbase] != 0) 515 to[tbase++] = from[fbase++]; 516 to[tbase] = 0; 517 518 return to; 519 } 520 526 public Ustr strcpy(Ustr from) { 527 strcpy(s, base, from.s, from.base); 528 return this; 529 } 530 531 538 public Ustr strcpy(Object o) { 539 strcpy(new Ustr(o)); 540 return this; 541 } 542 549 public Ustr strcpy(byte[] from) { 550 strcpy(s, from); 551 return this; 552 } 553 561 public Ustr strcpy(byte[] from, int boffset) { 562 strcpy(s, 0, from, boffset); 563 return this; 564 } 565 575 public static byte [] strcpy(byte [] b, String s) { 576 return strcpy(b, 0, s); 577 } 578 579 588 public static byte [] strcpy(byte [] b, int offset, String s) { 589 byte [] sbytes; 590 591 try { sbytes = s.getBytes("UTF8"); } catch (java.io.UnsupportedEncodingException e) { 592 throw new RuntimeException ("UTF8 not supported!?!?"); } 593 594 for (int i = 0; i < sbytes.length; i++) 595 b[offset + i] = sbytes[i]; 596 b[offset + sbytes.length] = 0; 597 return b; 598 } 599 600 601 611 public Ustr sstrcat(Ustr from) { 612 sstrcat(s, base, from.s, from.base); 613 return this; 614 } 615 616 624 public byte [] sstrcat(byte [] to, byte[] from) { 625 return sstrcat(to, 0, from, 0); 626 } 627 637 public static byte [] sstrcat(byte [] to, int tbase, byte [] from, int fbase) { 638 while (to[tbase] != 0) 640 tbase++; 641 642 try { 643 while (from[fbase] != 0) 644 to[tbase++] = from[fbase++]; 645 to[tbase] = 0; 646 647 return to; 648 } catch (java.lang.ArrayIndexOutOfBoundsException e) { 649 if (tbase >= to.length) 650 to[to.length - 1] = 0; 651 else 652 throw e; 653 } 654 return to; 655 } 656 657 667 public static byte [] sstrcpy(byte [] to, int tbase, byte [] from, int fbase) { 668 try { 669 while (from[fbase] != 0) 670 to[tbase++] = from[fbase++]; 671 to[tbase] = 0; 672 } 673 674 catch (java.lang.ArrayIndexOutOfBoundsException e) { 675 if (tbase >= to.length) 677 to[to.length - 1] = 0; 678 679 else 682 throw e; 683 } 684 return to; 685 } 686 694 public static byte [] sstrcpy(byte [] to, byte [] from) { 695 return sstrcpy(to, 0, from, 0); 696 } 697 698 705 public Ustr sstrcpy(Ustr from) { 706 sstrcpy(s, base, from.s, from.base); 707 return this; 708 } 709 710 720 public static byte [] strcat(byte [] to, int tbase, byte [] from, int fbase) { 721 while (to[tbase] != 0) 722 tbase++; 723 724 while (from[fbase] != 0) 725 to[tbase++] = from[fbase++]; 726 to[tbase] = 0; 727 728 return to; 729 } 730 731 738 public static byte [] strcat(byte [] to, byte [] from) { 739 return strcat(to, 0, from, 0); 740 } 741 742 748 public Ustr strcat(Ustr other) { 749 strcat(s, other.s); 750 return this; 751 } 752 753 762 public static int strcmp(byte [] s1, byte [] s2) { 763 return strcmp(s1, 0, s2, 0); 764 } 765 778 public static int strcmp(byte [] s1, int s1base, byte [] s2, int s2base) { 779 780 Ustr u1 = new Ustr(s1, s1base); 781 Ustr u2 = new Ustr(s2, s2base); 782 783 int c1 = u1.nextChar(); 784 int c2 = u2.nextChar(); 785 786 while (c1 != 0 && c2 != 0 && c1 == c2) { 787 c1 = u1.nextChar(); 788 c2 = u2.nextChar(); 789 } 790 791 return c1 - c2; 792 } 793 802 public int strcmp(Ustr other) { 803 return strcmp(s, base, other.s, other.base); 804 } 805 806 815 public int strcmp(Object other) { 816 return strcmp(new Ustr(other)); 817 } 818 819 827 public Ustr strchr(int c) { 828 int where = strchr(s, c); 829 return (where == -1) ? null : new Ustr(s, where); 830 } 831 832 841 public static int strchr(byte [] b, int c) { 842 byte [] cbytes = new byte[10]; 843 appendChar(c, cbytes, 0); 844 return strstr(b, cbytes); 845 } 846 847 855 public Ustr strrchr(int c) { 856 int where = strrchr(s, c); 857 return (where == -1) ? null : new Ustr(s, where); 858 } 859 860 869 public static int strrchr(byte [] b, int c) { 870 byte [] cbytes = new byte[10]; 871 appendChar(c, cbytes, 0); 872 873 int where = b.length - strlen(cbytes); 874 while (where >= 0) { 875 int i; 876 for (i = 0; cbytes[i] != 0; i++) 877 if (b[where + i] != cbytes[i]) 878 break; 879 if (cbytes[i] == 0) 880 return where; 881 where--; 882 } 883 return -1; 884 } 885 886 894 public Ustr strstr(Ustr little) { 895 int where = strstr(s, little.s); 896 return (where == -1) ? null : new Ustr(s, where); 897 } 898 899 907 public static int strstr(byte [] big, byte [] little) { 908 910 for (int bi = 0; big[bi] != 0; bi++) { 911 int li; 912 for (li = 0; little[li] != 0; li++) 913 if (big[bi + li] != little[li]) 914 break; 915 if (little[li] == 0) 916 return bi; 917 } 918 return -1; 919 } 920 921 925 931 static Ustr copyValueOf(char [] data) { 932 return new Ustr(data); 933 } 934 935 943 static Ustr copyValueOf(char [] data, int offset, int count) { 944 char [] chunk = new char[count]; 945 for (int i = 0; i < count; i++) 946 chunk[i] = data[offset + i]; 947 return new Ustr(chunk); 948 } 949 950 957 public int charAt(int at) 958 throws IndexOutOfBoundsException { 959 if (at < 0) 960 throw new IndexOutOfBoundsException ("Negative Ustr charAt"); 961 int c = 0; 962 offset = 0; 963 prepareNext(); 964 do { 965 c = nextChar(); 966 at--; 967 } while (c != 0 && at >= 0); 968 969 if (at > 0) 970 throw new IndexOutOfBoundsException ("Ustr charAt too large"); 971 return c; 972 } 973 974 980 public Ustr concat(String str) { 981 Ustr us = new Ustr(str); 982 return concat(us); 983 } 984 985 991 public Ustr concat(Ustr us) { 992 Ustr ret = new Ustr(strlen() + us.strlen() + 1); 993 ret.strcpy(this); 994 ret.strcat(us); 995 return ret; 996 } 997 998 1004 public boolean endsWith(Ustr suffix) { 1005 int start = strlen() - suffix.strlen(); 1006 if (start < 0) 1007 return false; 1008 return (strcmp(s, base + start, suffix.s, suffix.base) == 0); 1009 } 1010 1011 1017 public boolean endsWith(String suffix) { 1018 return endsWith(new Ustr(suffix)); 1019 } 1020 1021 1027 public boolean equals(Object anObject) { 1028 return (compareTo(anObject) == 0); 1029 } 1030 1031 1037 public byte [] getBytes() { 1038 return toString().getBytes(); 1039 } 1040 1041 1048 public byte [] getBytes(String enc) 1049 throws java.io.UnsupportedEncodingException { 1050 return toString().getBytes(enc); 1051 } 1052 1053 1064 public static void getChars(String str, int srcBegin, int srcEnd, 1065 char [] dst, int dstBegin) { 1066 Ustr us = new Ustr(str); 1067 us.getChars(srcBegin, srcEnd, dst, dstBegin); 1068 } 1069 1070 1080 public void getChars(int srcBegin, int srcEnd, char [] dst, int dstBegin) { 1081 if (srcBegin < 0 || srcBegin > srcEnd || dstBegin < 0) 1082 throw new IndexOutOfBoundsException ("bogus getChars index bounds"); 1083 if (dst == null) 1084 throw new NullPointerException ("null 'dst' argument to getChars"); 1085 1086 prepareNext(); 1087 while (srcBegin > 0) { 1088 srcBegin--; 1089 nextChar(); 1090 } 1091 int c; 1092 int howMany = srcEnd - srcBegin; 1093 int i, j; 1094 for (i = j = 0; i < howMany; i++, j++) { 1095 c = nextChar(); 1096 if (c == 0 && i < howMany - 1) 1097 throw new IndexOutOfBoundsException ("getChars ran off buffer"); 1098 if (c < 0x10000) 1099 dst[dstBegin + j] = (char) c; 1100 else { 1101 1106 c -= 0x10000; 1107 int uHi = (c >> 10) & 0x3ff; 1108 dst[dstBegin + j] = (char) (0xd800 | uHi); 1109 j++; 1110 1111 int uLo = c & 0x3ff; 1112 dst[dstBegin + j] = (char) (0xdc00 | uLo); 1113 } 1114 } 1115 } 1116 1117 1126 public int hashCode() { 1127 long h = 0; 1128 long c; 1129 long n = length() - 1; 1130 prepareNext(); 1131 while ((c = nextChar()) != 0) { 1132 h += c * pow(31, n); 1133 n--; 1134 } 1135 return (int) (h & 0xffffffff); 1136 } 1137 1138 private static long pow(long a, long b) { 1140 long p = 1; 1141 while (b-- > 0) 1142 p *= a; 1143 return p; 1144 } 1145 1146 1153 public int indexOf(int ch) { 1154 return indexOf(ch, 0); 1155 } 1156 1157 1165 public int indexOf(int ch, int start) { 1166 int i = 0; 1167 prepareNext(); 1168 while (start-- > 0) { 1169 nextChar(); 1170 i++; 1171 } 1172 int c; 1173 while ((c = nextChar()) != 0) { 1174 if (c == ch) 1175 return i; 1176 i++; 1177 } 1178 if (ch == 0) 1179 return i; 1180 return -1; 1181 } 1182 1183 1190 public int indexOf(Ustr us) { 1191 return indexOf(us, 0); 1192 } 1193 1194 1202 public int indexOf(Ustr us, int start) { 1203 int i = 0; 1204 prepareNext(); 1205 while (start-- > 0) { 1206 nextChar(); 1207 i++; 1208 } 1209 1210 do { 1212 int j; 1213 for (j = 0; s[base + offset + j] != 0 && us.s[us.base + j] != 0; j++) 1214 if (s[base + offset + j] != us.s[us.base + j]) 1215 break; 1216 if (us.s[base + j] == 0) 1217 return i; 1218 i++; 1219 } while (nextChar() != 0); 1220 1221 return -1; 1222 } 1223 1224 1234 public Ustr intern() { 1235 Ustr u = (Ustr)interns.get(this); 1236 if (u != null) 1237 return u; 1238 1239 u = new Ustr(strlen() + 1); 1240 u.strcpy(this); 1241 interns.put(u, u); 1242 return u; 1243 } 1244 1245 1252 public int lastIndexOf(int ch) { 1253 return lastIndexOf(ch, length()); 1254 } 1255 1256 1264 public int lastIndexOf(int ch, int stop) { 1265 int i = 0; 1266 prepareNext(); 1267 int foundAt = -1; 1268 do { 1269 if (ch == nextChar()) 1270 foundAt = i; 1271 i++; 1272 } while (i <= stop); 1273 1274 return foundAt; 1275 } 1276 1277 1283 public int lastIndexOf(Ustr us) { 1284 return lastIndexOf(us, length()); 1285 } 1286 1287 1294 public int lastIndexOf(Ustr us, int stop) { 1295 int i = 0; 1296 int foundAt = -1; 1297 1298 prepareNext(); 1300 do { 1301 int j; 1302 for (j = 0; s[base + offset + j] != 0 && us.s[us.base + j] != 0; j++) 1303 if (s[base + offset + j] != us.s[us.base + j]) 1304 break; 1305 if (us.s[base + j] == 0) 1306 foundAt = i; 1307 i++; 1308 } while (nextChar() != 0 && i <= stop); 1309 1310 return foundAt; 1311 } 1312 1313 private static int bytesInChar(int c) { 1314 if (c < 128) 1315 return 1; 1316 else if (c < 0x800) 1317 return 2; 1318 else if (c < 0x10000) 1319 return 3; 1320 else 1321 return 4; 1322 } 1323 1324 1334 public Ustr replace(int oldChar, int newChar) { 1335 if (newChar < 0) 1336 throw new UstrException("Negative replacement character"); 1337 else if (newChar > 0x10ffff) 1338 throw new UstrException("Replacement character > 0x10ffff"); 1339 1340 int space = strlen() + 1; 1342 int delta = bytesInChar(newChar) - bytesInChar(newChar); 1343 if (delta != 0) { 1344 int c; 1345 1346 while ((c = nextChar()) != 0) 1347 if (c == oldChar) 1348 space += delta; 1349 } 1350 1351 Ustr us = new Ustr(space); 1352 prepareNext(); us.prepareAppend(); 1353 int c; 1354 while ((c = nextChar()) != 0) 1355 us.appendChar((c == oldChar) ? newChar : c); 1356 return us; 1357 } 1358 1359 1365 public boolean startsWith(Ustr us) { 1366 return startsWith(us, 0); 1367 } 1368 1369 1376 public boolean startsWith(Ustr us, int start) { 1377 prepareNext(); 1378 while (start-- > 0) 1379 nextChar(); 1380 1381 for (int i = 0; us.s[base + i] != 0; i++) 1382 if (s[base + offset + i] != us.s[us.base + i]) 1383 return false; 1384 1385 return true; 1386 } 1387 1388 1394 public Ustr substring(int start) { 1395 return substring(start, length()); 1396 } 1397 1398 1406 public Ustr substring(int start, int end) { 1407 if (start < 0 || end < start || end > length()) 1408 throw new IndexOutOfBoundsException ("bogus start/end"); 1409 1410 int howMany = end - start; 1411 offset = 0; 1412 1413 while (start-- > 0) { 1415 int c = s[base + offset] & 0xff; 1416 if (c == 0) 1417 throw new IndexOutOfBoundsException ("substring too long"); 1418 offset += encLength[c]; 1419 } 1420 1421 int startAt = offset; 1422 for (int i = 0; i < howMany; i++) { 1423 int c = s[base + offset] & 0xff; 1424 if (c == 0) 1425 throw new IndexOutOfBoundsException ("substring too long"); 1426 offset += encLength[c]; 1427 } 1428 int bytesToMove = offset - startAt; 1429 Ustr us = new Ustr(bytesToMove + 1); 1430 System.arraycopy(s, startAt, us.s, 0, bytesToMove); 1431 us.s[bytesToMove] = 0; 1432 1433 1439 1440 1455 return us; 1456 } 1457 1458 1463 public char [] toCharArray() { 1464 return toString().toCharArray(); 1465 } 1466} 1467 | Popular Tags |