Ustr


1   package org.jruby.util.string;
2   import java.io.Serializable  ;
3   import java.util.Hashtable  ;
4   
5   /**
6    * Ustr - rhymes with Wooster.
7    * Implements a string, with three design goals:
8    *
9    * <ol>
10   * <li>Correct implementation of Unicode semantics.</li>
11   * <li>Support for as many of java's String and StringBuffer methods as
12   *    is reasonable.</li>
13   * <li>Support for the familiar null-terminated-string primitives
14   *    of the C programming language: strcpy() and friends.</li></ol>
15   *
16   * <p>A Ustr is a fairly thin wrapper around a byte[] array, which
17   * contains null-terminated UTF8-encoded text.</p>
18   *
19   * <p><b>Note</b> that in the context of a Ustr, "index" always means how
20   * many Unicode characters you are into the Ustr's text, while "offset"
21   * always means how many bytes you are into its UTF8-encoded form.</p>
22   *
23   * <p>Similarly, "char" and "String" always refer to the Java constructs,
24   * while "character" always means a Unicode character, always identified
25   * by a Java int.</p>
26   *
27   * <p>If any of the Ustr methods are passed an integer alleged to represent
28   * a Unicode character whose value is not a valid code point, i.e. is either
29   * negative or greater than 0x10ffff, the method will throw a UstrException,
30   * which extends RuntimeException and is thus not checked at compile-time.</p>
31   *
32   * <p>For any method that copies characters and might overrun a buffer, a
33   * "safe" version is provided, starting with an extra <code>s</code>, e.g.
34   * <code>sstrcopy</code> and <code>sstrcat</code>. These versions always
35   * arrange that the copied string not overrun the provided buffer, which
36   * will be properly null-terminated.</p>
37   *
38   * @see org.jruby.util.string.UstrException
39   */
40  public class Ustr
41          implements Comparable  , Serializable   {
42      private static final long serialVersionUID = -7263880042540200296L;
43  
44  
45      // Number of bytes in a UTF-8 sequence, indexed by the (unsigned) value of its first byte; -1 marks bytes that cannot start a sequence (0x80-0xBF and 0xF8-0xFF)
46      private static final byte[] encLength = {
47          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
48          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
49          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
50          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
51          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
52          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
53          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
54          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
55          -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
56          -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
57          -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
58          -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
59          2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
60          2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
61          3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
62          4,  4,  4,  4,  4,  4,  4,  4, -1, -1, -1, -1, -1, -1, -1, -1
63      };
64      
65      
66      private static Hashtable   interns = new Hashtable  (); // NOTE(review): not referenced in the visible part of the file; presumably backs an intern() method - confirm
67      
68      /**
69       * A null-terminated byte array containing the string in UTF-8 form.  All
70       * Ustr operations count on null-termination.  The byte array may
71       * be much bigger than the contained string.
72       */
73      public byte[] s;       // UTF-encoded text
74      
75      /**
76       * Where in the array <code>s</code> the string starts.  You can
77       *  have lots of different Ustrs co-existing in a single byte array.
78       */
79      public int base = 0;   // index in s of the start of the string
80      
81      /**
82       * Working byte position used by the <code>nextChar</code> and
83       * <code>appendChar</code> methods.  <code>prepareNext</code> sets it to
84       * <code>base</code> and <code>nextChar</code> reads <code>s[offset]</code>
85       * directly, so it is used as an absolute index into <code>s</code>.
86       */
87      public int offset = 0; // for iterating; used as an absolute index into s
87      
/**
 * Creates an empty Ustr without a buffer: <code>s</code> is left
 * unassigned and both positions are zero.
 */
public Ustr() {
    this.offset = 0;
    this.base = 0;
}
94      /**
95       * Creates an empty Ustr, with a null termination at the front.
96       *
97       * @param length length of the buffer, in bytes
98       */
99      public Ustr(int length) {
100         s = new byte[length];
101         base = offset = 0;
102         s[0] = 0;
103     }
/**
 * Wraps a Ustr around an existing buffer without copying it.  No null
 * terminator is written, so the buffer may already hold a string.
 *
 * @param bytes the buffer to wrap
 */
public Ustr(byte[] bytes) {
    this.s = bytes;
    this.offset = 0;
    this.base = 0;
}
/**
 * Wraps a Ustr around a position inside an existing buffer without
 * copying it.  No null terminator is written, so the buffer may already
 * hold a string.
 *
 * @param bytes the buffer to wrap
 * @param start byte position in the buffer where the string starts
 */
public Ustr(byte[] bytes, int start) {
    this.s = bytes;
    this.offset = start;
    this.base = start;
}
125     /**
126      * Makes a Ustr which is a copy of another Ustr
127      *
128      * @param from the Ustr to copy
129      */
130     public Ustr(Ustr from) {
131         s = new byte[from.strlen() + 1];
132         base = offset = 0;
133         strcpy(from);
134     }
135     /**
136      * Makes a Ustr from a char[] array.  The Ustr is null-terminated, but
137      * no space is allocated beyond what's needed.
138      *
139      * @param chars the char array
140      */
141     public Ustr(char [] chars) {
142         
143         int size = 0;
144         for (int i = 0; i < chars.length; i++) {
145             char utf16 = chars[i];
146             // this works because surrogate characters will be counted as 2
147             //  each, and anything in the astral planes takes 4 bytes.
148             size += bytesInChar(utf16);
149         }
150         s = new byte[size + 1];
151         base = 0;
152         prepareAppend();
153         int i = 0;
154         while (i < chars.length) {
155             int val = chars[i];
156             if (val < 0xd800 || val > 0xdfff)
157                 ; // no-op
158             
159             // argh, surrogates.
160             else {
161                 if (val > 0xdbff)
162                     throw new UstrException("Mangled surrogate pair");
163                 
164                 i++;
165                 if (i == chars.length)
166                     throw new UstrException("Mangled surrogate pair");
167                 
168                 int val2 = chars[i];
169                 if (val2 < 0xdc00 || val2 > 0xdfff)
170                     throw new UstrException("Mangled surrogate pair");
171                 
172                 val &= 0x3ff;
173                 val <<= 10;
174                 val2 &= 0x3ff;
175                 val |= val2;
176                 val += 0x10000;
177             }
178             i++;
179             appendChar(val);
180         }
181         s[s.length - 1] = 0;
182     }
183 
184     /**
185      * Makes a Ustr from an int[] array, where each int is the value of
186      * a Unicode character.  Throws a UstrException if one of the ints
187      * is not a Unicode codepoint (negative or >0x10ffff).
188      *
189      * @param ints the int array
190      * @throws UstrException
191      *
192      */
193     public Ustr(int [] ints) {
194         int bufsiz = 0;
195         
196         for (int j = 0; j < ints.length; j++) {
197             int i = ints[j];
198             if (i < 0)
199                 throw new UstrException("Negative character value");
200             if (i > 0x10ffff)
201                 throw new UstrException("Character out of Unicode range");
202             
203             bufsiz += bytesInChar(i);
204             
205         }
206         s = new byte[bufsiz + 1];
207         base = offset = 0;
208         
209         for (int j = 0; j < ints.length; j++) {
210             int i = ints[j];
211             appendChar(i);
212         }
213     }
214     
215     /**
216      * Makes a Ustr from an object, based on its <code>toString()</code>.
217      * Most commonly used with a String argument.  The Ustr is null-terminated,
218      * but no space is allocated beyond what's needed.  Throws a UstrException
219      * if the environment doesn't support the UTF8 encoding.
220      *
221      * @param o the Object
222      * @throws UstrException
223      */
224     public Ustr(Object   o) {
225         byte[] inbytes;
226         
227         base = offset = 0;
228         try {
229             inbytes = o.toString().getBytes("UTF8");
230         } catch (java.io.UnsupportedEncodingException   e) {
231             throw new UstrException("UTF8 not supported!?!?");
232         }
233         
234         // because we need one more byte than getBytes provides
235         s = new byte[inbytes.length + 1];
236         for (int i = 0; i < inbytes.length; i++)
237             s[i] = inbytes[i];
238         
239         s[inbytes.length] = 0;
240     }
241     /**
242      * Makes a Ustr from an object, based on its <code>toString()</code>,
243      * leaving room for growth. Most commonly used with a String argument.
244      * The Ustr is null-terminated.
245      *
246      * @param space How large a buffer to allocate
247      * @param o     The object
248      */
249     public Ustr(int space, Object   o) {
250         s = new byte[space];
251         base = offset = 0;
252         byte [] b;
253         
254         try {
255             b = o.toString().getBytes("UTF8");
256         } catch (java.io.UnsupportedEncodingException   e) {
257             throw new RuntimeException  ("UTF8 not supported!?!?");
258         }
259         
260         for (int i = 0; i < b.length; i++)
261             s[i] = b[i];
262         
263         s[b.length] = 0;
264     }
265     
266     /**
267      * Empty a Ustr by setting its first byte to 0.
268      */
269     public void init() {
270         s[base] = 0;
271         offset = base;
272     }
273     
274     /**
275      * Supports the <code>Comparable</code> interface.  The ordering is that of
276      * native Unicode code points and probably not culturally appropriate
277      * anywhere.
278      *
279      * @param other the object compared
280      * @return -1, 0, or 1 as you'd expect.
281      */
282     public int compareTo(Object   other) {
283         Ustr o = (other instanceof Ustr) ? (Ustr) other : new Ustr(other);
284         return strcmp(s, base, o.s, o.base);
285     }
286     
287     /**
288      * Generates a Java String representing the Ustr.  Throws a UstrException
289      * if the Java environment doesn't support the UTF8 encoding.
290      *
291      * @return the String.
292      * @throws UstrException
293      */
294     public String   toString() {
295         try {
296             return new String  (s, base, strlen(), "UTF8");
297         } catch (java.io.UnsupportedEncodingException   e) {
298             throw new UstrException("UTF8 not supported!?!?");
299         }
300     }
301     
302     // per-Unicode-character operations
303     //
304     /**
305      * Length of a Ustr in Unicode characters (not bytes).
306      *
307      * @return the number of Unicode characters.
308      */
309     public int length() {
310         int saveOffset = offset;
311         int l = 0;
312         for (prepareNext(); nextChar() != 0; l++)
313             ; // empty
314         offset = saveOffset;
315         return l;
316     }
317     /**
318      * Number of Unicode characters stored starting at some offset in a byte
319      * array.  Assumes UTF-8 encoding and null termination.
320      *
321      * @param b      the byte array
322      * @param offset where to start counting
323      * @return       the number of unicode characters.
324      */
325     public static int length(byte [] b, int offset) {
326         return (new Ustr(b, offset)).length();
327     }
328     /**
329      * Number of Unicode characters stored in a byte array.  Assumes UTF-8
330      * encoding and null termination.
331      *
332      * @param b the byte array
333      * @return  the number of Unicode characters.
334      */
335     public static int length(byte [] b) {
336         return length(b, 0);
337     }
338     
339     /**
340      * Number of Unicode characters stored in a Java string.
341      * if <code>s</code> is a String, <code>s.length()</code> and
342      * <code>Ustr.length(s)</code> will be the same except when <code>s</code>
343      * contains non-BMP characters.
344      *
345      * @param str the string
346      * @return    the number of Unicode characters
347      */
348     public static int length(String   str) {
349         return (new Ustr(str)).length();
350     }
351     
352     /**
353      * Set up for <code>appendChar</code>.  Points the <code>offset</code>
354      * field at the buffer's null terminator.
355      */
356     public void prepareAppend() {
357         offset = strlen();
358     }
359     /**
360      * Append one Unicode character to a Ustr.  Assumes that the
361      * <code>offset</code> points to the null-termination,
362      * where the character ought to go, updates that field and applies
363      * another null termination.  You could change the value of
364      * <code>offset</code> and start "appending" into the middle of a Ustr
365      * if that's what you wanted.  This generates the UTF-8 bytes from
366      * the input characters.
367      * <p>If the character is less than 128, one byte of buffer is used.
368      * If less than 0x8000, two bytes.  If less than 2**16, three bytes.
369      * If less than 0x10ffff, four bytes.  If greater than 0x10ffff, or
370      * negative, you get an exception.</p>
371      *
372      * @param c the character to be appended.
373      */
374     public void appendChar(int c) {
375         offset = appendChar(c, s, offset);
376     }
377     
378     /**
379      * Writes one Unicode character into a UTF-8 encoded byte array at
380      * a given offset, and null-terminates it.  Throws a UstrException if
381      * the 'c' argument is not a Unicode codepoint (negative or >0x10ffff)
382      *
383      * @param        c the Unicode character
384      * @param        s the array
385      * @param offset the offset to write at
386      * @return       the offset of the null byte after the encoded character
387      * @throws       UstrException
388      */
389     public static int appendChar(int c, byte [] s, int offset) {
390         if (c < 0)
391             throw new UstrException("Appended negative character");
392         if (c < 128)
393             s[offset++] = (byte) c;
394         else if (c <= 0x7ff) {
395             s[offset++] = (byte) (  (c >> 6) | 0xc0);
396             s[offset++] = (byte) ((c & 0x3f) | 0x80);
397         } else if (c <= 0xffff) {
398             s[offset++] = (byte) (        (c >> 12) | 0xe0);
399             s[offset++] = (byte) (((c >> 6) & 0x3f) | 0x80);
400             s[offset++] = (byte) (       (c & 0x3f) | 0x80);
401         } else if (c <= 0x10ffff) {
402             s[offset++] = (byte) (         (c >> 18) | 0xf0);
403             s[offset++] = (byte) (((c >> 12) & 0x3f) | 0x80);
404             s[offset++] = (byte) ( ((c >> 6) & 0x3f) | 0x80);
405             s[offset++] = (byte) (        (c & 0x3f) | 0x80);
406         } else
407             throw new UstrException("Appended character > 0x10ffff");
408         s[offset] = 0;
409         return offset;
410     }
411     
412     /**
413      * Set up for <code>nextChar()</code>.  Points the <code>offset</code>
414      * field at the start of the buffer.
415      */
416     public void prepareNext() {
417         offset = base;
418     }
419     /**
420      * Retrieve one Unicode character from a Ustr and advance the working
421      * offset.  Assumes the working offset is sanely located.
422      *
423      * @return the Unicode character, 0 signaling the end of the string
424      */
425     public int nextChar() {
426         if (s[offset] == 0)
427             return 0;
428         if ((s[offset] & 0x80) == 0)
429             return (int) s[offset++];
430         if ((s[offset] & 0xe0) == 0xc0) {
431             // 110w wwww 10zz zzzz
432             // xxxx xwww wwzz zzzz
433             int c = (s[offset++] & 0x1f) << 6;
434             c |= s[offset++] & 0x3f;
435             return c;
436         }
437         if ((s[offset] & 0xf0) == 0xe0) {
438             // 1110 wwww 10zz zzzz 10xx xxxx
439             // wwww zzzz zzxx xxxx
440             int c = (s[offset++] & 0xf) << 12;
441             c |= (s[offset++] & 0x3f) << 6;
442             c |= s[offset++] & 0x3f;
443             return c;
444         }
445         // 1111 0www 10zz zzzz 10xx xxxx 10yy yyyy
446         // wwwwzz zzzzxxxx xxyyyyyy
447         int c = (s[offset++] & 0x7) << 18;
448         c |= (s[offset++] & 0x3f) << 12;
449         c |= (s[offset++] & 0x3f) << 6;
450         c |= s[offset++] & 0x3f;
451         return c;
452     }
453     
454     // Strlen variants
455     //
456     /**
457      * The length in bytes of a Ustr's UTF representation.  Assumes
458      * null-termination.
459      *
460      * @return the number of bytes
461      */
462     public int strlen() {
463         return strlen(s, base);
464     }
465     /**
466      * The length in bytes of a null-terminated byte array
467      *
468      * @param b the array
469      * @return  the number of bytes
470      */
471     public static int strlen(byte [] b) {
472         int i = 0;
473         while (b[i] != 0)
474             i++;
475         return i;
476     }
477     /**
478      * The length in bytes of a null-terminated sequence starting at some
479      * offset in a byte array.
480      *
481      * @param b    the byte array
482      * @param base the byte offset to start counting at
483      * @return     the number of bytes
484      */
485     public static int strlen(byte [] b, int base) {
486         int i = base;
487         while (b[i] != 0)
488             i++;
489         return i - base;
490     }
491     
492     // Strcpy variants
493     //
494     /**
495      * Copy a null-terminated byte array.
496      *
497      * @param to   destination array
498      * @param from source array
499      * @return     the destination array
500      */
501     public static byte [] strcpy(byte [] to, byte [] from) {
502         return strcpy(to, 0, from, 0);
503     }
504     /**
505      * Copy null-terminated byte arrays with control over offsets.
506      *
507      * @param to    destination array
508      * @param tbase starting offset in destination array
509      * @param from  source array
510      * @param fbase starting offset in source array
511      * @return      the destination array
512      */
513     public static byte [] strcpy(byte [] to, int tbase, byte [] from, int fbase) {
514         while (from[fbase] != 0)
515             to[tbase++] = from[fbase++];
516         to[tbase] = 0;
517         
518         return to;
519     }
520     /**
521      * Copy in the contents of another Ustr.  Does not change the offset.
522      *
523      * @param from source Ustr
524      * @return     this Ustr
525      */
526     public Ustr strcpy(Ustr from) {
527         strcpy(s, base, from.s, from.base);
528         return this;
529     }
530     
531     /**
532      * Copy in the String representation of an Object.  Does not change the
533      * offset.
534      *
535      * @param o the source object
536      * @return  this Ustr
537      */
538     public Ustr strcpy(Object   o) {
539         strcpy(new Ustr(o));
540         return this;
541     }
542     /**
543      * Copy in the contents of a null-terminated byte array.  Does not change
544      * the offset.
545      *
546      * @param from the byte array
547      * @return     this Ustr
548      */
549     public Ustr strcpy(byte[] from) {
550         strcpy(s, from);
551         return this;
552     }
553     /**
554      * Copy in the contents at some offset in a null-terminated byte array.
555      * Does not change the offset.
556      *
557      * @param from    the source byte array
558      * @param boffset where to start copying in the source array
559      * @return        this Ustr
560      */
561     public Ustr strcpy(byte[] from, int boffset) {
562         strcpy(s, 0, from, boffset);
563         return this;
564     }
565     /**
566      *
567      * Load a null-terminated UTF-8 encoding of a String into a byte array at
568      * the front.
569      *
570      * @param b      the byte array
571      * @param s      the String
572      *
573      * @return the byte array
574      */
575     public static byte [] strcpy(byte [] b, String   s) {
576         return strcpy(b, 0, s);
577     }
578     
579     /**
580      * Load a null-terminated UTF-8 encoding of a String into a byte array.
581      *
582      * @param b      the byte array
583      * @param offset where in the byte array to load
584      * @param s      the String
585      *
586      * @return the byte array
587      */
588     public static byte [] strcpy(byte [] b, int offset, String   s) {
589         byte [] sbytes;
590         
591         try { sbytes = s.getBytes("UTF8"); } catch (java.io.UnsupportedEncodingException   e) {
592             throw new RuntimeException  ("UTF8 not supported!?!?"); }
593         
594         for (int i = 0; i < sbytes.length; i++)
595             b[offset + i] = sbytes[i];
596         b[offset + sbytes.length] = 0;
597         return b;
598     }
599     
600     
601     // safe versions
602     // could check for to.length myself, but since Java is necessarily
603     //  doing this for me each time around the loop, why bother?
604     //
605     /**
606      * Safely append one Ustr to another.
607      *
608      * @param from the Ustr to be appended
609      * @return     this
610      */
611     public Ustr sstrcat(Ustr from) {
612         sstrcat(s, base, from.s, from.base);
613         return this;
614     }
615     
616     /**
617      * Safely append one null-terminated byte array to another.  Destination
618      * buffer will not be overrun.
619      *
620      * @param to   dest array
621      * @param from source array
622      * @return     dest array
623      */
624     public byte [] sstrcat(byte [] to, byte[] from) {
625         return sstrcat(to, 0, from, 0);
626     }
627     /**
628      * Safely append one null-terminated byte array to another with control
629      * over offsets.  Destination buffer will not be overrun.
630      *
631      * @param to    dest array
632      * @param tbase base of dest array
633      * @param from  source array
634      * @param fbase base of source array
635      * @return to
636      */
637     public static byte [] sstrcat(byte [] to, int tbase, byte [] from, int fbase) {
638         // don't want to catch if the dest string is malformed
639         while (to[tbase] != 0)
640             tbase++;
641         
642         try {
643             while (from[fbase] != 0)
644                 to[tbase++] = from[fbase++];
645             to[tbase] = 0;
646             
647             return to;
648         } catch (java.lang.ArrayIndexOutOfBoundsException   e) {
649             if (tbase >= to.length)
650                 to[to.length - 1] = 0;
651             else
652                 throw e;
653         }
654         return to;
655     }
656     
657     /**
658      * Safely copy null-terminated byte arrays with control over offsets.
659      * Destination buffer will not be overrun.
660      *
661      * @param to    destination array
662      * @param tbase starting offset in destination array
663      * @param from  source array
664      * @param fbase starting offset in source array`
665      * @return      the destination array
666      */
667     public static byte [] sstrcpy(byte [] to, int tbase, byte [] from, int fbase) {
668         try {
669             while (from[fbase] != 0)
670                 to[tbase++] = from[fbase++];
671             to[tbase] = 0;
672         }
673         
674         catch (java.lang.ArrayIndexOutOfBoundsException   e) {
675             // if the buffer's too short
676             if (tbase >= to.length)
677                 to[to.length - 1] = 0;
678             
679             // otherwise there's some problem with the source string, we
680             //  shouldn't catch it
681             else
682                 throw e;
683         }
684         return to;
685     }
686     /**
687      * Safely copy a null-terminated byte array.  The destination buffer will not
688      * be overrun.
689      *
690      * @param to   destination array
691      * @param from source array
692      * @return     the destination array
693      */
694     public static byte [] sstrcpy(byte [] to, byte [] from) {
695         return sstrcpy(to, 0, from, 0);
696     }
697     
698     /**
699      * Safely copy in the contents of another Ustr.  Does not change the offset.
700      * The destination buffer will not be overrun.
701      *
702      * @param from source Ustr
703      * @return     this Ustr
704      */
705     public Ustr sstrcpy(Ustr from) {
706         sstrcpy(s, base, from.s, from.base);
707         return this;
708     }
709     
710     /**
711      * Copy one null-terminated array to the end of another, with
712      * starting offsets for each
713      *
714      * @param to    destination array
715      * @param tbase  base pos of destination
716      * @param from  source array
717      * @param fbase base pos of source
718      * @return      destination
719      */
720     public static byte [] strcat(byte [] to, int tbase, byte [] from, int fbase) {
721         while (to[tbase] != 0)
722             tbase++;
723         
724         while (from[fbase] != 0)
725             to[tbase++] = from[fbase++];
726         to[tbase] = 0;
727         
728         return to;
729     }
730     
731     /**
732      * Copy one null-terminated byte array to the end of another.
733      *
734      * @param to   destination array
735      * @param from source array
736      * @return     the destionation array
737      */
738     public static byte [] strcat(byte [] to, byte [] from) {
739         return strcat(to, 0, from, 0);
740     }
741     
742     /**
743      * Append the contents of another Ustr to the end of this one
744      *
745      * @param  other the other Ustr
746      * @return       this Ustr
747      */
748     public Ustr strcat(Ustr other) {
749         strcat(s, other.s);
750         return this;
751     }
752     
753     /**
754      * Compare two null-terminated byte arrays.  The ordering is that of
755      * native Unicode code points and probably not culturally appropriate
756      * anywhere.
757      *
758      * @param s1 first byte array
759      * @param s2 second byte array
760      * @return   a negative number, zero, or a positive number depending
761      * on whether s1 is lexically less than, equal to, or greater than s2.   */
762     public static int strcmp(byte [] s1, byte [] s2) {
763         return strcmp(s1, 0, s2, 0);
764     }
765     /**
766      * Compare sections of two null-terminated byte arrays.  The ordering is
767      * that of
768      * native Unicode code points and probably not culturally appropriate
769      * anywhere.
770      *
771      * @param s1     first byte array
772      * @param s1base byte offset in first array to start comparing
773      * @param s2     second byte array
774      * @param s2base byte offset in second array to start comparing
775      * @return       a negative number, zero, or a positive number depending on
776      * whether s1 is lexically less than, equal to, or greater than s2.
777      */
778     public static int strcmp(byte [] s1, int s1base, byte [] s2, int s2base) {
779         
780         Ustr u1 = new Ustr(s1, s1base);
781         Ustr u2 = new Ustr(s2, s2base);
782         
783         int c1 = u1.nextChar();
784         int c2 = u2.nextChar();
785         
786         while (c1 != 0 && c2 != 0 && c1 == c2) {
787             c1 = u1.nextChar();
788             c2 = u2.nextChar();
789         }
790         
791         return c1 - c2;
792     }
793     /**
794      * Compare two Ustrs.  The ordering is that of
795      * native Unicode code points and probably not culturally appropriate
796      * anywhere.
797      *
798      * @param other the other Ustr
799      * @return   a negative number, zero, or a positive number depending on
800      * whether the other is lexically less than, equal to, or greater than this.
801      */
802     public int strcmp(Ustr other) {
803         return strcmp(s, base, other.s, other.base);
804     }
805     
806     /**
807      * Compare a Ustr to an object's String representation.  The ordering
808      * is that of native Unicode code points and probably not culturally
809      * appropriate anywhere.
810      *
811      * @param other the other Object
812      * @return   a negative number, zero, or a positive number depending on
813      * whether the other is lexically less than, equal to, or greater than this.
814      */
815     public int strcmp(Object   other) {
816         return strcmp(new Ustr(other));
817     }
818     
819     /**
820      * Locate a Unicode character in a Ustr.  Returns null if not
821      * found; if the character is zero, finds the offset of the null termination.
822      *
823      * @param c the character, as an integer
824      * @return  a Ustr with the same buffer, starting at the matching character,
825      * or null if it's not found.
826      */
827     public Ustr strchr(int c) {
828         int where = strchr(s, c);
829         return (where == -1) ? null : new Ustr(s, where);
830     }
831     
832     /**
833      * Find the offset where a Unicode character starts in a null-terminated
834      * UTF-encoded byte array.
835      * Returns -1 if not found; if the character is zero, finds the index of
836      * the null termination.
837      *
838      * @param b UTF-encoded null-terminated byte array
839      * @return  the offset in the string, or -1
840      */
841     public static int strchr(byte [] b, int c) {
842         byte [] cbytes = new byte[10];
843         appendChar(c, cbytes, 0);
844         return strstr(b, cbytes);
845     }
846     
847     /**
848      * Locate the last occurrence of a Unicode character in a Ustr.
849      * If found, returns a Ustr built around the same buffer as
850      * this, with the base set to the matching location.  If not found, null
851      *
852      * @param c the character, as an integer
853      * @return  a Ustr with the base set to the match, or null
854      */
855     public Ustr strrchr(int c) {
856         int where = strrchr(s, c);
857         return (where == -1) ? null : new Ustr(s, where);
858     }
859     
860     /**
861      * Find the index of the last appearance of a Unicode character in a
862      * null-terminated UTF-encoded byte array.
863      * Returns -1 if not found.
864      *
865      * @param b the byte array
866      * @param c the integer
867      * @return  the offset where the last occurence of c starts, or -1
868      */
869     public static int strrchr(byte [] b, int c) {
870         byte [] cbytes = new byte[10];
871         appendChar(c, cbytes, 0);
872         
873         int where = b.length - strlen(cbytes);
874         while (where >= 0) {
875             int i;
876             for (i = 0; cbytes[i] != 0; i++)
877                 if (b[where + i] != cbytes[i])
878                     break;
879             if (cbytes[i] == 0)
880                 return where;
881             where--;
882         }
883         return -1;
884     }
885     
886     /**
887      * Locate a substring in a string.  Returns a Ustr built around the same
888      * buffer, but starting at the matching position, or null if no match
889      * is found.
890      *
891      * @param little the substring to be located
892      * @return       matching Ustr, or null
893      */
894     public Ustr strstr(Ustr little) {
895         int where = strstr(s, little.s);
896         return (where == -1) ? null : new Ustr(s, where);
897     }
898     
899     /**
900      * locate a substring in a byte array.  Returns the offset of the substring
901      * if it matches, otherwise -1.
902      *
903      * @param big    the array to search in
904      * @param little the array to search for
905      * @return       the index of the match, or -1
906      */
907     public static int strstr(byte [] big, byte [] little) {
908         // should BoyerMooreify this...
909         
910         for (int bi = 0; big[bi] != 0; bi++) {
911             int li;
912             for (li = 0; little[li] != 0; li++)
913                 if (big[bi + li] != little[li])
914                     break;
915             if (little[li] == 0)
916                 return bi;
917         }
918         return -1;
919     }
920     
921     /////////////////////////////////////////////////////////////////
922     // From here on down the methods are those from java.lang.String
923     /////////////////////////////////////////////////////////////////
924     
925     /**
926      * Returns a Ustr generated from the char array.
927      *
928      * @param data the char array
929      * @return     a new Ustr
930      */
931     static Ustr copyValueOf(char [] data) {
932         return new Ustr(data);
933     }
934     
935     /**
936      * Returns a Ustr generated from a piece of the char array.
937      *
938      * @param data   the char array
939      * @param offset where to start generating from
940      * @param count  how many java chars to use
941      * @return       a new Ustr
942      */
943     static Ustr copyValueOf(char [] data, int offset, int count) {
944         char [] chunk = new char[count];
945         for (int i = 0; i < count; i++)
946             chunk[i] = data[offset + i];
947         return new Ustr(chunk);
948     }
949     
950     /**
951      * find the Unicode character at some index in a Ustr.  Throws an
952      * IndexOutOfBounds exception if appropriate.
953      *
954      * @param at the index
955      * @return   the Unicode character, as an integer
956      */
957     public int charAt(int at)
958             throws IndexOutOfBoundsException   {
959         if (at < 0)
960             throw new IndexOutOfBoundsException  ("Negative Ustr charAt");
961         int c = 0;
962         offset = 0;
963         prepareNext();
964         do {
965             c = nextChar();
966             at--;
967         } while (c != 0 && at >= 0);
968         
969         if (at > 0)
970             throw new IndexOutOfBoundsException  ("Ustr charAt too large");
971         return c;
972     }
973     
974     /**
975      * Append a String to the end of this.
976      *
977      * @param str the string
978      * @return a  a new Ustr which contains the concatenation
979      */
980     public Ustr concat(String   str) {
981         Ustr us = new Ustr(str);
982         return concat(us);
983     }
984     
985     /**
986      * Append a Ustr to the end of this.
987      *
988      * @param us the ustr to append
989      * @return   a new ustr
990      */
991     public Ustr concat(Ustr us) {
992         Ustr ret = new Ustr(strlen() + us.strlen() + 1);
993         ret.strcpy(this);
994         ret.strcat(us);
995         return ret;
996     }
997     
998     /**
999      * Test if this Ustr ends with the specified suffix (a Ustr).
1000     *
1001     * @param suffix the possible suffix.
1002     * @return       true or false.
1003     */
1004    public boolean endsWith(Ustr suffix) {
1005        int start = strlen() - suffix.strlen();
1006        if (start < 0)
1007            return false;
1008        return (strcmp(s, base + start, suffix.s, suffix.base) == 0);
1009    }
1010    
1011    /**
1012     * Test if this Ustr ends with specified suffix (a String).
1013     *
1014     * @param suffix the possible suffix
1015     * @return       true or false
1016     */
1017    public boolean endsWith(String   suffix) {
1018        return endsWith(new Ustr(suffix));
1019    }
1020    
1021    /**
1022     * Compares this Ustr to another object.
1023     *
1024     * @param anObject the other object
1025     * @return         true or false
1026     */
1027    public boolean equals(Object   anObject) {
1028        return (compareTo(anObject) == 0);
1029    }
1030    
1031    /**
1032     * Convert this Ustr into bytes according to the platform's default
1033     * character encoding, storing the result in a new byte array.
1034     *
1035     * @return a new byte array
1036     */
1037    public byte [] getBytes() {
1038        return toString().getBytes();
1039    }
1040    
1041    /**
1042     * Convert this Ustr into bytes according to the specified
1043     * character encoding, storing the result into a new byte array.
1044     *
1045     * @param enc the encoding to use in generating bytes
1046     * @return    the new byte array
1047     */
1048    public byte [] getBytes(String   enc)
1049            throws java.io.UnsupportedEncodingException   {
1050        return toString().getBytes(enc);
1051    }
1052    
1053    /**
1054     * Copies Unicode characters from this String into the destination
1055     * char array.  Note that if the String contains UTF-16 surrogate
1056     * pairs, each pair counts as a single character.
1057     *
1058     * @param str      the string
1059     * @param srcBegin where to start copying
1060     * @param srcEnd   index after last char to copy
1061     * @param dst      start of destination array
1062     * @param dstBegin where in the destination array to start copying
1063     */
1064    public static void getChars(String   str, int srcBegin, int srcEnd,
1065            char [] dst, int dstBegin) {
1066        Ustr us = new Ustr(str);
1067        us.getChars(srcBegin, srcEnd, dst, dstBegin);
1068    }
1069    
1070    /**
1071     * Copies Unicode characters from this Ustr into the destination
1072     * char array.  We can't just dispatch to the String implementation
1073     * because we do Unicode characters, it does UTF-16 code points
1074     *
1075     * @param srcBegin where to start copying
1076     * @param srcEnd   index after last char to copy
1077     * @param dst      start of destination array
1078     * @param dstBegin where in the destination array to start copying
1079     */
1080    public void getChars(int srcBegin, int srcEnd, char [] dst, int dstBegin) {
1081        if (srcBegin < 0 || srcBegin > srcEnd || dstBegin < 0)
1082            throw new IndexOutOfBoundsException  ("bogus getChars index bounds");
1083        if (dst == null)
1084            throw new NullPointerException  ("null 'dst' argument to getChars");
1085        
1086        prepareNext();
1087        while (srcBegin > 0) {
1088            srcBegin--;
1089            nextChar();
1090        }
1091        int c;
1092        int howMany = srcEnd - srcBegin;
1093        int i, j;
1094        for (i = j = 0; i < howMany; i++, j++) {
1095            c = nextChar();
1096            if (c == 0 && i < howMany - 1)
1097                throw new IndexOutOfBoundsException  ("getChars ran off buffer");
1098            if (c < 0x10000)
1099                dst[dstBegin + j] = (char) c;
1100            else {
1101                // two UTF-16 codepoints
1102                // 10346 => D800/DF46
1103                // 000uuuuuxxxxxxxxxxxxxxxx 110110wwwwxxxxxx 110111xxxxxxxxxx
1104                // where wwww = uuuuu - 1
1105                
1106                c -= 0x10000;
1107                int uHi = (c >> 10) & 0x3ff;
1108                dst[dstBegin + j] = (char) (0xd800 | uHi);
1109                j++;
1110                
1111                int uLo = c & 0x3ff;
1112                dst[dstBegin + j] = (char) (0xdc00 | uLo);
1113            }
1114        }
1115    }
1116    
1117    /**
1118     * Returns a hashcode for this Ustr.  The algorithm is that documented
1119     * for String, only that documentation says 'int'
1120     * arithmetic, which is clearly wrong, but this produces the same result
1121     * as String's hashCode() for the strings "1" and "2", and thus by
1122     * induction must be correct.
1123     *
1124     * @return an integer hashcode
1125     */
1126    public int hashCode() {
1127        long h = 0;
1128        long c;
1129        long n = length() - 1;
1130        prepareNext();
1131        while ((c = nextChar()) != 0) {
1132            h += c * pow(31, n);
1133            n--;
1134        }
1135        return (int) (h & 0xffffffff);
1136    }
1137    
1138    // er blush I'm on a plane and can't find long exponentiation in Java
1139    private static long pow(long a, long b) {
1140        long p = 1;
1141        while (b-- > 0)
1142            p *= a;
1143        return p;
1144    }
1145    
1146    /**
1147     * Returns the first index within this Ustr of the specified
1148     * Unicode character.
1149     *
1150     * @param ch    the character
1151     * @return      index (usable by charAt) in the string of the char, or -1
1152     */
1153    public int indexOf(int ch) {
1154        return indexOf(ch, 0);
1155    }
1156    
1157    /**
1158     * Returns the first index within this Ustr of the specified
1159     * character, starting at the specified index.
1160     *
1161     * @param ch    the character
1162     * @param start where to start looking
1163     * @return      index (usable by charAt) in the string of the char, or -1
1164     */
1165    public int indexOf(int ch, int start) {
1166        int i = 0;
1167        prepareNext();
1168        while (start-- > 0) {
1169            nextChar();
1170            i++;
1171        }
1172        int c;
1173        while ((c = nextChar()) != 0) {
1174            if (c == ch)
1175                return i;
1176            i++;
1177        }
1178        if (ch == 0)
1179            return i;
1180        return -1;
1181    }
1182    
1183    /**
1184     * Returns the index within this Ustr of the first occurrence of the
1185     * specified other Ustr, or -1.
1186     *
1187     * @param us the other Ustr
1188     * @return   the index of the match, or -1
1189     */
1190    public int indexOf(Ustr us) {
1191        return indexOf(us, 0);
1192    }
1193    
1194    /**
1195     * Returns the index within this Ustr of the first occurrence of the
1196     * specified other Ustr starting at the given offset, or -1.
1197     *
1198     * @param us    the other Ustr
1199     * @param start the index to start looking
1200     * @return      the index of the match, or -1
1201     */
1202    public int indexOf(Ustr us, int start) {
1203        int i = 0;
1204        prepareNext();
1205        while (start-- > 0) {
1206            nextChar();
1207            i++;
1208        }
1209        
1210        // we'll work at the UTF level, but this should be BoyerMoore-ized
1211        do {
1212            int j;
1213            for (j = 0; s[base + offset + j] != 0 && us.s[us.base + j] != 0; j++)
1214                if (s[base + offset + j] != us.s[us.base + j])
1215                    break;
1216            if (us.s[base + j] == 0)
1217                return i;
1218            i++;
1219        } while (nextChar() != 0);
1220        
1221        return -1;
1222    }
1223    
1224    /**
1225     * returns a canonical version of the Ustr, which should be treated as
1226     * read-only.  Differs from the intern function
1227     * of String in that it never returns the input string; if a new hashtable
1228     * entry is required, it makes a new Ustr and returns that.  If a programmer
1229     * updates the contents of a Ustr returned from intern(), grave disorder
1230     * will ensue.
1231     *
1232     * @return the canonical version of the Ustr.
1233     */
1234    public Ustr intern() {
1235        Ustr u = (Ustr)interns.get(this);
1236        if (u != null)
1237            return u;
1238        
1239        u = new Ustr(strlen() + 1);
1240        u.strcpy(this);
1241        interns.put(u, u);
1242        return u;
1243    }
1244    
1245    /**
1246     * Returns the index within this Ustr of the last occurrence of the
1247     * specified Unicode character.
1248     *
1249     * @param ch   the character
1250     * @return     the last index of the character, or -1
1251     */
1252    public int lastIndexOf(int ch) {
1253        return lastIndexOf(ch, length());
1254    }
1255    
1256    /**
1257     * Returns the index within this Ustr of the last occurrence of the
1258     * specified Unicode character before the specified stop index.
1259     *
1260     * @param ch   the character
1261     * @param stop last index to consider
1262     * @return     the last index of the character, or -1
1263     */
1264    public int lastIndexOf(int ch, int stop) {
1265        int i = 0;
1266        prepareNext();
1267        int foundAt = -1;
1268        do {
1269            if (ch == nextChar())
1270                foundAt = i;
1271            i++;
1272        } while (i <= stop);
1273        
1274        return foundAt;
1275    }
1276    
1277    /**
1278     * Finds the last substring match.
1279     *
1280     * @param us   the subtring to search for
1281     * @return     the match index, or =1
1282     */
1283    public int lastIndexOf(Ustr us) {
1284        return lastIndexOf(us, length());
1285    }
1286    
1287    /**
1288     * Finds the last substring match before the given index.
1289     *
1290     * @param us   the subtring to search for
1291     * @param stop where to stop searching
1292     * @return     the match index, or =1
1293     */
1294    public int lastIndexOf(Ustr us, int stop) {
1295        int i = 0;
1296        int foundAt = -1;
1297        
1298        // we'll work at the UTF level, but this should be BoyerMoore-ized
1299        prepareNext();
1300        do {
1301            int j;
1302            for (j = 0; s[base + offset + j] != 0 && us.s[us.base + j] != 0; j++)
1303                if (s[base + offset + j] != us.s[us.base + j])
1304                    break;
1305            if (us.s[base + j] == 0)
1306                foundAt = i;
1307            i++;
1308        } while (nextChar() != 0 && i <= stop);
1309        
1310        return foundAt;
1311    }
1312    
1313    private static int bytesInChar(int c) {
1314        if (c < 128)
1315            return 1;
1316        else if (c < 0x800)
1317            return 2;
1318        else if (c < 0x10000)
1319            return 3;
1320        else
1321            return 4;
1322    }
1323    
1324    /**
1325     * returns a new Ustr with all instances of one Unicode character replaced
1326     * by another.  Throws a UstrException if newChar
1327     * is not a Unicode codepoint (negative or >0x10ffff).
1328     *
1329     * @param oldChar the Unicode character to be replaced
1330     * @param newChar the Unicode character to replace it with
1331     * @return        the new Ustr
1332     * @throws        UstrException
1333     */
1334    public Ustr replace(int oldChar, int newChar) {
1335        if (newChar < 0)
1336            throw new UstrException("Negative replacement character");
1337        else if (newChar > 0x10ffff)
1338            throw new UstrException("Replacement character > 0x10ffff");
1339        
1340        // figure out how much space we need
1341        int space = strlen() + 1;
1342        int delta = bytesInChar(newChar) - bytesInChar(newChar);
1343        if (delta != 0) {
1344            int c;
1345            
1346            while ((c = nextChar()) != 0)
1347                if (c == oldChar)
1348                    space += delta;
1349        }
1350        
1351        Ustr us = new Ustr(space);
1352        prepareNext(); us.prepareAppend();
1353        int c;
1354        while ((c = nextChar()) != 0)
1355            us.appendChar((c == oldChar) ? newChar : c);
1356        return us;
1357    }
1358    
1359    /**
1360     * Tests if other Ustr is prefix of this.
1361     *
1362     * @param us    the other Ustr
1363     * @return      true/false
1364     */
1365    public boolean startsWith(Ustr us) {
1366        return startsWith(us, 0);
1367    }
1368    
1369    /**
1370     * Tests if other Ustr is prefix at given index.
1371     *
1372     * @param us    the other Ustr
1373     * @param start where to test
1374     * @return      true/false
1375     */
1376    public boolean startsWith(Ustr us, int start) {
1377        prepareNext();
1378        while (start-- > 0)
1379            nextChar();
1380        
1381        for (int i = 0; us.s[base + i] != 0; i++)
1382            if (s[base + offset + i] != us.s[us.base + i])
1383                return false;
1384        
1385        return true;
1386    }
1387    
1388    /**
1389     * makes a new substring of a Ustr given a start index.
1390     *
1391     * @param start index of start of substr
1392     * @return      new Ustr containing substr
1393     */
1394    public Ustr substring(int start) {
1395        return substring(start, length());
1396    }
1397    
1398    /**
1399     * makes a new substring of a Ustr identified by start and end
1400     *  indices.
1401     *
1402     * @param start index of start of substr
1403     * @param end   index of end of substr
1404     * @return      new Ustr containing substr
1405     */
1406    public Ustr substring(int start, int end) {
1407        if (start < 0 || end < start || end > length())
1408            throw new IndexOutOfBoundsException  ("bogus start/end");
1409        
1410        int howMany = end - start;
1411        offset = 0;
1412        
1413        // move up to the start
1414        while (start-- > 0) {
1415            int c = s[base + offset] & 0xff;
1416            if (c == 0)
1417                throw new IndexOutOfBoundsException  ("substring too long");
1418            offset += encLength[c];
1419        }
1420        
1421        int startAt = offset;
1422        for (int i = 0; i < howMany; i++) {
1423            int c = s[base + offset] & 0xff;
1424            if (c == 0)
1425                throw new IndexOutOfBoundsException  ("substring too long");
1426            offset += encLength[c];
1427        }
1428        int bytesToMove = offset - startAt;
1429        Ustr us = new Ustr(bytesToMove + 1);
1430        System.arraycopy(s, startAt, us.s, 0, bytesToMove);
1431        us.s[bytesToMove] = 0;
1432        
1433    /*
1434    int to = 0;
1435    while (startAt < offset)
1436      us.s[to++] = s[startAt++];
1437    us.s[to] = 0;
1438     */
1439        
1440    /*
1441    prepareNext();
1442    while (start-- > 0)
1443      nextChar();
1444     
1445    Ustr us = new Ustr(strlen(s, offset) + 1);
1446     
1447    us.prepareAppend();
1448    for (int i = 0; i < howMany; i++) {
1449      int c = nextChar();
1450      if (c == 0)
1451        throw new IndexOutOfBoundsException("substring too long");
1452      us.appendChar(c);
1453    }
1454     */
1455        return us;
1456    }
1457    
1458    /**
1459     * converts a Ustr to a char array.
1460     *
1461     * @return the new char array
1462     */
1463    public char [] toCharArray() {
1464        return toString().toCharArray();
1465    }
1466}
1467
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags