KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > ibm > icu > lang > UScriptRun


/*
 *******************************************************************************
 *
 * Copyright (C) 1999-2006, International Business Machines
 * Corporation and others. All Rights Reserved.
 *
 *******************************************************************************
 */

9
10 package com.ibm.icu.lang;
11
12 import com.ibm.icu.text.UTF16;
13
14 /**
15  * <code>UScriptRun</code> is used to find runs of characters in
16  * the same script, as defined in the <code>UScript</code> class.
17  * It implements a simple iterator over an array of characters.
18  * The iterator will assign <code>COMMON</code> and <code>INHERITED</code>
19  * characters to the same script as the preceeding characters. If the
20  * COMMON and INHERITED characters are first, they will be assigned to
21  * the same script as the following characters.
22  *
23  * The iterator will try to match paired punctuation. If it sees an
24  * opening punctuation character, it will remember the script that
25  * was assigned to that character, and assign the same script to the
26  * matching closing punctuation.
27  *
28  * No attempt is made to combine related scripts into a single run. In
29  * particular, Hiragana, Katakana, and Han characters will appear in seperate
30  * runs.
31
32  * Here is an example of how to iterate over script runs:
33  * <pre>
34  * void printScriptRuns(char[] text)
35  * {
36  * UScriptRun scriptRun = new UScriptRun(text);
37  *
38  * while (scriptRun.next()) {
39  * int start = scriptRun.getScriptStart();
40  * int limit = scriptRun.getScriptLimit();
41  * int script = scriptRun.getScriptCode();
42  *
43  * System.out.println("Script \"" + UScript.getName(script) + "\" from " +
44  * start + " to " + limit + ".");
45  * }
46  * }
47  * </pre>
48  *
49  * @internal
50  * @deprecated This API is ICU internal only.
51  */

52 public final class UScriptRun
53 {
54     /**
55      * Puts a copyright in the .class file
56      */

57     private static final String JavaDoc copyrightNotice
58         = "Copyright \u00a91999-2002 IBM Corp. All rights reserved.";
59
60     /**
61      * Construct an empty <code>UScriptRun</code> object. The <code>next()</code>
62      * method will return <code>false</code> the first time it is called.
63      *
64      * @internal
65      * @deprecated This API is ICU internal only.
66      */

67     public UScriptRun()
68     {
69         char[] nullChars = null;
70         
71         reset(nullChars, 0, 0);
72     }
73     
74     /**
75      * Construct a <code>UScriptRun</code> object which iterates over the
76      * characters in the given string.
77      *
78      * @param text the string of characters over which to iterate.
79      *
80      * @internal
81      * @deprecated This API is ICU internal only.
82      */

83     public UScriptRun(String JavaDoc text)
84     {
85         reset (text);
86     }
87     
88     /**
89      * Construct a <code>UScriptRun</code> object which iterates over a subrange
90      * of the characetrs in the given string.
91      *
92      * @param text the string of characters over which to iterate.
93      * @param start the index of the first character over which to iterate
94      * @param count the number of characters over which to iterate
95      *
96      * @internal
97      * @deprecated This API is ICU internal only.
98      */

99     public UScriptRun(String JavaDoc text, int start, int count)
100     {
101         reset(text, start, count);
102     }
103
104     /**
105      * Construct a <code>UScriptRun</code> object which iterates over the given
106      * characetrs.
107      *
108      * @param chars the array of characters over which to iterate.
109      *
110      * @internal
111      * @deprecated This API is ICU internal only.
112      */

113     public UScriptRun(char[] chars)
114     {
115         reset(chars);
116     }
117
118     /**
119      * Construct a <code>UScriptRun</code> object which iterates over a subrange
120      * of the given characetrs.
121      *
122      * @param chars the array of characters over which to iterate.
123      * @param start the index of the first character over which to iterate
124      * @param count the number of characters over which to iterate
125      *
126      * @internal
127      * @deprecated This API is ICU internal only.
128      */

129     public UScriptRun(char[] chars, int start, int count)
130     {
131         reset(chars, start, count);
132     }
133
134
135     /**
136      * Reset the iterator to the start of the text.
137      *
138      * @internal
139      * @deprecated This API is ICU internal only.
140      */

141     public final void reset()
142     {
143         // empty any old parenStack contents.
144
// NOTE: this is not the most efficient way
145
// to do this, but it's the easiest to write...
146
while (stackIsNotEmpty()) {
147             pop();
148         }
149         
150         scriptStart = textStart;
151         scriptLimit = textStart;
152         scriptCode = UScript.INVALID_CODE;
153         parenSP = -1;
154         pushCount = 0;
155         fixupCount = 0;
156         
157         textIndex = textStart;
158     }
159
160     /**
161      * Reset the iterator to iterate over the given range of the text. Throws
162      * IllegalArgumentException if the range is outside of the bounds of the
163      * character array.
164      *
165      * @param start the index of the new first character over which to iterate
166      * @param count the new number of characters over which to iterate.
167      * @exception IllegalArgumentException
168      *
169      * @internal
170      * @deprecated This API is ICU internal only.
171      */

172     public final void reset(int start, int count)
173     throws IllegalArgumentException JavaDoc
174     {
175         int len = 0;
176         
177         if (text != null) {
178             len = text.length;
179         }
180         
181         if (start < 0 || count < 0 || start > len - count) {
182             throw new IllegalArgumentException JavaDoc();
183         }
184         
185         textStart = start;
186         textLimit = start + count;
187
188         reset();
189     }
190
191     /**
192      * Reset the iterator to iterate over <code>count</code> characters
193      * in <code>chars</code> starting at <code>start</code>. This allows
194      * clients to reuse an iterator.
195      *
196      * @param chars the new array of characters over which to iterate.
197      * @param start the index of the first character over which to iterate.
198      * @param count the number of characters over which to iterate.
199      *
200      * @internal
201      * @deprecated This API is ICU internal only.
202      */

203     public final void reset(char[] chars, int start, int count)
204     {
205         if (chars == null) {
206             chars = emptyCharArray;
207         }
208         
209         text = chars;
210
211         reset(start, count);
212     }
213     
214     /**
215      * Reset the iterator to iterate over the characters
216      * in <code>chars</code>. This allows clients to reuse an iterator.
217      *
218      * @param chars the new array of characters over which to iterate.
219      *
220      * @internal
221      * @deprecated This API is ICU internal only.
222      */

223     public final void reset(char[] chars)
224     {
225         int length = 0;
226         
227         if (chars != null) {
228             length = chars.length;
229         }
230         
231         reset(chars, 0, length);
232     }
233     
234     /**
235      * Reset the iterator to iterate over <code>count</code> characters
236      * in <code>text</code> starting at <code>start</code>. This allows
237      * clients to reuse an iterator.
238      *
239      * @param text the new string of characters over which to iterate.
240      * @param start the index of the first character over which to iterate.
241      * @param count the nuber of characters over which to iterate.
242      *
243      * @internal
244      * @deprecated This API is ICU internal only.
245      */

246     public final void reset(String JavaDoc text, int start, int count)
247     {
248         char[] chars = null;
249         
250         if (text != null) {
251             chars = text.toCharArray();
252         }
253         
254         reset(chars, start, count);
255     }
256     
257     /**
258      * Reset the iterator to iterate over the characters
259      * in <code>text</code>. This allows clients to reuse an iterator.
260      *
261      * @param text the new string of characters over which to iterate.
262      *
263      * @internal
264      * @deprecated This API is ICU internal only.
265      */

266     public final void reset(String JavaDoc text)
267     {
268         int length = 0;
269         
270         if (text != null) {
271             length = text.length();
272         }
273         
274         reset(text, 0, length);
275     }
276         
277
278
279     /**
280      * Get the starting index of the current script run.
281      *
282      * @return the index of the first character in the current script run.
283      *
284      * @internal
285      * @deprecated This API is ICU internal only.
286      */

287     public final int getScriptStart()
288     {
289         return scriptStart;
290     }
291
292     /**
293      * Get the index of the first character after the current script run.
294      *
295      * @return the index of the first character after the current script run.
296      *
297      * @internal
298      * @deprecated This API is ICU internal only.
299      */

300     public final int getScriptLimit()
301     {
302         return scriptLimit;
303     }
304
305     /**
306      * Get the script code for the script of the current script run.
307      *
308      * @return the script code for the script of the current script run.
309      * @see com.ibm.icu.lang.UScript
310      *
311      * @internal
312      * @deprecated This API is ICU internal only.
313      */

314     public final int getScriptCode()
315     {
316         return scriptCode;
317     }
318
319     /**
320      * Find the next script run. Returns <code>false</code> if there
321      * isn't another run, returns <code>true</code> if there is.
322      *
323      * @return <code>false</code> if there isn't another run, <code>true</code> if there is.
324      *
325      * @internal
326      * @deprecated This API is ICU internal only.
327      */

328     public final boolean next()
329     {
330         // if we've fallen off the end of the text, we're done
331
if (scriptLimit >= textLimit) {
332             return false;
333         }
334     
335         scriptCode = UScript.COMMON;
336         scriptStart = scriptLimit;
337         
338         syncFixup();
339         
340         while (textIndex < textLimit) {
341             int ch = UTF16.charAt(text, textStart, textLimit, textIndex - textStart);
342             int codePointCount = UTF16.getCharCount(ch);
343             int sc = UScript.getScript(ch);
344             int pairIndex = getPairIndex(ch);
345
346             textIndex += codePointCount;
347             
348             // Paired character handling:
349
//
350
// if it's an open character, push it onto the stack.
351
// if it's a close character, find the matching open on the
352
// stack, and use that script code. Any non-matching open
353
// characters above it on the stack will be poped.
354
if (pairIndex >= 0) {
355                 if ((pairIndex & 1) == 0) {
356                     push(pairIndex, scriptCode);
357                 } else {
358                     int pi = pairIndex & ~1;
359
360                     while (stackIsNotEmpty() && top().pairIndex != pi) {
361                         pop();
362                     }
363
364                     if (stackIsNotEmpty()) {
365                         sc = top().scriptCode;
366                     }
367                 }
368             }
369
370             if (sameScript(scriptCode, sc)) {
371                 if (scriptCode <= UScript.INHERITED && sc > UScript.INHERITED) {
372                     scriptCode = sc;
373
374                     fixup(scriptCode);
375                 }
376
377                 // if this character is a close paired character,
378
// pop the matching open character from the stack
379
if (pairIndex >= 0 && (pairIndex & 1) != 0) {
380                     pop();
381                 }
382             } else {
383                 // We've just seen the first character of
384
// the next run. Back over it so we'll see
385
// it again the next time.
386
textIndex -= codePointCount;
387                 break;
388             }
389         }
390
391         scriptLimit = textIndex;
392         return true;
393     }
394
395     /**
396      * Compare two script codes to see if they are in the same script. If one script is
397      * a strong script, and the other is INHERITED or COMMON, it will compare equal.
398      *
399      * @param scriptOne one of the script codes.
400      * @param scriptTwo the other script code.
401      * @return <code>true</code> if the two scripts are the same.
402      * @see com.ibm.icu.lang.UScript
403      */

404     private static boolean sameScript(int scriptOne, int scriptTwo)
405     {
406         return scriptOne <= UScript.INHERITED || scriptTwo <= UScript.INHERITED || scriptOne == scriptTwo;
407     }
408
409     /*
410      * An internal class which holds entries on the paren stack.
411      */

412     private static final class ParenStackEntry
413     {
414         int pairIndex;
415         int scriptCode;
416         
417         public ParenStackEntry(int thePairIndex, int theScriptCode)
418         {
419             pairIndex = thePairIndex;
420             scriptCode = theScriptCode;
421         }
422     }
423     
424     private static final int mod(int sp)
425     {
426         return sp % PAREN_STACK_DEPTH;
427     }
428     
429     private static final int inc(int sp, int count)
430     {
431         return mod(sp + count);
432     }
433     
434     private static final int inc(int sp)
435     {
436         return inc(sp, 1);
437     }
438     
439     private static final int dec(int sp, int count)
440     {
441         return mod(sp + PAREN_STACK_DEPTH - count);
442     }
443     
444     private static final int dec(int sp)
445     {
446         return dec(sp, 1);
447     }
448     
449     private static final int limitInc(int count)
450     {
451         if (count < PAREN_STACK_DEPTH) {
452             count += 1;
453         }
454         
455         return count;
456     }
457     
458     private final boolean stackIsEmpty()
459     {
460         return pushCount <= 0;
461     }
462     
463     private final boolean stackIsNotEmpty()
464     {
465         return ! stackIsEmpty();
466     }
467     
468     private final void push(int pairIndex, int scriptCode)
469     {
470         pushCount = limitInc(pushCount);
471         fixupCount = limitInc(fixupCount);
472         
473         parenSP = inc(parenSP);
474         parenStack[parenSP] = new ParenStackEntry(pairIndex, scriptCode);
475     }
476     
477     private final void pop()
478     {
479         
480         if (stackIsEmpty()) {
481             return;
482         }
483         
484         parenStack[parenSP] = null;
485         
486         if (fixupCount > 0) {
487             fixupCount -= 1;
488         }
489         
490         pushCount -= 1;
491         parenSP = dec(parenSP);
492         
493         // If the stack is now empty, reset the stack
494
// pointers to their initial values.
495
if (stackIsEmpty()) {
496             parenSP = -1;
497         }
498     }
499     
500     private final ParenStackEntry top()
501     {
502         return parenStack[parenSP];
503     }
504     
505     private final void syncFixup()
506     {
507         fixupCount = 0;
508     }
509     
510     private final void fixup(int scriptCode)
511     {
512         int fixupSP = dec(parenSP, fixupCount);
513         
514         while (fixupCount-- > 0) {
515             fixupSP = inc(fixupSP);
516             parenStack[fixupSP].scriptCode = scriptCode;
517         }
518     }
519     
520     private char[] emptyCharArray = {};
521
522     private char[] text;
523
524     private int textIndex;
525     private int textStart;
526     private int textLimit;
527     
528     private int scriptStart;
529     private int scriptLimit;
530     private int scriptCode;
531
532     private static int PAREN_STACK_DEPTH = 32;
533     private static ParenStackEntry parenStack[] = new ParenStackEntry[PAREN_STACK_DEPTH];
534     private int parenSP = -1;
535     private int pushCount = 0;
536     private int fixupCount = 0;
537
538     /**
539      * Find the highest bit that's set in a word. Uses a binary search through
540      * the bits.
541      *
542      * @param n the word in which to find the highest bit that's set.
543      * @return the bit number (counting from the low order bit) of the highest bit.
544      */

545     private static final byte highBit(int n)
546     {
547         if (n <= 0) {
548             return -32;
549         }
550
551         byte bit = 0;
552
553         if (n >= 1 << 16) {
554             n >>= 16;
555             bit += 16;
556         }
557
558         if (n >= 1 << 8) {
559             n >>= 8;
560             bit += 8;
561         }
562
563         if (n >= 1 << 4) {
564             n >>= 4;
565             bit += 4;
566         }
567
568         if (n >= 1 << 2) {
569             n >>= 2;
570             bit += 2;
571         }
572
573         if (n >= 1 << 1) {
574             n >>= 1;
575             bit += 1;
576         }
577
578         return bit;
579     }
580
581     /**
582      * Search the pairedChars array for the given character.
583      *
584      * @param ch the character for which to search.
585      * @return the index of the character in the table, or -1 if it's not there.
586      */

587     private static int getPairIndex(int ch)
588     {
589         int probe = pairedCharPower;
590         int index = 0;
591
592         if (ch >= pairedChars[pairedCharExtra]) {
593             index = pairedCharExtra;
594         }
595
596         while (probe > (1 << 0)) {
597             probe >>= 1;
598
599             if (ch >= pairedChars[index + probe]) {
600                 index += probe;
601             }
602         }
603
604         if (pairedChars[index] != ch) {
605             index = -1;
606         }
607
608         return index;
609     }
610
611     private static int pairedChars[] = {
612         0x0028, 0x0029, // ascii paired punctuation
613
0x003c, 0x003e,
614         0x005b, 0x005d,
615         0x007b, 0x007d,
616         0x00ab, 0x00bb, // guillemets
617
0x2018, 0x2019, // general punctuation
618
0x201c, 0x201d,
619         0x2039, 0x203a,
620         0x3008, 0x3009, // chinese paired punctuation
621
0x300a, 0x300b,
622         0x300c, 0x300d,
623         0x300e, 0x300f,
624         0x3010, 0x3011,
625         0x3014, 0x3015,
626         0x3016, 0x3017,
627         0x3018, 0x3019,
628         0x301a, 0x301b
629     };
630
631     private static int pairedCharPower = 1 << highBit(pairedChars.length);
632     private static int pairedCharExtra = pairedChars.length - pairedCharPower;
633 }
634
635
Popular Tags