KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > ibm > icu > text > RuleBasedBreakIterator_Old


1 /*
2  *******************************************************************************
3  * Copyright (C) 1996-2006, International Business Machines Corporation and *
4  * others. All Rights Reserved. *
5  *******************************************************************************
6  */

7
8
9 /*
10  * @(#)RuleBasedBreakIterator.java 1.3 99/04/07
11  *
12  */

13
14 package com.ibm.icu.text;
15
16 import com.ibm.icu.util.CompactByteArray;
17 import com.ibm.icu.impl.Utility;
18 import java.util.Vector JavaDoc;
19 import java.util.Stack JavaDoc;
20 import java.util.Hashtable JavaDoc;
21 import java.util.Enumeration JavaDoc;
22 import java.text.CharacterIterator JavaDoc;
23 import java.text.StringCharacterIterator JavaDoc;
24
25 import java.io.*;
26
27 /**
28  * <p>A subclass of BreakIterator whose behavior is specified using a list of rules.</p>
29  *
30  * <p>There are two kinds of rules, which are separated by semicolons: <i>substitutions</i>
31  * and <i>regular expressions.</i></p>
32  *
33  * <p>A substitution rule defines a name that can be used in place of an expression. It
34  * consists of a name, an equals sign, and an expression. (There can be no whitespace on
35  * either side of the equals sign.) To keep its syntactic meaning intact, the expression
36  * must be enclosed in parentheses or square brackets. A substitution is visible after its
37  * definition, and is filled in using simple textual substitution (when a substitution is
38  * used, its name is enclosed in curly braces. The curly braces are optional in the
39  * substition's definition). Substitution definitions can contain other substitutions, as
40  * long as those substitutions have been defined first. Substitutions are generally used to
41  * make the regular expressions (which can get quite complex) shorter and easier to read.
42  * They typically define either character categories or commonly-used subexpressions.</p>
43  *
44  * <p>There is one special substitution.&nbsp; If the description defines a substitution
45  * called &quot;_ignore_&quot;, the expression must be a [] expression, and the
46  * expression defines a set of characters (the &quot;<em>ignore characters</em>&quot;) that
47  * will be transparent to the BreakIterator.&nbsp; A sequence of characters will break the
48  * same way it would if any ignore characters it contains are taken out.&nbsp; Break
49  * positions never occur before ignore characters, except when the character before the
50  * ignore characters is a line or paragraph terminator.</p>
51  *
52  * <p>A regular expression uses a syntax similar to the normal Unix regular-expression
53  * syntax, and defines a sequence of characters to be kept together. With one significant
54  * exception, the iterator uses a longest-possible-match algorithm when matching text to regular
55  * expressions. The iterator also treats descriptions containing multiple regular expressions
56  * as if they were ORed together (i.e., as if they were separated by |).</p>
57  *
58  * <p>The special characters recognized by the regular-expression parser are as follows:</p>
59  *
60  * <blockquote>
61  * <table border="1" width="100%">
62  * <tr>
63  * <td width="6%">*</td>
64  * <td width="94%">Specifies that the expression preceding the asterisk may occur any number
65  * of times (including not at all).</td>
66  * </tr>
67  * <tr>
68  * <td width="6%">+</td>
69  * <td width="94%">Specifies that the expression preceding the asterisk may occur one or
70  * more times, but must occur at least once.</td>
71  * </tr>
72  * <tr>
73  * <td width="6%">?</td>
74  * <td width="94%">Specifies that the expression preceding the asterisk may occur once
75  * or not at all (i.e., it makes the preceding expression optional).</td>
76  * </tr>
77  * <tr>
78  * <td width="6%">()</td>
79  * <td width="94%">Encloses a sequence of characters. If followed by * or +, the
80  * sequence repeats. If followed by ?, the sequence is optional. Otherwise, the
81  * parentheses are just a grouping device and a way to delimit the ends of expressions
82  * containing |.</td>
83  * </tr>
84  * <tr>
85  * <td width="6%">|</td>
86  * <td width="94%">Separates two alternative sequences of characters.&nbsp; Either one
87  * sequence or the other, but not both, matches this expression.&nbsp; The | character can
88  * only occur inside ().</td>
89  * </tr>
90  * <tr>
91  * <td width="6%">.</td>
92  * <td width="94%">Matches any character.</td>
93  * </tr>
94  * <tr>
95  * <td width="6%">*?</td>
96  * <td width="94%">Specifies a non-greedy asterisk.&nbsp; *? works the same way as *, except
97  * when there is overlap between the last group of characters in the expression preceding the
98  * * and the first group of characters following the *.&nbsp; When there is this kind of
99  * overlap, * will match the longest sequence of characters that match the expression before
100  * the *, and *? will match the shortest sequence of characters matching the expression
101  * before the *?.&nbsp; For example, if you have &quot;xxyxyyyxyxyxxyxyxyy&quot; in the text,
102  * &quot;x[xy]*x&quot; will match through to the last x (i.e., &quot;<strong>xxyxyyyxyxyxxyxyx</strong>yy&quot;,
103  * but &quot;x[xy]*?x&quot; will only match the first two xes (&quot;<strong>xx</strong>yxyyyxyxyxxyxyxyy&quot;).</td>
104  * </tr>
105  * <tr>
106  * <td width="6%">[]</td>
107  * <td width="94%">Specifies a group of alternative characters.&nbsp; A [] expression will
108  * match any single character that is specified in the [] expression.&nbsp; For more on the
109  * syntax of [] expressions, see below.</td>
110  * </tr>
111  * <tr>
112  * <td width="6%">/</td>
113  * <td width="94%">Specifies where the break position should go if text matches this
114  * expression.&nbsp; (e.g., &quot;[a-z]&#42;/[:Zs:]*[1-0]&quot; will match if the iterator sees a run
115  * of letters, followed by a run of whitespace, followed by a digit, but the break position
116  * will actually go before the whitespace).&nbsp; Expressions that don't contain / put the
117  * break position at the end of the matching text.</td>
118  * </tr>
119  * <tr>
120  * <td width="6%">\</td>
121  * <td width="94%">Escape character.&nbsp; The \ itself is ignored, but causes the next
122  * character to be treated as literal character.&nbsp; This has no effect for many
123  * characters, but for the characters listed above, this deprives them of their special
124  * meaning.&nbsp; (There are no special escape sequences for Unicode characters, or tabs and
125  * newlines; these are all handled by a higher-level protocol.&nbsp; In a Java string,
126  * &quot;\n&quot; will be converted to a literal newline character by the time the
127  * regular-expression parser sees it.&nbsp; Of course, this means that \ sequences that are
128  * visible to the regexp parser must be written as \\ when inside a Java string.)&nbsp; All
129  * characters in the ASCII range except for letters, digits, and control characters are
130  * reserved characters to the parser and must be preceded by \ even if they currently don't
131  * mean anything.</td>
132  * </tr>
133  * <tr>
134  * <td width="6%">!</td>
135  * <td width="94%">If ! appears at the beginning of a regular expression, it tells the regexp
136  * parser that this expression specifies the backwards-iteration behavior of the iterator,
137  * and not its normal iteration behavior.&nbsp; This is generally only used in situations
138  * where the automatically-generated backwards-iteration behavior doesn't produce
139  * satisfactory results and must be supplemented with extra client-specified rules.</td>
140  * </tr>
141  * <tr>
142  * <td width="6%"><em>(all others)</em></td>
143  * <td width="94%">All other characters are treated as literal characters, which must match
144  * the corresponding character(s) in the text exactly.</td>
145  * </tr>
146  * </table>
147  * </blockquote>
148  *
149  * <p>Within a [] expression, a number of other special characters can be used to specify
150  * groups of characters:</p>
151  *
152  * <blockquote>
153  * <table border="1" width="100%">
154  * <tr>
155  * <td width="6%">-</td>
156  * <td width="94%">Specifies a range of matching characters.&nbsp; For example
157  * &quot;[a-p]&quot; matches all lowercase Latin letters from a to p (inclusive).&nbsp; The -
158  * sign specifies ranges of continuous Unicode numeric values, not ranges of characters in a
159  * language's alphabetical order: &quot;[a-z]&quot; doesn't include capital letters, nor does
160  * it include accented letters such as a-umlaut.</td>
161  * </tr>
162  * <tr>
163  * <td width="6%">^</td>
164  * <td width="94%">Inverts the expression. All characters the expression includes are
165  * excluded, and vice versa. (i.e., it has the effect of saying "all Unicode characters
166  * except...") This character only has its special meaning when it's the first character
167  * in the [] expression. (Generally, you only see the ^ character inside a nested []
168  * expression used in conjunction with the syntax below.)</td>
169  * </tr>
170  * <tr>
171  * <td width="6%"><em>(all others)</em></td>
172  * <td width="94%">All other characters are treated as literal characters.&nbsp; (For
173  * example, &quot;[aeiou]&quot; specifies just the letters a, e, i, o, and u.)</td>
174  * </tr>
175  * </table>
176  * </blockquote>
177  *
178  * <p>[] expressions can nest. There are some other characters that have special meaning only
179  * when used in conjunction with nester [] expressions:</p>
180  *
181  * <blockquote>
182  * <table border="1" width="100%">
183  * <tr>
184  * <td width="6%">::</td>
185  * <td width="94%">Within a nested [] expression, a pair of colons containing a one- or
186  * two-letter code matches all characters in the corresponding Unicode category.&nbsp;
187  * The :: expression has to be the only thing inside the [] expression. The two-letter codes
188  * are the same as the two-letter codes in the Unicode database (for example,
189  * &quot;[[:Sc:][:Sm:]]&quot; matches all currency symbols and all math symbols).&nbsp;
190  * Specifying a one-letter code is the same as specifying all two-letter codes that begin
191  * with that letter (for example, &quot;[[:L:]]&quot; matches all letters, and is equivalent
192  * to &quot;[[:Lu:][:Ll:][:Lo:][:Lm:][:Lt:]]&quot;).&nbsp; Anything other than a valid
193  * two-letter Unicode category code or a single letter that begins a valide Unicode category
194  * code is illegal within the colons.</td>
195  * </tr>
196  * <tr>
197  * <td width="6%">|</td>
198  * <td width="94%">Two nested [] expressions juxtaposed or separated only by a | character
199  * are merged together into a single [] expression matching all the characters in either
200  * of the original [] expressions. (e.g., "[[ab][bc]]" is equivalent to "[abc]", and so
201  * is "[[ab]|[bc]]". <b>NOTE:</b> "[ab][bc]" is NOT the same thing as "[[ab][bc]]".
202  * The first expression will match two characters: an a or b followed by either another
203  * b or a c. The second expression will match a single character, which may be a, b, or c.
204  * The nesting is <em>required</em> for the expressions to merge together.</td>
205  * </tr>
206  * <tr>
207  * <td width="6%">&</td>
208  * <td width="94%">Two nested [] expressions with only & between them will match any
209  * character that appears in both nested [] expressions (this is a set intersection).
210  * (e.g., "[[ab]&[bc]]" will only match the letter b.)</td>
211  * </tr>
212  * <tr>
213  * <td width="6%">-</td>
214  * <td width="94%">Two nested [] expressions with - between them will match any
215  * character that appears in the first nested [] expression <em>but not</em> the
216  * second one (this is an asymmetrical set difference). (e.g., "[[:Sc:]-[$]]"
217  * matches any currency symbol except the dollar sign. "[[ab]-[bc]] will match
218  * only the letter a. This has exactly the same effect as "[[ab]&[^bc]]".)</td>
219  * </tr>
220  *
221  * <p>For a more complete explanation, see <a
222  * HREF="http://icu.sourceforge.net/docs/papers/text_boundary_analysis_in_java/">http://icu.sourceforge.net/docs/papers/text_boundary_analysis_in_java/</a>.
223  * &nbsp; For examples, see the resource data (which is annotated).</p>
224  *
225  * @author Richard Gillam
226  * @internal ICU 2.0
227  */

228 public class RuleBasedBreakIterator_Old extends RuleBasedBreakIterator {
229
230     /**
231      * A token used as a character-category value to identify ignore characters
232      * @stable ICU 2.0
233      */

234     protected static final byte IGNORE = -1;
235
236     /**
237      * Special variable used to define ignore characters
238      * @stable ICU 2.0
239      */

240     private static final String JavaDoc IGNORE_VAR = "_ignore_";
241
242     /**
243      * The state number of the starting state
244      */

245     private static final short START_STATE = 1;
246
247     /**
248      * The state-transition value indicating "stop"
249      */

250     private static final short STOP_STATE = 0;
251
252     /**
253      * The textual description this iterator was created from
254      */

255     private String JavaDoc description;
256
257     /**
258      * A table that indexes from character values to character category numbers
259      */

260     private CompactByteArray charCategoryTable = null;
261
262     /**
263      * The table of state transitions used for forward iteration
264      */

265     private short[] stateTable = null;
266
267     /**
268      * The table of state transitions used to sync up the iterator with the
269      * text in backwards and random-access iteration
270      */

271     private short[] backwardsStateTable = null;
272
273     /**
274      * A list of flags indicating which states in the state table are accepting
275      * ("end") states
276      */

277     private boolean[] endStates = null;
278
279     /**
280      * A list of flags indicating which states in the state table are
281      * lookahead states (states which turn lookahead on and off)
282      */

283     private boolean[] lookaheadStates = null;
284
285     /**
286      * The number of character categories (and, thus, the number of columns in
287      * the state tables)
288      */

289     private int numCategories;
290
291     /**
292      * The character iterator through which this BreakIterator accesses the text
293      */

294     private CharacterIterator JavaDoc text = null;
295
296     //=======================================================================
297
// constructors
298
//=======================================================================
299

300     /**
301      * Constructs a RuleBasedBreakIterator_Old according to the description
302      * provided. If the description is malformed, throws an
303      * IllegalArgumentException. Normally, instead of constructing a
304      * RuleBasedBreakIterator_Old directory, you'll use the factory methods
305      * on BreakIterator to create one indirectly from a description
306      * in the framework's resource files. You'd use this when you want
307      * special behavior not provided by the built-in iterators.
308      * @stable ICU 2.0
309      */

310     public RuleBasedBreakIterator_Old(String JavaDoc description) {
311 //System.out.println(">>>RBBI constructor");
312
this.description = description;
313
314         // the actual work is done by the Builder class
315
Builder builder = makeBuilder();
316         builder.buildBreakIterator();
317 //System.out.println("<<<RBBI constructor");
318
}
319
320     /**
321      * Creates a Builder.
322      * @stable ICU 2.0
323      */

324     protected Builder makeBuilder() {
325         return new Builder();
326     }
327
328     //=======================================================================
329
// boilerplate
330
//=======================================================================
331
/**
332      * Clones this iterator.
333      * @return A newly-constructed RuleBasedBreakIterator_Old with the same
334      * behavior as this one.
335      * @stable ICU 2.0
336      */

337     public Object JavaDoc clone()
338     {
339         RuleBasedBreakIterator_Old result = (RuleBasedBreakIterator_Old) super.clone();
340         if (text != null) {
341             result.text = (CharacterIterator JavaDoc) text.clone();
342         }
343         return result;
344     }
345
346     /**
347      * Returns true if both BreakIterators are of the same class, have the same
348      * rules, and iterate over the same text.
349      * @stable ICU 2.0
350      */

351     public boolean equals(Object JavaDoc that) {
352         try {
353             RuleBasedBreakIterator_Old other = (RuleBasedBreakIterator_Old) that;
354             if (!description.equals(other.description)) {
355                 return false;
356             }
357             return getText().equals(other.getText());
358         }
359         catch(ClassCastException JavaDoc e) {
360             return false;
361         }
362     }
363
364     /**
365      * Returns the description used to create this iterator
366      * @stable ICU 2.0
367      */

368     public String JavaDoc toString() {
369         return description;
370     }
371
372     /**
373      * Compute a hashcode for this BreakIterator
374      * @return A hash code
375      * @stable ICU 2.0
376      */

377     public int hashCode()
378     {
379         return description.hashCode();
380     }
381
382 /**
383  * Dump out a more-or-less human readable form of the
384  * complete state table and character class definitions
385  * @internal
386  */

387     ///CLOVER:OFF
388
public void debugDumpTables() {
389     System.out.println("Character Classes:");
390     int currentCharClass = 257;
391     int startCurrentRange = 0;
392     int initialStringLength = 0;
393
394     StringBuffer JavaDoc[] charClassRanges = new StringBuffer JavaDoc[numCategories];
395     for (int i=0; i<numCategories; i++) {
396         charClassRanges[i] = new StringBuffer JavaDoc();
397     }
398
399     for (int i = 0; i < 0xffff; i++) {
400         if ((int)charCategoryTable.elementAt((char)i) != currentCharClass) {
401             if (currentCharClass != 257) {
402                 // Complete the output of the previous range.
403
if (i != startCurrentRange+1) {
404                     charClassRanges[currentCharClass].append("-"+ Integer.toHexString(i-1));
405                 }
406                 if (charClassRanges[currentCharClass].length() % 72 < initialStringLength % 72) {
407                     charClassRanges[currentCharClass].append("\n ");
408                 }
409             }
410
411             // Output the start of the new range.
412
currentCharClass = (int)charCategoryTable.elementAt((char)i);
413             startCurrentRange = i;
414             initialStringLength = charClassRanges[currentCharClass].length();
415             if (charClassRanges[currentCharClass].length() > 0)
416                 charClassRanges[currentCharClass].append(", ");
417             charClassRanges[currentCharClass].append(Integer.toHexString(i));
418         }
419     }
420
421     for (int i=0; i<numCategories; i++) {
422         System.out.println(i + ": " + charClassRanges[i]);
423     }
424
425
426     System.out.println("\n\nState Table. *: end state %: look ahead state");
427     System.out.print("C:\t");
428     for (int i = 0; i < numCategories; i++)
429         System.out.print(Integer.toString(i) + "\t");
430     System.out.println(); System.out.print("=================================================");
431     for (int i = 0; i < stateTable.length; i++) {
432         if (i % numCategories == 0) {
433             System.out.println();
434             if (endStates[i / numCategories])
435                 System.out.print("*");
436             else
437                 System.out.print(" ");
438             if (lookaheadStates[i / numCategories]) {
439                 System.out.print("%");
440             }
441             else
442                 System.out.print(" ");
443             System.out.print(Integer.toString(i / numCategories) + ":\t");
444         }
445         if (stateTable[i] == 0) {
446             System.out.print(".\t");
447         } else {
448             System.out.print(Integer.toString(stateTable[i]) + "\t");
449         }
450     }
451     System.out.println();
452 }
453     ///CLOVER:ON
454

455     ///CLOVER:OFF
456
// DELETE ME BEFORE RELEASE!!!
457
/**
458      * Write the RBBI runtime engine state transition tables to a file.
459      * Formerly used to export the tables to the C++ RBBI Implementation.
460      * Now obsolete, as C++ builds its own tables.
461      * @internal
462      */

463 public void writeTablesToFile(FileOutputStream file, boolean littleEndian) throws IOException {
464     // NOTE: The format being written here is designed to be compatible with
465
// the ICU udata interfaces and may not be useful for much else
466
DataOutputStream out = new DataOutputStream(file);
467     
468 // --- write the file header ---
469
byte[] comment = "Copyright (C) 1999, International Business Machines Corp. and others. All Rights Reserved.".getBytes("US-ASCII");
470 // write the size of the header (rounded up to the next 16-byte boundary)
471
short headerSize = (short)(comment.length + 1 // length of comment
472
+ 24); // size of static header data
473
short realHeaderSize = (short)(headerSize + ((headerSize % 16 == 0) ? 0 : 16 - (headerSize % 16)));
474     writeSwappedShort(realHeaderSize, out, littleEndian);
475 // write magic byte values
476
out.write(0xda);
477     out.write(0x27);
478 // write size of core header data
479
writeSwappedShort((short)20, out, littleEndian);
480 // write reserved bytes
481
writeSwappedShort((short)0, out, littleEndian);
482     
483 // write flag indicating whether we're big-endian
484
if (littleEndian) {
485         out.write(0);
486     } else {
487         out.write(1);
488     }
489     
490 // write character set family code (0 means ASCII)
491
out.write(0);
492 // write size of UChar in this file
493
out.write(2);
494 // write reserved byte
495
out.write(0);
496 // write data format identifier (this is an array of bytes in ICU, so the value is NOT swapped!)
497
out.writeInt(0x42524b53); // ("BRKS")
498
// write file format version number (NOT swapped!)
499
out.writeInt(0);
500 // write data version number (NOT swapped!)
501
out.writeInt(0);
502 // write copyright notice
503
out.write(comment);
504     out.write(0);
505 // fill in padding bytes
506
while (headerSize < realHeaderSize) {
507         out.write(0);
508         ++headerSize;
509     }
510     
511 // --- write index to the file ---
512
// write the number of columns in the state table
513
writeSwappedInt(numCategories, out, littleEndian);
514     int fileEnd = 36;
515 // write the location in the file of the BreakIterator description string
516
writeSwappedInt(fileEnd, out, littleEndian);
517     fileEnd += (description.length() + 1) * 2;
518     fileEnd += (fileEnd % 4 == 0) ? 0 : 4 - (fileEnd % 4);
519 // write the location of the character category table's index
520
writeSwappedInt(fileEnd, out, littleEndian);
521     fileEnd += charCategoryTable.getIndexArray().length * 2;
522     fileEnd += (fileEnd % 4 == 0) ? 0 : 4 - (fileEnd % 4);
523 // write the location of the character category table's values array
524
writeSwappedInt(fileEnd, out, littleEndian);
525     fileEnd += charCategoryTable.getValueArray().length;
526     fileEnd += (fileEnd % 4 == 0) ? 0 : 4 - (fileEnd % 4);
527 // write the location of the forward state table
528
writeSwappedInt(fileEnd, out, littleEndian);
529     fileEnd += stateTable.length * 2;
530     fileEnd += (fileEnd % 4 == 0) ? 0 : 4 - (fileEnd % 4);
531 // write the location of the backward state table
532
writeSwappedInt(fileEnd, out, littleEndian);
533     fileEnd += backwardsStateTable.length * 2;
534     fileEnd += (fileEnd % 4 == 0) ? 0 : 4 - (fileEnd % 4);
535 // write the location of the endStates flags
536
writeSwappedInt(fileEnd, out, littleEndian);
537     fileEnd += endStates.length;
538     fileEnd += (fileEnd % 4 == 0) ? 0 : 4 - (fileEnd % 4);
539 // write the location of the lookaheadStates flags
540
writeSwappedInt(fileEnd, out, littleEndian);
541     fileEnd += lookaheadStates.length;
542     fileEnd += (fileEnd % 4 == 0) ? 0 : 4 - (fileEnd % 4);
543 // write the length of the file
544
writeSwappedInt(fileEnd, out, littleEndian);
545     
546 // --- write the actual data ---
547
// write description string
548
for (int i = 0; i < description.length(); i++)
549         writeSwappedShort((short)description.charAt(i), out, littleEndian);
550     out.writeShort(0);
551     if ((description.length() + 1) % 2 == 1)
552         out.writeShort(0);
553 // write character category table
554
char[] temp1 = charCategoryTable.getIndexArray();
555     for (int i = 0; i < temp1.length; i++)
556         writeSwappedShort((short)temp1[i], out, littleEndian);
557     if (temp1.length % 2 == 1)
558         out.writeShort(0);
559     byte[] temp2 = charCategoryTable.getValueArray();
560     out.write(temp2);
561     switch (temp2.length % 4) {
562         case 1: out.write(0);
563         case 2: out.write(0);
564         case 3: out.write(0);
565         default: break;
566     }
567 // write the state transition tables
568
for (int i = 0; i < stateTable.length; i++)
569         writeSwappedShort(stateTable[i], out, littleEndian);
570     if (stateTable.length % 2 == 1)
571         out.writeShort(0);
572     for (int i = 0; i < backwardsStateTable.length; i++)
573         writeSwappedShort(backwardsStateTable[i], out, littleEndian);
574     if (backwardsStateTable.length % 2 == 1)
575         out.writeShort(0);
576 // write the flag arrays
577
for (int i = 0; i < endStates.length; i++)
578         out.writeBoolean(endStates[i]);
579     switch (endStates.length % 4) {
580         case 1: out.write(0);
581         case 2: out.write(0);
582         case 3: out.write(0);
583         default: break;
584     }
585     for (int i = 0; i < lookaheadStates.length; i++)
586         out.writeBoolean(lookaheadStates[i]);
587     switch (lookaheadStates.length % 4) {
588         case 1: out.write(0);
589         case 2: out.write(0);
590         case 3: out.write(0);
591         default: break;
592     }
593 }
594
595 /**
596  * @internal
597  */

598 protected void writeSwappedShort(short x, DataOutputStream out, boolean littleEndian)
599 throws IOException{
600     if (littleEndian) {
601         out.write((byte)(x & 0xff));
602         out.write((byte)((x >> 8) & 0xff));
603     }
604     else {
605         out.write((byte)((x >> 8) & 0xff));
606         out.write((byte)(x & 0xff));
607     }
608 }
609
610 /**
611  * @internal
612  */

613 protected void writeSwappedInt(int x, DataOutputStream out, boolean littleEndian)
614 throws IOException {
615     if (littleEndian) {
616         out.write((byte)(x & 0xff));
617         out.write((byte)((x >> 8) & 0xff));
618         out.write((byte)((x >> 16) & 0xff));
619         out.write((byte)((x >> 24) & 0xff));
620     }
621     else {
622         out.write((byte)((x >> 24) & 0xff));
623         out.write((byte)((x >> 16) & 0xff));
624         out.write((byte)((x >> 8) & 0xff));
625         out.write((byte)(x & 0xff));
626     }
627 }
628     ///CLOVER:ON
629

630     //=======================================================================
631
// BreakIterator overrides
632
//=======================================================================
633

634     /**
635      * Sets the current iteration position to the beginning of the text.
636      * (i.e., the CharacterIterator's starting offset).
637      * @return The offset of the beginning of the text.
638      * @stable ICU 2.0
639      */

640     public int first() {
641         CharacterIterator JavaDoc t = getText();
642
643         t.first();
644         return t.getIndex();
645     }
646
647     /**
648      * Sets the current iteration position to the end of the text.
649      * (i.e., the CharacterIterator's ending offset).
650      * @return The text's past-the-end offset.
651      * @stable ICU 2.0
652      */

653     public int last() {
654         CharacterIterator JavaDoc t = getText();
655
656         // I'm not sure why, but t.last() returns the offset of the last character,
657
// rather than the past-the-end offset
658
t.setIndex(t.getEndIndex());
659         return t.getIndex();
660     }
661
662     /**
663      * Advances the iterator either forward or backward the specified number of steps.
664      * Negative values move backward, and positive values move forward. This is
665      * equivalent to repeatedly calling next() or previous().
666      * @param n The number of steps to move. The sign indicates the direction
667      * (negative is backwards, and positive is forwards).
668      * @return The character offset of the boundary position n boundaries away from
669      * the current one.
670      * @stable ICU 2.0
671      */

672     public int next(int n) {
673         int result = current();
674         while (n > 0) {
675             result = handleNext();
676             --n;
677         }
678         while (n < 0) {
679             result = previous();
680             ++n;
681         }
682         return result;
683     }
684
685     /**
686      * Advances the iterator to the next boundary position.
687      * @return The position of the first boundary after this one.
688      * @stable ICU 2.0
689      */

690     public int next() {
691         return handleNext();
692     }
693
694     /**
695      * Advances the iterator backwards, to the last boundary preceding this one.
696      * @return The position of the last boundary position preceding this one.
697      * @stable ICU 2.0
698      */

699     public int previous() {
700         // if we're already sitting at the beginning of the text, return DONE
701
CharacterIterator JavaDoc text = getText();
702         if (current() == text.getBeginIndex()) {
703             return BreakIterator.DONE;
704         }
705
706         // set things up. handlePrevious() will back us up to some valid
707
// break position before the current position (we back our internal
708
// iterator up one step to prevent handlePrevious() from returning
709
// the current position), but not necessarily the last one before
710
// where we started
711
int start = current();
712         text.previous();
713         int lastResult = handlePrevious();
714         int result = lastResult;
715
716         // iterate forward from the known break position until we pass our
717
// starting point. The last break position before the starting
718
// point is our return value
719
while (result != BreakIterator.DONE && result < start) {
720             lastResult = result;
721             result = handleNext();
722         }
723
724         // set the current iteration position to be the last break position
725
// before where we started, and then return that value
726
text.setIndex(lastResult);
727         return lastResult;
728     }
729
730     /**
731      * Throw IllegalArgumentException unless begin <= offset < end.
732      * @stable ICU 2.0
733      */

734     protected static final void checkOffset(int offset, CharacterIterator JavaDoc text) {
735         if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
736             throw new IllegalArgumentException JavaDoc("offset out of bounds");
737         }
738     }
739
740     /**
741      * Sets the iterator to refer to the first boundary position following
742      * the specified position.
743      * @param offset The position from which to begin searching for a break position.
744      * @return The position of the first break after the current position.
745      * @stable ICU 2.0
746      */

747     public int following(int offset) {
748         // if the offset passed in is already past the end of the text,
749
// just return DONE
750
CharacterIterator JavaDoc text = getText();
751         if (offset == text.getEndIndex()) {
752             return BreakIterator.DONE;
753         }
754         checkOffset(offset, text);
755
756         // otherwise, set our internal iteration position (temporarily)
757
// to the position passed in. If this is the _beginning_ position,
758
// then we can just use next() to get our return value
759
text.setIndex(offset);
760         if (offset == text.getBeginIndex()) {
761             return handleNext();
762         }
763
764         // otherwise, we have to sync up first. Use handlePrevious() to back
765
// us up to a known break position before the specified position (if
766
// we can determine that the specified position is a break position,
767
// we don't back up at all). This may or may not be the last break
768
// position at or before our starting position. Advance forward
769
// from here until we've passed the starting position. The position
770
// we stop on will be the first break position after the specified one.
771
int result = handlePrevious();
772         while (result != BreakIterator.DONE && result <= offset) {
773             result = handleNext();
774         }
775         return result;
776     }
777
778     /**
779      * Sets the iterator to refer to the last boundary position before the
780      * specified position.
781      * @param offset The position to begin searching for a break from.
782      * @return The position of the last boundary before the starting position.
783      * @stable ICU 2.0
784      */

785     public int preceding(int offset) {
786         // if we start by updating the current iteration position to the
787
// position specified by the caller, we can just use previous()
788
// to carry out this operation
789
CharacterIterator JavaDoc text = getText();
790         checkOffset(offset, text);
791         text.setIndex(offset);
792         return previous();
793     }
794
795     /**
796      * Returns true if the specfied position is a boundary position. As a side
797      * effect, leaves the iterator pointing to the first boundary position at
798      * or after "offset".
799      * @param offset the offset to check.
800      * @return True if "offset" is a boundary position.
801      * @stable ICU 2.0
802      */

803     public boolean isBoundary(int offset) {
804         CharacterIterator JavaDoc text = getText();
805         checkOffset(offset, text);
806         if (offset == text.getBeginIndex()) {
807             return true;
808         }
809
810         // to check whether this is a boundary, we can use following() on the
811
// position before the specified one and return true if the position we
812
// get back is the one the user specified
813
else {
814             return following(offset - 1) == offset;
815         }
816     }
817
818     /**
819      * Returns the current iteration position.
820      * @return The current iteration position.
821      * @stable ICU 2.0
822      */

823     public int current() {
824         return getText().getIndex();
825     }
826
827     
828     /**
829      * Return the status tag from the break rule that determined the most recently
830      * returned break position. The values appear in the rule source
831      * within brackets, {123}, for example. For rules that do not specify a
832      * status, a default value of 0 is returned. If more than one rule applies,
833      * the numerically largest of the possible status values is returned.
834      * <p>
835      * Note that for old style break iterators (implemented by this class), no
836      * status can be declared, and a status of zero is always assumed.
837      * <p>
838      *
839      * @draft ICU 3.0
840      * @provisional This API might change or be removed in a future release.
841      */

842     public int getRuleStatus() {
843         return 0;
844     }
845
846
847
848     /**
849      * Get the status (tag) values from the break rule(s) that determined the most
850      * recently returned break position. The values appear in the rule source
851      * within brackets, {123}, for example. The default status value for rules
852      * that do not explicitly provide one is zero.
853      * <p>
854      * Note that for old style break iterators (implemented by this class), no
855      * status can be declared, and a status of zero is always assumed.
856      * <p>
857      * @draft ICU 3.0
858      * @provisional This API might change or be removed in a future release.
859      */

860      public int getRuleStatusVec(int[] fillInArray) {
861          if (fillInArray != null && fillInArray.length >= 1) {
862              fillInArray[0] = 0;
863          }
864          return 1;
865      }
866
867
868
869     /**
870      * Return a CharacterIterator over the text being analyzed. This version
871      * of this method returns the actual CharacterIterator we're using internally.
872      * Changing the state of this iterator can have undefined consequences. If
873      * you need to change it, clone it first.
874      * @return An iterator over the text being analyzed.
875      * @stable ICU 2.0
876      */

877     public CharacterIterator JavaDoc getText() {
878         // The iterator is initialized pointing to no text at all, so if this
879
// function is called while we're in that state, we have to fudge an
880
// an iterator to return.
881
if (text == null) {
882             text = new StringCharacterIterator("");
883         }
884         return text;
885     }
886
887     /**
888      * Set the iterator to analyze a new piece of text. This function resets
889      * the current iteration position to the beginning of the text.
890      * @param newText An iterator over the text to analyze.
891      * @stable ICU 2.0
892      */

893     public void setText(CharacterIterator JavaDoc newText) {
894         // Test text to see if we need to wrap it in a SafeCharIterator:
895
int end = newText.getEndIndex();
896         newText.setIndex(end);
897         if (newText.getIndex() != end) {
898             // failed - wrap in correct implementation
899
text = new SafeCharIterator(newText);
900         }
901         else {
902             text = newText;
903         }
904         text.first();
905     }
906
907
908     //=======================================================================
909
// implementation
910
//=======================================================================
911

912     /**
913      * This method is the actual implementation of the next() method. All iteration
914      * vectors through here. This method initializes the state machine to state 1
915      * and advances through the text character by character until we reach the end
916      * of the text or the state machine transitions to state 0. We update our return
917      * value every time the state machine passes through a possible end state.
918      * @stable ICU 2.0
919      */

920     protected int handleNext() {
921         // if we're already at the end of the text, return DONE.
922
CharacterIterator JavaDoc text = getText();
923         if (text.getIndex() == text.getEndIndex()) {
924             return BreakIterator.DONE;
925         }
926
927         // no matter what, we always advance at least one character forward
928
int result = text.getIndex() + 1;
929         int lookaheadResult = 0;
930
931         // begin in state 1
932
int state = START_STATE;
933         int category;
934         char c = text.current();
935         char lastC = c;
936         int lastCPos = 0;
937
938         // if the first character in this segment is an ignore character (which can happen
939
// when it's either the first character in the file or follows a mandatory break
940
// character), and the first non-ignore character isn't a glue character, always
941
// put a break between the ignore characters and the rest of the text
942
if (lookupCategory(c) == IGNORE) {
943             while (lookupCategory(c) == IGNORE)
944                 c = text.next();
945
946             if (Character.getType(c) == Character.NON_SPACING_MARK || Character.getType(c)
947                     == Character.ENCLOSING_MARK) {
948                 return text.getIndex();
949             }
950         }
951
952         // loop until we reach the end of the text or transition to state 0
953
while (c != CharacterIterator.DONE && state != STOP_STATE) {
954
955             // look up the current character's character category (which tells us
956
// which column in the state table to look at)
957
category = lookupCategory(c);
958
959             // if the character isn't an ignore character, look up a state
960
// transition in the state table
961
if (category != IGNORE) {
962                 state = lookupState(state, category);
963             }
964
965             // if the state we've just transitioned to is a lookahead state,
966
// (but not also an end state), save its position. If it's
967
// both a lookahead state and an end state, update the break position
968
// to the last saved lookup-state position
969
if (lookaheadStates[state]) {
970                 if (endStates[state]) {
971                     if (lookaheadResult > 0) {
972                         result = lookaheadResult;
973                     }
974                     else {
975                         result = text.getIndex() + 1;
976                     }
977                 }
978                 else {
979                     lookaheadResult = text.getIndex() + 1;
980                 }
981             }
982
983             // otherwise, if the state we've just transitioned to is an accepting
984
// state, update the break position to be the current iteration position
985
else {
986                 if (endStates[state]) {
987                     result = text.getIndex() + 1;
988                 }
989             }
990
991             // keep track of the last "real" character we saw. If this character isn't an
992
// ignore character, take note of it and its position in the text
993
if (category != IGNORE && state != STOP_STATE) {
994                 lastC = c;
995                 lastCPos = text.getIndex();
996             }
997             c = text.next();
998         }
999
1000        // if we've run off the end of the text, and the very last character took us into
1001
// a lookahead state, advance the break position to the lookahead position
1002
// (the theory here is that if there are no characters at all after the lookahead
1003
// position, that always matches the lookahead criteria)
1004
if (c == CharacterIterator.DONE && lookaheadResult == text.getEndIndex()) {
1005            result = lookaheadResult;
1006        }
1007
1008        // if the last character we saw before the one that took us into the stop state
1009
// was a mandatory breaking character, then the break position goes right after it
1010
// (this is here so that breaks come before, rather than after, a string of
1011
// ignore characters when they follow a mandatory break character)
1012
else if ("\n\r\f\u2028\u2029".indexOf(lastC) != -1) {
1013            result = lastCPos + 1;
1014        }
1015
1016        text.setIndex(result);
1017        return result;
1018    }
1019
1020    /**
1021     * This method backs the iterator back up to a "safe position" in the text.
1022     * This is a position that we know, without any context, must be a break position.
1023     * The various calling methods then iterate forward from this safe position to
1024     * the appropriate position to return. (For more information, see the description
1025     * of buildBackwardsStateTable() in RuleBasedBreakIterator_Old.Builder.)
1026     * @stable ICU 2.0
1027     */

1028    protected int handlePrevious() {
1029        CharacterIterator JavaDoc text = getText();
1030        int state = START_STATE;
1031        int category = 0;
1032        int lastCategory = 0;
1033        char c = text.current();
1034
1035        // loop until we reach the beginning of the text or transition to state 0
1036
while (c != CharacterIterator.DONE && state != STOP_STATE) {
1037//System.out.print(" " + text.getIndex());
1038

1039            // save the last character's category and look up the current
1040
// character's category
1041
lastCategory = category;
1042            category = lookupCategory(c);
1043
1044            // if the current character isn't an ignore character, look up a
1045
// state transition in the backwards state table
1046
if (category != IGNORE) {
1047                state = lookupBackwardState(state, category);
1048            }
1049
1050            // then advance one character backwards
1051
c = text.previous();
1052        }
1053
1054        // if we didn't march off the beginning of the text, we're either one or two
1055
// positions away from the real break position. (One because of the call to
1056
// previous() at the end of the loop above, and another because the character
1057
// that takes us into the stop state will always be the character BEFORE
1058
// the break position.)
1059
if (c != CharacterIterator.DONE) {
1060            if (lastCategory != IGNORE) {
1061                text.setIndex(text.getIndex() + 2);
1062            }
1063            else {
1064                text.next();
1065            }
1066        }
1067//System.out.print(",");
1068
return text.getIndex();
1069    }
1070
1071//static int visitedChars = 0;
1072
/**
1073     * Looks up a character's category (i.e., its category for breaking purposes,
1074     * not its Unicode category)
1075     * @internal
1076     */

1077    protected int lookupCategory(char c) {
1078//++visitedChars;
1079
return charCategoryTable.elementAt(c);
1080    }
1081
1082/*
1083static void printVisitedCharCount() {
1084System.out.println("Total number of characters visited = " + visitedChars);
1085visitedChars = 0;
1086}
1087*/

1088
1089    /**
1090     * Given a current state and a character category, looks up the
1091     * next state to transition to in the state table.
1092     * @internal
1093     */

1094    protected int lookupState(int state, int category) {
1095        return stateTable[state * numCategories + category];
1096    }
1097
1098    /**
1099     * Given a current state and a character category, looks up the
1100     * next state to transition to in the backwards state table.
1101     * @internal
1102     */

1103    protected int lookupBackwardState(int state, int category) {
1104        return backwardsStateTable[state * numCategories + category];
1105    }
1106
1107    /**
1108     * This is a helper function for computing the intersection of
1109     * two <code>UnicodeSet</code> objects.
1110     * @param a, b the two <code>UnicodeSet</code>s to intersect
1111     * @return a new <code>UnicodeSet</code> which is the intersection of a and b
1112     */

1113    private static UnicodeSet intersection(UnicodeSet a, UnicodeSet b)
1114    {
1115        UnicodeSet result = new UnicodeSet(a);
1116
1117        result.retainAll(b);
1118
1119        return result;
1120    }
1121
1122    //=======================================================================
1123
// RuleBasedBreakIterator.Builder
1124
//=======================================================================
1125
/**
1126     * The Builder class has the job of constructing a RuleBasedBreakIterator_Old from a
1127     * textual description. A Builder is constructed by RuleBasedBreakIterator_Old's
1128     * constructor, which uses it to construct the iterator itself and then throws it
1129     * away.
1130     * <p>The construction logic is separated out into its own class for two primary
1131     * reasons:
1132     * <ul><li>The construction logic is quite sophisticated and large. Separating it
1133     * out into its own class means the code must only be loaded into memory while a
1134     * RuleBasedBreakIterator_Old is being constructed, and can be purged after that.
1135     * <li>There is a fair amount of state that must be maintained throughout the
1136     * construction process that is not needed by the iterator after construction.
1137     * Separating this state out into another class prevents all of the functions that
1138     * construct the iterator from having to have really long parameter lists,
1139     * (hopefully) contributing to readability and maintainability.</ul>
1140     * <p>It'd be really nice if this could be an independent class rather than an
1141     * inner class, because that would shorten the source file considerably, but
1142     * making Builder an inner class of RuleBasedBreakIterator_Old allows it direct access
1143     * to RuleBasedBreakIterator_Old's private members, which saves us from having to
1144     * provide some kind of "back door" to the Builder class that could then also be
1145     * used by other classes.
1146     * @internal
1147     */

1148    protected class Builder {
1149        /**
1150         * A temporary holding place used for calculating the character categories.
1151         * This object contains UnicodeSet objects.
1152         * @internal
1153         */

1154        protected Vector JavaDoc categories = null;
1155
1156        /**
1157         * A table used to map parts of regexp text to lists of character categories,
1158         * rather than having to figure them out from scratch each time
1159         * @internal
1160         */

1161        protected Hashtable JavaDoc expressions = null;
1162
1163        /**
1164         * A temporary holding place for the list of ignore characters
1165         * @internal
1166         */

1167        protected UnicodeSet ignoreChars = null;
1168
1169        /**
1170         * A temporary holding place where the forward state table is built
1171         * @internal
1172         */

1173        protected Vector JavaDoc tempStateTable = null;
1174
1175        /**
1176         * A list of all the states that have to be filled in with transitions to the
1177         * next state that is created. Used when building the state table from the
1178         * regular expressions.
1179         * @internal
1180         */

1181        protected Vector JavaDoc decisionPointList = null;
1182
1183        /**
1184         * A stack for holding decision point lists. This is used to handle nested
1185         * parentheses and braces in regexps.
1186         * @internal
1187         */

1188        protected Stack JavaDoc decisionPointStack = null;
1189
1190        /**
1191         * A list of states that loop back on themselves. Used to handle .*?
1192         * @internal
1193         */

1194        protected Vector JavaDoc loopingStates = null;
1195
1196        /**
1197         * Looping states actually have to be backfilled later in the process
1198         * than everything else. This is where a the list of states to backfill
1199         * is accumulated. This is also used to handle .*?
1200         * @internal
1201         */

1202        protected Vector JavaDoc statesToBackfill = null;
1203
1204        /**
1205         * A list mapping pairs of state numbers for states that are to be combined
1206         * to the state number of the state representing their combination. Used
1207         * in the process of making the state table deterministic to prevent
1208         * infinite recursion.
1209         * @internal
1210         */

1211        protected Vector JavaDoc mergeList = null;
1212
1213        /**
1214         * A flag that is used to indicate when the list of looping states can
1215         * be reset.
1216         * @internal
1217         */

1218        protected boolean clearLoopingStates = false;
1219
1220        /**
1221         * A bit mask used to indicate a bit in the table's flags column that marks a
1222         * state as an accepting state.
1223         * @internal
1224         */

1225        protected static final int END_STATE_FLAG = 0x8000;
1226
1227        /**
1228         * A bit mask used to indicate a bit in the table's flags column that marks a
1229         * state as one the builder shouldn't loop to any looping states
1230         * @internal
1231         */

1232        protected static final int DONT_LOOP_FLAG = 0x4000;
1233
1234        /**
1235         * A bit mask used to indicate a bit in the table's flags column that marks a
1236         * state as a lookahead state.
1237         * @internal
1238         */

1239        protected static final int LOOKAHEAD_STATE_FLAG = 0x2000;
1240
1241        /**
1242         * A bit mask representing the union of the mask values listed above.
1243         * Used for clearing or masking off the flag bits.
1244         * @internal
1245         */

1246        protected static final int ALL_FLAGS = END_STATE_FLAG | LOOKAHEAD_STATE_FLAG
1247                | DONT_LOOP_FLAG;
1248
1249        /**
1250         * No special construction is required for the Builder.
1251         * @internal
1252         */

1253        public Builder() {
1254        }
1255
1256        /**
1257         * This is the main function for setting up the BreakIterator's tables. It
1258         * just vectors different parts of the job off to other functions.
1259         * @internal
1260         */

1261        public void buildBreakIterator() {
1262            Vector JavaDoc tempRuleList = buildRuleList(description);
1263            buildCharCategories(tempRuleList);
1264            buildStateTable(tempRuleList);
1265            buildBackwardsStateTable(tempRuleList);
1266        }
1267
1268        /**
1269         * Thus function has three main purposes:
1270         * <ul><li>Perform general syntax checking on the description, so the rest of the
1271         * build code can assume that it's parsing a legal description.
1272         * <li>Split the description into separate rules
1273         * <li>Perform variable-name substitutions (so that no one else sees variable names)
1274         * </ul>
1275         */

1276        private Vector JavaDoc buildRuleList(String JavaDoc description) {
1277            // invariants:
1278
// - parentheses must be balanced: ()[]{}<>
1279
// - nothing can be nested inside {}
1280
// - nothing can be nested inside [] except more []s
1281
// - pairs of ()[]{}<> must not be empty
1282
// - ; can only occur at the outer level
1283
// - | can only appear inside ()
1284
// - only one = or / can occur in a single rule
1285
// - = and / cannot both occur in the same rule
1286
// - the right-hand side of a = expression must be enclosed in [] or ()
1287
// - * may not occur at the beginning of a rule, nor may it follow
1288
// =, /, (, (, |, }, ;, or *
1289
// - ? may only follow *
1290
// - the rule list must contain at least one / rule
1291
// - no rule may be empty
1292
// - all printing characters in the ASCII range except letters and digits
1293
// are reserved and must be preceded by \
1294
// - ! may only occur at the beginning of a rule
1295

1296            // set up a vector to contain the broken-up description (each entry in the
1297
// vector is a separate rule) and a stack for keeping track of opening
1298
// punctuation
1299
Vector JavaDoc tempRuleList = new Vector JavaDoc();
1300            Stack JavaDoc parenStack = new Stack JavaDoc();
1301
1302            int p = 0;
1303            int ruleStart = 0;
1304            char c = '\u0000';
1305            char lastC = '\u0000';
1306            char lastOpen = '\u0000';
1307            boolean haveEquals = false;
1308            boolean havePipe = false;
1309            boolean sawVarName = false;
1310            boolean sawIllegalChar = false;
1311            int illegalCharPos = 0;
1312            final String JavaDoc charsThatCantPrecedeAsterisk = "=/<(|>*+?;\u0000";
1313
1314            // if the description doesn't end with a semicolon, tack a semicolon onto the end
1315
if (description.length() != 0 && description.charAt(description.length() - 1) != ';') {
1316                description = description + ";";
1317            }
1318
1319            // for each character, do...
1320
while (p < description.length()) {
1321                c = description.charAt(p);
1322                switch (c) {
1323                    // if the character is opening punctuation, verify that no nesting
1324
// rules are broken, and push the character onto the stack
1325
case '{':
1326                    case '[':
1327                    case '(':
1328                        if (lastOpen == '{') {
1329                            error("Can't nest brackets inside {}", p, description);
1330                        }
1331                        if (lastOpen == '[' && c != '[') {
1332                            error("Can't nest anything in [] but []", p, description);
1333                        }
1334
1335                        // if we see { anywhere except on the left-hand side of =,
1336
// we must be seeing a variable name that was never defined
1337
if (c == '{' && (haveEquals || havePipe)) {
1338                            error("Unknown variable name", p, description);
1339                        }
1340
1341                        lastOpen = c;
1342                        parenStack.push(new Character JavaDoc(c));
1343                        if (c == '{') {
1344                            sawVarName = true;
1345                        }
1346                        break;
1347
1348                    // if the character is closing punctuation, verify that it matches the
1349
// last opening punctuation we saw, and that the brackets contain
1350
// something, then pop the stack
1351
case '}':
1352                    case ']':
1353                    case ')':
1354                        char expectedClose = '\u0000';
1355                        switch (lastOpen) {
1356                            case '{':
1357                                expectedClose = '}';
1358                                break;
1359                            case '[':
1360                                expectedClose = ']';
1361                                break;
1362                            case '(':
1363                                expectedClose = ')';
1364                                break;
1365                        }
1366                        if (c != expectedClose) {
1367                            error("Unbalanced parentheses", p, description);
1368                        }
1369                        if (lastC == lastOpen) {
1370                            error("Parens don't contain anything", p, description);
1371                        }
1372                        parenStack.pop();
1373                        if (!parenStack.empty()) {
1374                            lastOpen = ((Character JavaDoc)(parenStack.peek())).charValue();
1375                        }
1376                        else {
1377                            lastOpen = '\u0000';
1378                        }
1379
1380                        break;
1381
1382                    // if the character is an asterisk, make sure it occurs in a place
1383
// where an asterisk can legally go
1384
case '*': case '+': case '?':
1385                        if (charsThatCantPrecedeAsterisk.indexOf(lastC) != -1
1386                                && (c != '?' || lastC != '*')) {
1387                            error("Misplaced *, +, or ?", p, description);
1388                        }
1389                        break;
1390
1391                    // if the character is an equals sign, make sure we haven't seen another
1392
// equals sign or a slash yet
1393
case '=':
1394                        if (haveEquals || havePipe) {
1395                            error("More than one = or / in rule", p, description);
1396                        }
1397                        haveEquals = true;
1398                        sawIllegalChar = false;
1399                        break;
1400
1401                    // if the character is a slash, make sure we haven't seen another slash
1402
// or an equals sign yet
1403
case '/':
1404                        if (haveEquals || havePipe) {
1405                            error("More than one = or / in rule", p, description);
1406                        }
1407                        if (sawVarName) {
1408                            error("Unknown variable name", p, description);
1409                        }
1410                        havePipe = true;
1411                        break;
1412
1413                    // if the character is an exclamation point, make sure it occurs only
1414
// at the beginning of a rule
1415
case '!':
1416                        if (lastC != ';' && lastC != '\u0000') {
1417                            error("! can only occur at the beginning of a rule", p, description);
1418                        }
1419                        break;
1420
1421                    // if the character is a backslash, skip the character that follows it
1422
// (it'll get treated as a literal character)
1423
case '\\':
1424                        ++p;
1425                        break;
1426
1427                    // we don't have to do anything special on a period
1428
case '.':
1429                        break;
1430
1431                    // if the character is a syntax character that can only occur
1432
// inside [], make sure that it does in fact only occur inside []
1433
// (or in a variable name)
1434
case '^':
1435                    case '-':
1436                    case ':':
1437                    case '&':
1438                        if (lastOpen != '[' && lastOpen != '{' && !sawIllegalChar) {
1439                            sawIllegalChar = true;
1440                            illegalCharPos = p;
1441                        }
1442                        break;
1443
1444                    // if the character is a semicolon, do the following...
1445
case ';':
1446                        // if we saw any illegal characters along the way, throw
1447
// an error
1448
if (sawIllegalChar) {
1449                            error("Illegal character", illegalCharPos, description);
1450                        }
1451
1452                        // make sure the rule contains something and that there are no
1453
// unbalanced parentheses or brackets
1454
if (lastC == ';' || lastC == '\u0000') {
1455                            error("Empty rule", p, description);
1456                        }
1457                        if (!parenStack.empty()) {
1458                            error("Unbalanced parenheses", p, description);
1459                        }
1460
1461                        if (parenStack.empty()) {
1462                            // if the rule contained an = sign, call processSubstitution()
1463
// to replace the substitution name with the substitution text
1464
// wherever it appears in the description
1465
if (haveEquals) {
1466                                description = processSubstitution(description.substring(ruleStart,
1467                                                p), description, p + 1);
1468                            }
1469                            else {
1470                                // otherwise, check to make sure the rule doesn't reference
1471
// any undefined substitutions
1472
if (sawVarName) {
1473                                    error("Unknown variable name", p, description);
1474                                }
1475
1476                                // then add it to tempRuleList
1477
tempRuleList.addElement(description.substring(ruleStart, p));
1478                            }
1479
1480                            // and reset everything to process the next rule
1481
ruleStart = p + 1;
1482                            haveEquals = havePipe = sawVarName = sawIllegalChar = false;
1483                        }
1484                        break;
1485
1486                    // if the character is a vertical bar, check to make sure that it
1487
// occurs inside a () expression and that the character that precedes
1488
// it isn't also a vertical bar
1489
case '|':
1490                        if (lastC == '|') {
1491                            error("Empty alternative", p, description);
1492                        }
1493                        if (parenStack.empty() || lastOpen != '(') {
1494                            error("Misplaced |", p, description);
1495                        }
1496                        break;
1497
1498                    // if the character is anything else (escaped characters are
1499
// skipped and don't make it here), it's an error
1500
default:
1501                        if (c >= ' ' && c < '\u007f' && !Character.isLetter(c)
1502                                && !Character.isDigit(c) && !sawIllegalChar) {
1503                            sawIllegalChar = true;
1504                            illegalCharPos = p;
1505                        }
1506                        break;
1507                }
1508                lastC = c;
1509                ++p;
1510            }
1511            if (tempRuleList.size() == 0) {
1512                error("No valid rules in description", p, description);
1513            }
1514            return tempRuleList;
1515        }
1516
1517        /**
1518         * This function performs variable-name substitutions. First it does syntax
1519         * checking on the variable-name definition. If it's syntactically valid, it
1520         * then goes through the remainder of the description and does a simple
1521         * find-and-replace of the variable name with its text. (The variable text
1522         * must be enclosed in either [] or () for this to work.)
1523         * @internal
1524         */

1525        protected String JavaDoc processSubstitution(String JavaDoc substitutionRule, String JavaDoc description,
1526                        int startPos) {
1527            // isolate out the text on either side of the equals sign
1528
String JavaDoc replace;
1529            String JavaDoc replaceWith;
1530            int equalPos = substitutionRule.indexOf('=');
1531            if (substitutionRule.charAt(0) != '$') {
1532                error("Missing '$' on left-hand side of =", startPos, description);
1533            }
1534            replace = substitutionRule.substring(1, equalPos);
1535            replaceWith = substitutionRule.substring(equalPos + 1);
1536
1537            // check to see whether the substitution name is something we've declared
1538
// to be "special". For RuleBasedBreakIterator itself, this is IGNORE_VAR.
1539
// This function takes care of any extra processing that has to be done
1540
// with "special" substitution names.
1541
handleSpecialSubstitution(replace, replaceWith, startPos, description);
1542
1543            // perform various other syntax checks on the rule
1544
if (replaceWith.length() == 0) {
1545                error("Nothing on right-hand side of =", startPos, description);
1546            }
1547            if (replace.length() == 0) {
1548                error("Nothing on left-hand side of =", startPos, description);
1549            }
1550            if (!(replaceWith.charAt(0) == '[' && replaceWith.charAt(replaceWith.length() - 1)
1551                    == ']') && !(replaceWith.charAt(0) == '(' && replaceWith.charAt(
1552                    replaceWith.length() - 1) == ')')) {
1553                error("Illegal right-hand side for =", startPos, description);
1554            }
1555
1556            // now go through the rest of the description (which hasn't been broken up
1557
// into separate rules yet) and replace every occurrence of the
1558
// substitution name with the substitution body
1559
replace = "$" + replace;
1560            StringBuffer JavaDoc result = new StringBuffer JavaDoc();
1561            result.append(description.substring(0, startPos));
1562            int lastPos = startPos;
1563            int pos = description.indexOf(replace, startPos);
1564            while (pos != -1) {
1565                // [liu] Check that the string we've found isn't a redefinition
1566
// of the variable.
1567
if (description.charAt(pos-1) == ';' &&
1568                    description.charAt(pos + replace.length()) == '=') {
1569                    error("Attempt to redefine " + replace, pos, description);
1570                }
1571                result.append(description.substring(lastPos, pos));
1572                result.append(replaceWith);
1573                lastPos = pos + replace.length();
1574                pos = description.indexOf(replace, lastPos);
1575            }
1576            result.append(description.substring(lastPos));
1577            return result.toString();
1578        }
1579
1580        /**
1581         * This function defines a protocol for handling substitution names that
1582         * are "special," i.e., that have some property beyond just being
1583         * substitutions. At the RuleBasedBreakIterator_Old level, we have one
1584         * special substitution name, IGNORE_VAR. Subclasses can override this
1585         * function to add more. Any special processing that has to go on beyond
1586         * that which is done by the normal substitution-processing code is done
1587         * here.
1588         * @internal
1589         */

1590        protected void handleSpecialSubstitution(String JavaDoc replace, String JavaDoc replaceWith,
1591                    int startPos, String JavaDoc description) {
1592            // if we get a definition for a substitution called IGNORE_VAR, it defines
1593
// the ignore characters for the iterator. Check to make sure the expression
1594
// is a [] expression, and if it is, parse it and store the characters off
1595
// to the side.
1596
if (replace.equals(IGNORE_VAR)) {
1597                if (replaceWith.charAt(0) == '(') {
1598                    error("Ignore group can't be enclosed in (", startPos, description);
1599                }
1600                ignoreChars = new UnicodeSet(replaceWith, false);
1601            }
1602        }
1603
1604        /**
1605         * This function builds the character category table. On entry,
1606         * tempRuleList is a vector of break rules that has had variable names substituted.
1607         * On exit, the charCategoryTable data member has been initialized to hold the
1608         * character category table, and tempRuleList's rules have been munged to contain
1609         * character category numbers everywhere a literal character or a [] expression
1610         * originally occurred.
1611         * @internal
1612         */

1613        protected void buildCharCategories(Vector JavaDoc tempRuleList) {
1614            int bracketLevel = 0;
1615            int p = 0;
1616            int lineNum = 0;
1617
1618            // build hash table of every literal character or [] expression in the rule list
1619
// and derive a UnicodeSet object representing the characters each refers to
1620
expressions = new Hashtable JavaDoc();
1621            while (lineNum < tempRuleList.size()) {
1622                String JavaDoc line = (String JavaDoc)(tempRuleList.elementAt(lineNum));
1623                p = 0;
1624                while (p < line.length()) {
1625                    char c = line.charAt(p);
1626                    switch (c) {
1627                        // skip over all syntax characters except [
1628
case '(': case ')': case '*': case '.': case '/':
1629                        case '|': case ';': case '?': case '!': case '+':
1630                            break;
1631
1632                        // for [, find the matching ] (taking nested [] pairs into account)
1633
// and add the whole expression to the expression list
1634
case '[':
1635                            int q = p + 1;
1636                            ++bracketLevel;
1637                            while (q < line.length() && bracketLevel != 0) {
1638                                c = line.charAt(q);
1639                                if (c == '[') {
1640                                    ++bracketLevel;
1641                                }
1642                                else if (c == ']') {
1643                                    --bracketLevel;
1644                                }
1645                                ++q;
1646                            }
1647                            if (expressions.get(line.substring(p, q)) == null) {
1648                                expressions.put(line.substring(p, q), new UnicodeSet(line.
1649                                                substring(p, q), false));
1650//Test.debugPrintln("1. Adding expression: " + line.substring(p, q));
1651
}
1652                            p = q - 1;
1653                            break;
1654
1655                        // for \ sequences, just move to the next character and treat
1656
// it as a single character
1657
case '\\':
1658                            ++p;
1659                            c = line.charAt(p);
1660                            // DON'T break; fall through into "default" clause
1661

1662                        // for an isolated single character, add it to the expression list
1663
default:
1664                            UnicodeSet s = new UnicodeSet();
1665                            s.add(line.charAt(p));
1666                            expressions.put(line.substring(p, p + 1), s);
1667//Test.debugPrintln("2. Adding expression: " + line.substring(p, p + 1));
1668
break;
1669                    }
1670                    ++p;
1671                }
1672                ++lineNum;
1673            }
1674
1675            // create the temporary category table (which is a vector of UnicodeSet objects)
1676
categories = new Vector JavaDoc();
1677            if (ignoreChars != null) {
1678                categories.addElement(ignoreChars);
1679            }
1680            else {
1681                categories.addElement(new UnicodeSet());
1682            }
1683            ignoreChars = null;
1684
1685            // this is a hook to allow subclasses to add categories on their own
1686
mungeExpressionList(expressions);
1687
1688            // Derive the character categories. Go through the existing character categories
1689
// looking for overlap. Any time there's overlap, we create a new character
1690
// category for the characters that overlapped and remove them from their original
1691
// category. At the end, any characters that are left in the expression haven't
1692
// been mentioned in any category, so another new category is created for them.
1693
// For example, if the first expression is [abc], then a, b, and c will be placed
1694
// into a single character category. If the next expression is [bcd], we will first
1695
// remove b and c from their existing category (leaving a behind), create a new
1696
// category for b and c, and then create another new category for d (which hadn't
1697
// been mentioned in the previous expression).
1698
// At no time should a character ever occur in more than one character category.
1699

1700            // for each expression in the expressions list, do...
1701
Enumeration JavaDoc iter = expressions.elements();
1702            while (iter.hasMoreElements()) {
1703                // initialize the working char set to the chars in the current expression
1704
UnicodeSet work = new UnicodeSet((UnicodeSet)iter.nextElement());
1705
1706                // for each category in the category list, do...
1707
for (int j = categories.size() - 1; !work.isEmpty() && j > 0; j--) {
1708
1709                    // if there's overlap between the current working set of chars
1710
// and the current category...
1711
UnicodeSet cat = (UnicodeSet)(categories.elementAt(j));
1712                    UnicodeSet overlap = intersection(work, cat);
1713
1714                    if (!overlap.isEmpty()) {
1715                        // if the current category is not a subset of the current
1716
// working set of characters, then remove the overlapping
1717
// characters from the current category and create a new
1718
// category for them
1719
if (!overlap.equals(cat)) {
1720                            cat.removeAll(overlap);
1721                            categories.addElement(overlap);
1722                        }
1723
1724                        // and always remove the overlapping characters from the current
1725
// working set of characters
1726
work.removeAll(overlap);
1727                    }
1728                }
1729
1730                // if there are still characters left in the working char set,
1731
// add a new category containing them
1732
if (!work.isEmpty()) {
1733                    categories.addElement(work);
1734                }
1735            }
1736
1737            // we have the ignore characters stored in position 0. Make an extra pass through
1738
// the character category list and remove anything from the ignore list that shows
1739
// up in some other category
1740
UnicodeSet allChars = new UnicodeSet();
1741            for (int i = 1; i < categories.size(); i++)
1742                allChars.addAll((UnicodeSet)(categories.elementAt(i)));
1743            UnicodeSet ignoreChars = (UnicodeSet)(categories.elementAt(0));
1744            ignoreChars.removeAll(allChars);
1745
1746            // now that we've derived the character categories, go back through the expression
1747
// list and replace each UnicodeSet object with a String that represents the
1748
// character categories that expression refers to. The String is encoded: each
1749
// character is a character category number (plus 0x100 to avoid confusing them
1750
// with syntax characters in the rule grammar)
1751
iter = expressions.keys();
1752            while (iter.hasMoreElements()) {
1753                String JavaDoc key = (String JavaDoc)iter.nextElement();
1754                UnicodeSet cs = (UnicodeSet)expressions.get(key);
1755                StringBuffer JavaDoc cats = new StringBuffer JavaDoc();
1756
1757                // for each category...
1758
for (int j = 1; j < categories.size(); j++) {
1759                    UnicodeSet cat = new UnicodeSet((UnicodeSet) categories.elementAt(j));
1760
1761                    // if the current expression contains characters in that category...
1762
if (cs.containsAll(cat)) {
1763
1764                        // then add the encoded category number to the String for this
1765
// expression
1766
cats.append((char)(0x100 + j));
1767                        if (cs.equals(cat)) {
1768                            break;
1769                        }
1770                    }
1771                }
1772
1773                // once we've finished building the encoded String for this expression,
1774
// replace the UnicodeSet object with it
1775
expressions.put(key, cats.toString());
1776            }
1777
1778            // and finally, we turn the temporary category table into a permanent category
1779
// table, which is a CompactByteArray. (we skip category 0, which by definition
1780
// refers to all characters not mentioned specifically in the rules)
1781
charCategoryTable = new CompactByteArray((byte)0);
1782
1783            // for each category...
1784
for (int i = 0; i < categories.size(); i++) {
1785                UnicodeSet chars = (UnicodeSet)(categories.elementAt(i));
1786                int n = chars.getRangeCount();
1787
1788                // go through the character ranges in the category one by one...
1789
for (int j = 0; j < n; ++j) {
1790                    int rangeStart = chars.getRangeStart(j);
1791
1792                    // (ignore anything above the BMP for now...)
1793
if (rangeStart >= 0x10000) {
1794                        break;
1795                    }
1796
1797                    // and set the corresponding elements in the CompactArray accordingly
1798
if (i != 0) {
1799                        charCategoryTable.setElementAt((char)rangeStart,
1800                            (char)chars.getRangeEnd(j), (byte)i);
1801                    }
1802
1803                    // (category 0 is special-- it's the hiding place for the ignore
1804
// characters, whose real category number in the CompactArray is
1805
// -1 [this is because category 0 contains all characters not
1806
// specifically mentioned anywhere in the rules] )
1807
else {
1808                        charCategoryTable.setElementAt((char)rangeStart,
1809                            (char)chars.getRangeEnd(j), IGNORE);
1810                    }
1811                }
1812            }
1813
1814            // once we've populated the CompactArray, compact it
1815
charCategoryTable.compact();
1816
1817            // initialize numCategories
1818
numCategories = categories.size();
1819        }
1820
1821        /** @internal */
1822        protected void mungeExpressionList(Hashtable JavaDoc expressions) {
1823            // empty in the parent class. This function provides a hook for subclasses
1824
// to mess with the character category table.
1825
}
1826
1827        /**
1828         * This is the function that builds the forward state table. Most of the real
1829         * work is done in parseRule(), which is called once for each rule in the
1830         * description.
1831         */

1832        private void buildStateTable(Vector JavaDoc tempRuleList) {
1833            // initialize our temporary state table, and fill it with two states:
1834
// state 0 is a dummy state that allows state 1 to be the starting state
1835
// and 0 to represent "stop". State 1 is added here to seed things
1836
// before we start parsing
1837
tempStateTable = new Vector JavaDoc();
1838            tempStateTable.addElement(new short[numCategories + 1]);
1839            tempStateTable.addElement(new short[numCategories + 1]);
1840
1841            // call parseRule() for every rule in the rule list (except those which
1842
// start with !, which are actually backwards-iteration rules)
1843
// variable not used int n = tempRuleList.size();
1844
for (int i = 0; i < tempRuleList.size(); i++) {
1845                String JavaDoc rule = (String JavaDoc)tempRuleList.elementAt(i);
1846                if (rule.charAt(0) != '!') {
1847                    parseRule(rule, true);
1848                }
1849            }
1850
1851            // finally, use finishBuildingStateTable() to minimize the number of
1852
// states in the table and perform some other cleanup work
1853
finishBuildingStateTable(true);
1854/*
1855System.out.print("C:\t");
1856for (int i = 0; i < numCategories; i++)
1857System.out.print(Integer.toString(i) + "\t");
1858System.out.println(); System.out.print("=================================================");
1859for (int i = 0; i < stateTable.length; i++) {
1860if (i % numCategories == 0) {
1861System.out.println();
1862if (endStates[i / numCategories])
1863System.out.print("*");
1864else
1865System.out.print(" ");
1866if (lookaheadStates[i / numCategories]) {
1867System.out.print("%");
1868}
1869else
1870System.out.print(" ");
1871System.out.print(Integer.toString(i / numCategories) + ":\t");
1872}
1873if (stateTable[i] == 0) System.out.print(".\t"); else System.out.print(Integer.toString(stateTable[i]) + "\t");
1874}
1875System.out.println();
1876*/

1877    }
1878
1879        /**
1880         * This is where most of the work really happens. This routine parses a single
1881         * rule in the rule description, adding and modifying states in the state
1882         * table according to the new expression. The state table is kept deterministic
1883         * throughout the whole operation, although some ugly postprocessing is needed
1884         * to handle the *? token.
1885         */

1886        private void parseRule(String JavaDoc rule, boolean forward) {
1887            // algorithm notes:
1888
// - The basic idea here is to read successive character-category groups
1889
// from the input string. For each group, you create a state and point
1890
// the appropriate entries in the previous state to it. This produces a
1891
// straight line from the start state to the end state. The ?, +, *, and (|)
1892
// idioms produce branches in this straight line. These branches (states
1893
// that can transition to more than one other state) are called "decision
1894
// points." A list of decision points is kept. This contains a list of
1895
// all states that can transition to the next state to be created. For a
1896
// straight line progression, the only thing in the decision-point list is
1897
// the current state. But if there's a branch, the decision-point list
1898
// will contain all of the beginning points of the branch when the next
1899
// state to be created represents the end point of the branch. A stack is
1900
// used to save decision point lists in the presence of nested parentheses
1901
// and the like. For example, when a { is encountered, the current decision
1902
// point list is saved on the stack and restored when the corresponding }
1903
// is encountered. This way, after the } is read, the decision point list
1904
// will contain both the state right before the } _and_ the state before
1905
// the whole {} expression. Both of these states can transition to the next
1906
// state after the {} expression.
1907
// - one complication arises when we have to stamp a transition value into
1908
// an array cell that already contains one. The updateStateTable() and
1909
// mergeStates() functions handle this case. Their basic approach is to
1910
// create a new state that combines the two states that conflict and point
1911
// at it when necessary. This happens recursively, so if the merged states
1912
// also conflict, they're resolved in the same way, and so on. There are
1913
// a number of tests aimed at preventing infinite recursion.
1914
// - another complication arises with repeating characters. It's somewhat
1915
// ambiguous whether the user wants a greedy or non-greedy match in these cases.
1916
// (e.g., whether "[a-z]*abc" means the SHORTEST sequence of letters ending in
1917
// "abc" or the LONGEST sequence of letters ending in "abc". We've adopted
1918
// the *? to mean "shortest" and * by itself to mean "longest". (You get the
1919
// same result with both if there's no overlap between the repeating character
1920
// group and the group immediately following it.) Handling the *? token is
1921
// rather complicated and involves keeping track of whether a state needs to
1922
// be merged (as described above) or merely overwritten when you update one of
1923
// its cells, and copying the contents of a state that loops with a *? token
1924
// into some of the states that follow it after the rest of the table-building
1925
// process is complete ("backfilling").
1926
// implementation notes:
1927
// - This function assumes syntax checking has been performed on the input string
1928
// prior to its being passed in here. It assumes that parentheses are
1929
// balanced, all literal characters are enclosed in [] and turned into category
1930
// numbers, that there are no illegal characters or character sequences, and so
1931
// on. Violation of these invariants will lead to undefined behavior.
1932
// - It'd probably be better to use linked lists rather than Vector and Stack
1933
// to maintain the decision point list and stack. I went for simplicity in
1934
// this initial implementation. If performance is critical enough, we can go
1935
// back and fix this later.
1936
// -There are a number of important limitations on the *? token. It does not work
1937
// right when followed by a repeating character sequence (e.g., ".*?(abc)*")
1938
// (although it does work right when followed by a single repeating character).
1939
// It will not always work right when nested in parentheses or braces (although
1940
// sometimes it will). It also will not work right if the group of repeating
1941
// characters and the group of characters that follows overlap partially
1942
// (e.g., "[a-g]*?[e-j]"). None of these capabilites was deemed necessary for
1943
// describing breaking rules we know about, so we left them out for
1944
// expeditiousness.
1945
// - Rules such as "[a-z]*?abc;" will be treated the same as "[a-z]*?aa*bc;"--
1946
// that is, if the string ends in "aaaabc", the break will go before the first
1947
// "a" rather than the last one. Both of these are limitations in the design
1948
// of RuleBasedBreakIterator and not limitations of the rule parser.
1949

1950            int p = 0;
1951            int currentState = 1; // don't use state number 0; 0 means "stop"
1952
int lastState = currentState;
1953            String JavaDoc pendingChars = "";
1954
1955            decisionPointStack = new Stack JavaDoc();
1956            decisionPointList = new Vector JavaDoc();
1957            loopingStates = new Vector JavaDoc();
1958            statesToBackfill = new Vector JavaDoc();
1959
1960            short[] state;
1961            boolean sawEarlyBreak = false;
1962
1963            // if we're adding rules to the backward state table, mark the initial state
1964
// as a looping state
1965
if (!forward) {
1966                loopingStates.addElement(new Integer JavaDoc(1));
1967            }
1968
1969            // put the current state on the decision point list before we start
1970
decisionPointList.addElement(new Integer JavaDoc(currentState)); // we want currentState to
1971
// be 1 here...
1972
currentState = tempStateTable.size() - 1; // but after that, we want it to be
1973
// 1 less than the state number of the next state
1974
while (p < rule.length()) {
1975                char c = rule.charAt(p);
1976                clearLoopingStates = false;
1977
1978                // this section handles literal characters, escaped characters (which are
1979
// effectively literal characters too), the . token, and [] expressions
1980
if (c == '['
1981                    || c == '\\'
1982                    || Character.isLetter(c)
1983                    || Character.isDigit(c)
1984                    || c < ' '
1985                    || c == '.'
1986                    || c >= '\u007f') {
1987
1988                    // if we're not on a period, isolate the expression and look up
1989
// the corresponding category list
1990
if (c != '.') {
1991                        int q = p;
1992
1993                        // if we're on a backslash, the expression is the character
1994
// after the backslash
1995
if (c == '\\') {
1996                            q = p + 2;
1997                            ++p;
1998                        }
1999
2000                        // if we're on an opening bracket, scan to the closing bracket
2001
// to isolate the expression
2002
else if (c == '[') {
2003                            int bracketLevel = 1;
2004                            while (bracketLevel > 0) {
2005                                ++q;
2006                                c = rule.charAt(q);
2007                                if (c == '[') {
2008                                    ++bracketLevel;
2009                                }
2010                                else if (c == ']') {
2011                                    --bracketLevel;
2012                                }
2013                                else if (c == '\\') {
2014                                    ++q;
2015                                }
2016                            }
2017                            ++q;
2018                        }
2019
2020                        // otherwise, the expression is just the character itself
2021
else {
2022                            q = p + 1;
2023                        }
2024
2025                        // look up the category list for the expression and store it
2026
// in pendingChars
2027
pendingChars = (String JavaDoc)expressions.get(rule.substring(p, q));
2028
2029                        // advance the current position past the expression
2030
p = q - 1;
2031                    }
2032
2033                    // if the character we're on is a period, we end up down here
2034
else {
2035                        int rowNum = ((Integer JavaDoc)decisionPointList.lastElement()).intValue();
2036                        state = (short[])tempStateTable.elementAt(rowNum);
2037
2038                        // if the period is followed by an asterisk, then just set the current
2039
// state to loop back on itself
2040
if (p + 1 < rule.length() && rule.charAt(p + 1) == '*' && state[0] != 0) {
2041                            decisionPointList.addElement(new Integer JavaDoc(state[0]));
2042                            pendingChars = "";
2043                            ++p;
2044                            if (p + 1 < rule.length() && rule.charAt(p + 1) == '?') {
2045//System.out.println("Saw *?");
2046
setLoopingStates(decisionPointList, decisionPointList);
2047                                ++p;
2048                            }
2049//System.out.println("Saw .*");
2050
}
2051
2052                        // otherwise, fabricate a category list ("pendingChars") with
2053
// every category in it
2054
else {
2055                            StringBuffer JavaDoc temp = new StringBuffer JavaDoc();
2056                            for (int i = 0; i < numCategories; i++)
2057                                temp.append((char)(i + 0x100));
2058                            pendingChars = temp.toString();
2059                        }
2060                    }
2061
2062                    // we'll end up in here for all expressions except for .*, which is
2063
// special-cased above
2064
if (pendingChars.length() != 0) {
2065
2066                        // if the expression is followed by an asterisk or a question mark,
2067
// then push a copy of the current decision point list onto the stack
2068
if (p + 1 < rule.length() && (
2069                            rule.charAt(p + 1) == '*' ||
2070                            rule.charAt(p + 1) == '?'
2071                        )) {
2072                            decisionPointStack.push(decisionPointList.clone());
2073                        }
2074
2075                        // create a new state, add it to the list of states to backfill
2076
// if we have looping states to worry about, set its "don't make
2077
// me an accepting state" flag if we've seen a slash, and add
2078
// it to the end of the state table
2079
int newState = tempStateTable.size();
2080                        if (loopingStates.size() != 0) {
2081                            statesToBackfill.addElement(new Integer JavaDoc(newState));
2082                        }
2083                        state = new short[numCategories + 1];
2084                        if (sawEarlyBreak) {
2085                            state[numCategories] = DONT_LOOP_FLAG;
2086                        }
2087                        tempStateTable.addElement(state);
2088
2089                        // update everybody in the decision point list to point to
2090
// the new state (this also performs all the reconciliation
2091
// needed to make the table deterministic), then clear the
2092
// decision point list
2093
updateStateTable(decisionPointList, pendingChars, (short)newState);
2094                        decisionPointList.removeAllElements();
2095
2096                        // add all states created since the last literal character we've
2097
// seen to the decision point list
2098
lastState = currentState;
2099                        do {
2100                            ++currentState;
2101                            decisionPointList.addElement(new Integer JavaDoc(currentState));
2102                        } while (currentState + 1 < tempStateTable.size());
2103                    }
2104                }
2105
2106                // a * or a + denotes a repeating character or group, and a ? denotes an
2107
// optional character group. (*, + and ? after () are handled separately below.)
2108
if (c == '+' || c == '*' || c == '?') {
2109                    // when there's a * or a +, update the current state to loop back on itself
2110
// on the character categories that caused us to enter this state
2111
if (c == '*' || c == '+') {
2112                        // Note: we process one state at a time because updateStateTable
2113
// may add new states, and we want to process them as well.
2114
for (int i = lastState + 1; i < tempStateTable.size(); i++) {
2115                            Vector JavaDoc temp = new Vector JavaDoc();
2116                            temp.addElement(new Integer JavaDoc(i));
2117                            updateStateTable(temp, pendingChars, (short)(lastState + 1));
2118                        }
2119
2120                        // If we just added any new states, add them to the decison point list
2121
// Note: it might be a good idea to avoid adding new states to the
2122
// decision point list in more than one place...
2123
while (currentState + 1 < tempStateTable.size()) {
2124                            decisionPointList.addElement(new Integer JavaDoc(++currentState));
2125                        }
2126                    }
2127
2128                    // for * and ? pop the top element off the decision point stack and merge
2129
// it with the current decision point list (this causes the divergent
2130
// paths through the state table to come together again on the next
2131
// new state)
2132
if (c == '*' || c == '?') {
2133                        Vector JavaDoc temp = (Vector JavaDoc)decisionPointStack.pop();
2134                        for (int i = 0; i < decisionPointList.size(); i++)
2135                            temp.addElement(decisionPointList.elementAt(i));
2136                        decisionPointList = temp;
2137
2138                        // a ? after a * modifies the behavior of * in cases where there is overlap
2139
// between the set of characters that repeat and the characters which follow.
2140
// Without the ?, all states following the repeating state, up to a state which
2141
// is reached by a character that doesn't overlap, will loop back into the
2142
// repeating state. With the ?, the mark states following the *? DON'T loop
2143
// back into the repeating state. Thus, "[a-z]*xyz" will match the longest
2144
// sequence of letters that ends in "xyz," while "[a-z]*? will match the
2145
// _shortest_ sequence of letters that ends in "xyz".
2146
// We use extra bookkeeping to achieve this effect, since everything else works
2147
// according to the "longest possible match" principle. The basic principle
2148
// is that transitions out of a looping state are written in over the looping
2149
// value instead of being reconciled, and that we copy the contents of the
2150
// looping state into empty cells of all non-terminal states that follow the
2151
// looping state.
2152
//System.out.println("c = " + c + ", p = " + p + ", rule.length() = " + rule.length());
2153
if (c == '*' && p + 1 < rule.length() && rule.charAt(p + 1) == '?') {
2154//System.out.println("Saw *?");
2155
setLoopingStates(decisionPointList, decisionPointList);
2156                            ++p;
2157                        }
2158                    }
2159                }
2160
2161                // a ( marks the beginning of a sequence of characters. Parentheses can either
2162
// contain several alternative character sequences (i.e., "(ab|cd|ef)"), or
2163
// they can contain a sequence of characters that can repeat (i.e., "(abc)*"). Thus,
2164
// A () group can have multiple entry and exit points. To keep track of this,
2165
// we reserve TWO spots on the decision-point stack. The top of the stack is
2166
// the list of exit points, which becomes the current decision point list when
2167
// the ) is reached. The next entry down is the decision point list at the
2168
// beginning of the (), which becomes the current decision point list at every
2169
// entry point.
2170
// In addition to keeping track of the exit points and the active decision
2171
// points before the ( (i.e., the places from which the () can be entered),
2172
// we need to keep track of the entry points in case the expression loops
2173
// (i.e., is followed by *). We do that by creating a dummy state in the
2174
// state table and adding it to the decision point list (BEFORE it's duplicated
2175
// on the stack). Nobody points to this state, so it'll get optimized out
2176
// at the end. It exists only to hold the entry points in case the ()
2177
// expression loops.
2178
if (c == '(') {
2179
2180                    // add a new state to the state table to hold the entry points into
2181
// the () expression
2182
tempStateTable.addElement(new short[numCategories + 1]);
2183
2184                    // we have to adjust lastState and currentState to account for the
2185
// new dummy state
2186
lastState = currentState;
2187                    ++currentState;
2188
2189                    // add the current state to the decision point list (add it at the
2190
// BEGINNING so we can find it later)
2191
decisionPointList.insertElementAt(new Integer JavaDoc(currentState), 0);
2192
2193                    // finally, push a copy of the current decision point list onto the
2194
// stack (this keeps track of the active decision point list before
2195
// the () expression), followed by an empty decision point list
2196
// (this will hold the exit points)
2197
decisionPointStack.push(decisionPointList.clone());
2198                    decisionPointStack.push(new Vector JavaDoc());
2199                }
2200
2201                // a | separates alternative character sequences in a () expression. When
2202
// a | is encountered, we add the current decision point list to the exit-point
2203
// list, and restore the decision point list to its state prior to the (.
2204
if (c == '|') {
2205
2206                    // pick out the top two decision point lists on the stack
2207
Vector JavaDoc oneDown = (Vector JavaDoc)decisionPointStack.pop();
2208                    Vector JavaDoc twoDown = (Vector JavaDoc)decisionPointStack.peek();
2209                    decisionPointStack.push(oneDown);
2210
2211                    // append the current decision point list to the list below it
2212
// on the stack (the list of exit points), and restore the
2213
// current decision point list to its state before the () expression
2214
for (int i = 0; i < decisionPointList.size(); i++)
2215                        oneDown.addElement(decisionPointList.elementAt(i));
2216                    decisionPointList = (Vector JavaDoc)twoDown.clone();
2217                }
2218
2219                // a ) marks the end of a sequence of characters. We do one of two things
2220
// depending on whether the sequence repeats (i.e., whether the ) is followed
2221
// by *): If the sequence doesn't repeat, then the exit-point list is merged
2222
// with the current decision point list and the decision point list from before
2223
// the () is thrown away. If the sequence does repeat, then we fish out the
2224
// state we were in before the ( and copy all of its forward transitions
2225
// (i.e., every transition added by the () expression) into every state in the
2226
// exit-point list and the current decision point list. The current decision
2227
// point list is then merged with both the exit-point list AND the saved version
2228
// of the decision point list from before the (). Then we throw out the *.
2229
if (c == ')') {
2230
2231                    // pull the exit point list off the stack, merge it with the current
2232
// decision point list, and make the merged version the current
2233
// decision point list
2234
Vector JavaDoc exitPoints = (Vector JavaDoc)decisionPointStack.pop();
2235                    for (int i = 0; i < decisionPointList.size(); i++)
2236                        exitPoints.addElement(decisionPointList.elementAt(i));
2237                    decisionPointList = exitPoints;
2238
2239                    // if the ) isn't followed by a *, + or ?, then all we have to do is throw
2240
// away the other list on the decision point stack, and we're done
2241
if (p + 1 >= rule.length() || (
2242                            rule.charAt(p + 1) != '*' &&
2243                            rule.charAt(p + 1) != '+' &&
2244                            rule.charAt(p + 1) != '?')
2245                    ) {
2246                        decisionPointStack.pop();
2247                    }
2248
2249                    // but if the sequence is conditional or it repeats,
2250
// we have a lot more work to do...
2251
else {
2252
2253                        // now exitPoints and decisionPointList have to point to equivalent
2254
// vectors, but not the SAME vector
2255
exitPoints = (Vector JavaDoc)decisionPointList.clone();
2256
2257                        // pop the original decision point list off the stack
2258
Vector JavaDoc temp = (Vector JavaDoc)decisionPointStack.pop();
2259
2260                        // we squirreled away the row number of our entry point list
2261
// at the beginning of the original decision point list. Fish
2262
// that state number out and retrieve the entry point list
2263
int tempStateNum = ((Integer JavaDoc)temp.firstElement()).intValue();
2264                        short[] tempState = (short[])tempStateTable.elementAt(tempStateNum);
2265
2266                        // merge the original decision point list with the current
2267
// decision point list
2268
if (rule.charAt(p + 1) == '?' || rule.charAt(p + 1) == '*') {
2269                            for (int i = 0; i < decisionPointList.size(); i++)
2270                                temp.addElement(decisionPointList.elementAt(i));
2271                            decisionPointList = temp;
2272                        }
2273
2274                        // finally, for * and + copy every forward reference from the entry point
2275
// list into every state in the new decision point list
2276
if (rule.charAt(p + 1) == '+' || rule.charAt(p + 1) == '*') {
2277                            for (int i = 0; i < tempState.length; i++) {
2278                                if (tempState[i] > tempStateNum) {
2279                                    updateStateTable(exitPoints,
2280                                                     new Character JavaDoc((char)(i + 0x100)).toString(),
2281                                                     tempState[i]);
2282                                }
2283                            }
2284                        }
2285
2286                        // update lastState and currentState, and throw away the *, +, or ?
2287
lastState = currentState;
2288                        currentState = tempStateTable.size() - 1;
2289                        ++p;
2290                    }
2291                }
2292
2293                // a / marks the position where the break is to go if the character sequence
2294
// matches this rule. We update the flag word of every state on the decision
2295
// point list to mark them as ending states, and take note of the fact that
2296
// we've seen the slash
2297
if (c == '/') {
2298                    sawEarlyBreak = true;
2299                    for (int i = 0; i < decisionPointList.size(); i++) {
2300                        state = (short[])tempStateTable.elementAt(((Integer JavaDoc)decisionPointList.
2301                                        elementAt(i)).intValue());
2302                        state[numCategories] |= LOOKAHEAD_STATE_FLAG;
2303                    }
2304                }
2305
2306                // if we get here without executing any of the above clauses, we have a
2307
// syntax error. However, for now we just ignore the offending character
2308
// and move on
2309
/*
2310debugPrintln("====Parsed \"" + rule.substring(0, p + 1) + "\"...");
2311System.out.println(" currentState = " + currentState);
2312debugPrintVectorOfVectors(" decisionPointStack:", " ", decisionPointStack);
2313debugPrintVector(" ", decisionPointList);
2314debugPrintVector(" loopingStates = ", loopingStates);
2315debugPrintVector(" statesToBackfill = ", statesToBackfill);
2316System.out.println(" sawEarlyBreak = " + sawEarlyBreak);
2317debugPrintTempStateTable();
2318*/

2319
2320                // clearLoopingStates is a signal back from updateStateTable() that we've
2321
// transitioned to a state that won't loop back to the current looping
2322
// state. (In other words, we've gotten to a point where we can no longer
2323
// go back into a *? we saw earlier.) Clear out the list of looping states
2324
// and backfill any states that need to be backfilled.
2325
if (clearLoopingStates) {
2326                    setLoopingStates(null, decisionPointList);
2327                }
2328
2329                // advance to the next character, now that we've processed the current
2330
// character
2331
++p;
2332            }
2333
2334            // this takes care of backfilling any states that still need to be backfilled
2335
setLoopingStates(null, decisionPointList);
2336
2337            // when we reach the end of the string, we do a postprocessing step to mark the
2338
// end states. The decision point list contains every state that can transition
2339
// to the end state-- that is, every state that is the last state in a sequence
2340
// that matches the rule. All of these states are considered "mark states"
2341
// or "accepting states"-- that is, states that cause the position returned from
2342
// next() to be updated. A mark state represents a possible break position.
2343
// This allows us to look ahead and remember how far the rule matched
2344
// before following the new branch (see next() for more information).
2345
// The temporary state table has an extra "flag column" at the end where this
2346
// information is stored. We mark the end states by setting a flag in their
2347
// flag column.
2348
// Now if we saw the / in the rule, then everything after it is lookahead
2349
// material and the break really goes where the slash is. In this case,
2350
// we mark these states as BOTH accepting states and lookahead states. This
2351
// signals that these states cause the break position to be updated to the
2352
// position of the slash rather than the current break position.
2353
for (int i = 0; i < decisionPointList.size(); i++) {
2354                int rowNum = ((Integer JavaDoc)decisionPointList.elementAt(i)).intValue();
2355                state = (short[])tempStateTable.elementAt(rowNum);
2356                state[numCategories] |= END_STATE_FLAG;
2357                if (sawEarlyBreak) {
2358                    state[numCategories] |= LOOKAHEAD_STATE_FLAG;
2359                }
2360            }
2361/*
2362debugPrintln("====Parsed \"" + rule + ";");
2363System.out.println();
2364System.out.println(" currentState = " + currentState);
2365debugPrintVectorOfVectors(" decisionPointStack:", " ", decisionPointStack);
2366debugPrintVector(" ", decisionPointList);
2367debugPrintVector(" loopingStates = ", loopingStates);
2368debugPrintVector(" statesToBackfill = ", statesToBackfill);
2369System.out.println(" sawEarlyBreak = " + sawEarlyBreak);
2370debugPrintTempStateTable();
2371*/

2372        }
2373
2374
2375        /**
2376         * Update entries in the state table, and merge states when necessary to keep
2377         * the table deterministic.
2378         * @param rows The list of rows that need updating (the decision point list)
2379         * @param pendingChars A character category list, encoded in a String. This is the
2380         * list of the columns that need updating.
2381         * @param newValue Update the cells specfied above to contain this value
2382         */

2383        private void updateStateTable(Vector JavaDoc rows,
2384                                      String JavaDoc pendingChars,
2385                                      short newValue) {
2386            // create a dummy state that has the specified row number (newValue) in
2387
// the cells that need to be updated (those specified by pendingChars)
2388
// and 0 in the other cells
2389
short[] newValues = new short[numCategories + 1];
2390            for (int i = 0; i < pendingChars.length(); i++)
2391                newValues[(int)(pendingChars.charAt(i)) - 0x100] = newValue;
2392
2393            // go through the list of rows to update, and update them by calling
2394
// mergeStates() to merge them the the dummy state we created
2395
for (int i = 0; i < rows.size(); i++) {
2396                mergeStates(((Integer JavaDoc)rows.elementAt(i)).intValue(), newValues, rows);
2397            }
2398        }
2399
2400        /**
2401         * The real work of making the state table deterministic happens here. This function
2402         * merges a state in the state table (specified by rowNum) with a state that is
2403         * passed in (newValues). The basic process is to copy the nonzero cells in newStates
2404         * into the state in the state table (we'll call that oldValues). If there's a
2405         * collision (i.e., if the same cell has a nonzero value in both states, and it's
2406         * not the SAME value), then we have to reconcile the collision. We do this by
2407         * creating a new state, adding it to the end of the state table, and using this
2408         * function recursively to merge the original two states into a single, combined
2409         * state. This process may happen recursively (i.e., each successive level may
2410         * involve collisions). To prevent infinite recursion, we keep a log of merge
2411         * operations. Any time we're merging two states we've merged before, we can just
2412         * supply the row number for the result of that merge operation rather than creating
2413         * a new state just like it.
2414         * @param rowNum The row number in the state table of the state to be updated
2415         * @param newValues The state to merge it with.
2416         * @param rowsBeingUpdated A copy of the list of rows passed to updateStateTable()
2417         * (itself a copy of the decision point list from parseRule()). Newly-created
2418         * states get added to the decision point list if their "parents" were on it.
2419         */

2420        private void mergeStates(int rowNum,
2421                                 short[] newValues,
2422                                 Vector JavaDoc rowsBeingUpdated) {
2423            short[] oldValues = (short[])(tempStateTable.elementAt(rowNum));
2424/*
2425System.out.print("***Merging " + rowNum + ":");
2426for (int i = 0; i < oldValues.length; i++) System.out.print("\t" + oldValues[i]);
2427System.out.println();
2428System.out.print(" with \t");
2429for (int i = 0; i < newValues.length; i++) System.out.print("\t" + newValues[i]);
2430System.out.println();
2431*/

2432
2433            boolean isLoopingState = loopingStates.contains(new Integer JavaDoc(rowNum));
2434
2435            // for each of the cells in the rows we're reconciling, do...
2436
for (int i = 0; i < oldValues.length; i++) {
2437
2438                // if they contain the same value, we don't have to do anything
2439
if (oldValues[i] == newValues[i]) {
2440                    continue;
2441                }
2442
2443                // if oldValues is a looping state and the state the current cell points to
2444
// is too, then we can just stomp over the current value of that cell (and
2445
// set the clear-looping-states flag if necessary)
2446
else if (isLoopingState && loopingStates.contains(new Integer JavaDoc(oldValues[i]))) {
2447                    if (newValues[i] != 0) {
2448                        if (oldValues[i] == 0) {
2449                            clearLoopingStates = true;
2450                        }
2451                        oldValues[i] = newValues[i];
2452                    }
2453                }
2454
2455                // if the current cell in oldValues is 0, copy in the corresponding value
2456
// from newValues
2457
else if (oldValues[i] == 0) {
2458                    oldValues[i] = newValues[i];
2459                }
2460
2461                // the last column of each row is the flag column. Take care to merge the
2462
// flag words correctly
2463
else if (i == numCategories) {
2464                    oldValues[i] = (short)((newValues[i] & ALL_FLAGS) | oldValues[i]);
2465                }
2466
2467                // if both newValues and oldValues have a nonzero value in the current
2468
// cell, and it isn't the same value both places...
2469
else if (oldValues[i] != 0 && newValues[i] != 0) {
2470
2471                    // look up this pair of cell values in the merge list. If it's
2472
// found, update the cell in oldValues to point to the merged state
2473
int combinedRowNum = searchMergeList(oldValues[i], newValues[i]);
2474                    if (combinedRowNum != 0) {
2475                        oldValues[i] = (short)combinedRowNum;
2476                    }
2477
2478                    // otherwise, we have to reconcile them...
2479
else {
2480                        // copy our row numbers into variables to make things easier
2481
int oldRowNum = oldValues[i];
2482                        int newRowNum = newValues[i];
2483                        combinedRowNum = tempStateTable.size();
2484
2485                        // add this pair of row numbers to the merge list (create it first
2486
// if we haven't created the merge list yet)
2487
if (mergeList == null) {
2488                            mergeList = new Vector JavaDoc();
2489                        }
2490                        mergeList.addElement(new int[] { oldRowNum, newRowNum, combinedRowNum });
2491
2492//System.out.println("***At " + rowNum + ", merging " + oldRowNum + " and " + newRowNum + " into " + combinedRowNum);
2493

2494                        // create a new row to represent the merged state, and copy the
2495
// contents of oldRow into it, then add it to the end of the
2496
// state table and update the original row (oldValues) to point
2497
// to the new, merged, state
2498
short[] newRow = new short[numCategories + 1];
2499                        short[] oldRow = (short[])(tempStateTable.elementAt(oldRowNum));
2500                        System.arraycopy(oldRow, 0, newRow, 0, numCategories + 1);
2501                        tempStateTable.addElement(newRow);
2502                        oldValues[i] = (short)combinedRowNum;
2503
2504
2505//System.out.println("lastOldRowNum = " + lastOldRowNum);
2506
//System.out.println("lastCombinedRowNum = " + lastCombinedRowNum);
2507
//System.out.println("decisionPointList.contains(lastOldRowNum) = " + decisionPointList.contains(new Integer(lastOldRowNum)));
2508
//System.out.println("decisionPointList.contains(lastCombinedRowNum) = " + decisionPointList.contains(new Integer(lastCombinedRowNum)));
2509

2510                        // if the decision point list contains either of the parent rows,
2511
// update it to include the new row as well
2512
if ((decisionPointList.contains(new Integer JavaDoc(oldRowNum))
2513                                || decisionPointList.contains(new Integer JavaDoc(newRowNum)))
2514                            && !decisionPointList.contains(new Integer JavaDoc(combinedRowNum))
2515                        ) {
2516                            decisionPointList.addElement(new Integer JavaDoc(combinedRowNum));
2517                        }
2518
2519                        // do the same thing with the list of rows being updated
2520
if ((rowsBeingUpdated.contains(new Integer JavaDoc(oldRowNum))
2521                                || rowsBeingUpdated.contains(new Integer JavaDoc(newRowNum)))
2522                            && !rowsBeingUpdated.contains(new Integer JavaDoc(combinedRowNum))
2523                        ) {
2524                            decisionPointList.addElement(new Integer JavaDoc(combinedRowNum));
2525                        }
2526                        // now (groan) do the same thing for all the entries on the
2527
// decision point stack
2528
for (int k = 0; k < decisionPointStack.size(); k++) {
2529                            Vector JavaDoc dpl = (Vector JavaDoc)decisionPointStack.elementAt(k);
2530                            if ((dpl.contains(new Integer JavaDoc(oldRowNum))
2531                                    || dpl.contains(new Integer JavaDoc(newRowNum)))
2532                                && !dpl.contains(new Integer JavaDoc(combinedRowNum))
2533                            ) {
2534                                dpl.addElement(new Integer JavaDoc(combinedRowNum));
2535                            }
2536                        }
2537
2538                        // FINALLY (puff puff puff), call mergeStates() recursively to copy
2539
// the row referred to by newValues into the new row and resolve any
2540
// conflicts that come up at that level
2541
mergeStates(combinedRowNum, (short[])(tempStateTable.elementAt(
2542                                        newValues[i])), rowsBeingUpdated);
2543                    }
2544                }
2545            }
2546            return;
2547        }
2548
2549        /**
2550         * The merge list is a list of pairs of rows that have been merged somewhere in
2551         * the process of building this state table, along with the row number of the
2552         * row containing the merged state. This function looks up a pair of row numbers
2553         * and returns the row number of the row they combine into. (It returns 0 if
2554         * this pair of rows isn't in the merge list.)
2555         */

2556        private int searchMergeList(int a, int b) {
2557            // if there is no merge list, there obviously isn't anything in it
2558
if (mergeList == null) {
2559                return 0;
2560            }
2561
2562            // otherwise, for each element in the merge list...
2563
else {
2564                int[] entry;
2565                for (int i = 0; i < mergeList.size(); i++) {
2566                    entry = (int[])(mergeList.elementAt(i));
2567
2568                    // we have a hit if the two row numbers match the two row numbers
2569
// in the beginning of the entry (the two that combine), in either
2570
// order
2571
if ((entry[0] == a && entry[1] == b) || (entry[0] == b && entry[1] == a)) {
2572                        return entry[2];
2573                    }
2574
2575                    // we also have a hit if one of the two row numbers matches the marged
2576
// row number and the other one matches one of the original row numbers
2577
if ((entry[2] == a && (entry[0] == b || entry[1] == b))) {
2578                        return entry[2];
2579                    }
2580                    if ((entry[2] == b && (entry[0] == a || entry[1] == a))) {
2581                        return entry[2];
2582                    }
2583                }
2584                return 0;
2585            }
2586        }
2587
2588        /**
2589         * This function is used to update the list of current loooping states (i.e.,
2590         * states that are controlled by a *? construct). It backfills values from
2591         * the looping states into unpopulated cells of the states that are currently
2592         * marked for backfilling, and then updates the list of looping states to be
2593         * the new list
2594         * @param newLoopingStates The list of new looping states
2595         * @param endStates The list of states to treat as end states (states that
2596         * can exit the loop).
2597         */

2598        private void setLoopingStates(Vector JavaDoc newLoopingStates, Vector JavaDoc endStates) {
2599
2600            // if the current list of looping states isn't empty, we have to backfill
2601
// values from the looping states into the states that are waiting to be
2602
// backfilled
2603
if (!loopingStates.isEmpty()) {
2604                int loopingState = ((Integer JavaDoc)loopingStates.lastElement()).intValue();
2605                int rowNum;
2606
2607                // don't backfill into an end state OR any state reachable from an end state
2608
// (since the search for reachable states is recursive, it's split out into
2609
// a separate function, eliminateBackfillStates(), below)
2610
for (int i = 0; i < endStates.size(); i++) {
2611                    eliminateBackfillStates(((Integer JavaDoc)endStates.elementAt(i)).intValue());
2612                }
2613
2614                // we DON'T actually backfill the states that need to be backfilled here.
2615
// Instead, we MARK them for backfilling. The reason for this is that if
2616
// there are multiple rules in the state-table description, the looping
2617
// states may have some of their values changed by a succeeding rule, and
2618
// this wouldn't be reflected in the backfilled states. We mark a state
2619
// for backfilling by putting the row number of the state to copy from
2620
// into the flag cell at the end of the row
2621
for (int i = 0; i < statesToBackfill.size(); i++) {
2622                    rowNum = ((Integer JavaDoc)statesToBackfill.elementAt(i)).intValue();
2623                    short[] state = (short[])tempStateTable.elementAt(rowNum);
2624                    state[numCategories] =
2625                        (short)((state[numCategories] & ALL_FLAGS) | loopingState);
2626                }
2627                statesToBackfill.removeAllElements();
2628                loopingStates.removeAllElements();
2629            }
2630
2631            if (newLoopingStates != null) {
2632                loopingStates = (Vector JavaDoc)newLoopingStates.clone();
2633            }
2634        }
2635
2636        /**
2637         * This removes "ending states" and states reachable from them from the
2638         * list of states to backfill.
2639         * @param The row number of the state to remove from the backfill list
2640         */

2641        private void eliminateBackfillStates(int baseState) {
2642
2643            // don't do anything unless this state is actually in the backfill list...
2644
if (statesToBackfill.contains(new Integer JavaDoc(baseState))) {
2645
2646                // if it is, take it out
2647
statesToBackfill.removeElement(new Integer JavaDoc(baseState));
2648
2649                // then go through and recursively call this function for every
2650
// state that the base state points to
2651
short[] state = (short[])tempStateTable.elementAt(baseState);
2652                for (int i = 0; i < numCategories; i++) {
2653                    if (state[i] != 0) {
2654                        eliminateBackfillStates(state[i]);
2655                    }
2656                }
2657            }
2658        }
2659
2660        /**
2661         * This function completes the backfilling process by actually doing the
2662         * backfilling on the states that are marked for it
2663         */

2664        private void backfillLoopingStates() {
2665            short[] state;
2666            short[] loopingState = null;
2667            int loopingStateRowNum = 0;
2668            int fromState;
2669
2670            // for each state in the state table...
2671
for (int i = 0; i < tempStateTable.size(); i++) {
2672                state = (short[])tempStateTable.elementAt(i);
2673
2674                // check the state's flag word to see if it's marked for backfilling
2675
// (it's marked for backfilling if any bits other than the two high-order
2676
// bits are set-- if they are, then the flag word, minus the two high bits,
2677
// is the row number to copy from)
2678
fromState = state[numCategories] & ~ALL_FLAGS;
2679                if (fromState > 0) {
2680
2681                    // load up the state to copy from (if we haven't already)
2682
if (fromState != loopingStateRowNum) {
2683                        loopingStateRowNum = fromState;
2684                        loopingState = (short[])tempStateTable.elementAt(loopingStateRowNum);
2685                    }
2686
2687                    // clear out the backfill part of the flag word
2688
state[numCategories] &= ALL_FLAGS;
2689
2690                    // then fill all zero cells in the current state with values
2691
// from the corresponding cells of the fromState
2692
for (int j = 0; j < state.length; j++) {
2693                        if (state[j] == 0) {
2694                            state[j] = loopingState[j];
2695                        }
2696                        else if (state[j] == DONT_LOOP_FLAG) {
2697                            state[j] = 0;
2698                        }
2699                    }
2700                }
2701            }
2702        }
2703
2704        /**
2705         * This function completes the state-table-building process by doing several
2706         * postprocessing steps and copying everything into its final resting place
2707         * in the iterator itself
2708         * @param forward True if we're working on the forward state table
2709         */

2710        private void finishBuildingStateTable(boolean forward) {
2711//debugPrintTempStateTable();
2712
// start by backfilling the looping states
2713
backfillLoopingStates();
2714//debugPrintTempStateTable();
2715

2716            int[] rowNumMap = new int[tempStateTable.size()];
2717            Stack JavaDoc rowsToFollow = new Stack JavaDoc();
2718            rowsToFollow.push(new Integer JavaDoc(1));
2719            rowNumMap[1] = 1;
2720
2721            // determine which states are no longer reachable from the start state
2722
// (the reachable states will have their row numbers in the row number
2723
// map, and the nonreachable states will have zero in the row number map)
2724
while (rowsToFollow.size() != 0) {
2725                int rowNum = ((Integer JavaDoc)rowsToFollow.pop()).intValue();
2726                short[] row = (short[])(tempStateTable.elementAt(rowNum));
2727
2728                for (int i = 0; i < numCategories; i++) {
2729                    if (row[i] != 0) {
2730                        if (rowNumMap[row[i]] == 0) {
2731                            rowNumMap[row[i]] = row[i];
2732                            rowsToFollow.push(new Integer JavaDoc(row[i]));
2733                        }
2734                    }
2735                }
2736            }
2737/*
2738System.out.println("The following rows are not reachable:");
2739for (int i = 1; i < rowNumMap.length; i++)
2740if (rowNumMap[i] == 0) System.out.print("\t" + i);
2741System.out.println();
2742*/

2743
2744            // variable not used boolean madeChange;
2745
int newRowNum;
2746
2747            // algorithm for minimizing the number of states in the table adapted from
2748
// Aho & Ullman, "Principles of Compiler Design"
2749
// The basic idea here is to organize the states into classes. When we're done,
2750
// all states in the same class can be considered identical and all but one eliminated.
2751

2752            // initially assign states to classes based on the number of populated cells they
2753
// contain (the class number is the number of populated cells)
2754
int[] stateClasses = new int[tempStateTable.size()];
2755            int nextClass = numCategories + 1;
2756            short[] state1, state2;
2757            for (int i = 1; i < stateClasses.length; i++) {
2758                if (rowNumMap[i] == 0) {
2759                    continue;
2760                }
2761                state1 = (short[])tempStateTable.elementAt(i);
2762                for (int j = 0; j < numCategories; j++) {
2763                    if (state1[j] != 0) {
2764                        ++stateClasses[i];
2765                    }
2766                }
2767                if (stateClasses[i] == 0) {
2768                    stateClasses[i] = nextClass;
2769                }
2770            }
2771            ++nextClass;
2772
2773            // then, for each class, elect the first member of that class as that class's
2774
// "representative". For each member of the class, compare it to the "representative."
2775
// If there's a column position where the state being tested transitions to a
2776
// state in a DIFFERENT class from the class where the "representative" transitions,
2777
// then move the state into a new class. Repeat this process until no new classes
2778
// are created.
2779
int currentClass;
2780            int lastClass;
2781            boolean split;
2782
2783            do {
2784//System.out.println("Making a pass...");
2785
currentClass = 1;
2786                lastClass = nextClass;
2787                while (currentClass < nextClass) {
2788//System.out.print("States in class #" + currentClass +":");
2789
split = false;
2790                    state1 = state2 = null;
2791                    for (int i = 0; i < stateClasses.length; i++) {
2792                        if (stateClasses[i] == currentClass) {
2793//System.out.print("\t" + i);
2794
if (state1 == null) {
2795                                state1 = (short[])tempStateTable.elementAt(i);
2796                            }
2797                            else {
2798                                state2 = (short[])tempStateTable.elementAt(i);
2799                                for (int j = 0; j < state2.length; j++)
2800                                    if ((j == numCategories && state1[j] != state2[j] && forward)
2801                                            || (j != numCategories && stateClasses[state1[j]]
2802                                            != stateClasses[state2[j]])) {
2803                                        stateClasses[i] = nextClass;
2804                                        split = true;
2805                                        break;
2806                                    }
2807                            }
2808                        }
2809                    }
2810                    if (split) {
2811                        ++nextClass;
2812                    }
2813                    ++currentClass;
2814//System.out.println();
2815
}
2816            } while (lastClass != nextClass);
2817
2818            // at this point, all of the states in a class except the first one (the
2819
//"representative") can be eliminated, so update the row-number map accordingly
2820
int[] representatives = new int[nextClass];
2821            for (int i = 1; i < stateClasses.length; i++)
2822                if (representatives[stateClasses[i]] == 0) {
2823                    representatives[stateClasses[i]] = i;
2824                }
2825                else {
2826                    rowNumMap[i] = representatives[stateClasses[i]];
2827                }
2828//System.out.println("Renumbering...");
2829

2830            // renumber all remaining rows...
2831
// first drop all that are either unreferenced or not a class representative
2832
for (int i = 1; i < rowNumMap.length; i++) {
2833                if (rowNumMap[i] != i) {
2834                    tempStateTable.setElementAt(null, i);
2835                }
2836            }
2837
2838            // then calculate everybody's new row number and update the row
2839
// number map appropriately (the first pass updates the row numbers
2840
// of all the class representatives [the rows we're keeping], and the
2841
// second pass updates the cross references for all the rows that
2842
// are being deleted)
2843
newRowNum = 1;
2844            for (int i = 1; i < rowNumMap.length; i++) {
2845                if (tempStateTable.elementAt(i) != null) {
2846                    rowNumMap[i] = newRowNum++;
2847                }
2848            }
2849            for (int i = 1; i < rowNumMap.length; i++) {
2850                if (tempStateTable.elementAt(i) == null) {
2851                    rowNumMap[i] = rowNumMap[rowNumMap[i]];
2852                }
2853            }
2854//for (int i = 1; i < rowNumMap.length; i++) rowNumMap[i] = i; int newRowNum = rowNumMap.length;
2855

2856            // allocate the permanent state table, and copy the remaining rows into it
2857
// (adjusting all the cell values, of course)
2858

2859            // this section does that for the forward state table
2860
if (forward) {
2861                endStates = new boolean[newRowNum];
2862                lookaheadStates = new boolean[newRowNum];
2863                stateTable = new short[newRowNum * numCategories];
2864                int p = 0;
2865                int p2 = 0;
2866                for (int i = 0; i < tempStateTable.size(); i++) {
2867                    short[] row = (short[])(tempStateTable.elementAt(i));
2868                    if (row == null) {
2869                        continue;
2870                    }
2871                    for (int j = 0; j < numCategories; j++) {
2872                        stateTable[p] = (short)(rowNumMap[row[j]]);
2873                        ++p;
2874                    }
2875                    endStates[p2] = ((row[numCategories] & END_STATE_FLAG) != 0);
2876                    lookaheadStates[p2] = ((row[numCategories] & LOOKAHEAD_STATE_FLAG) != 0);
2877                    ++p2;
2878                }
2879            }
2880
2881            // and this section does it for the backward state table
2882
else {
2883                backwardsStateTable = new short[newRowNum * numCategories];
2884                int p = 0;
2885                for (int i = 0; i < tempStateTable.size(); i++) {
2886                    short[] row = (short[])(tempStateTable.elementAt(i));
2887                    if (row == null) {
2888                        continue;
2889                    }
2890                    for (int j = 0; j < numCategories; j++) {
2891                        backwardsStateTable[p] = (short)(rowNumMap[row[j]]);
2892                        ++p;
2893                    }
2894                }
2895            }
2896        }
2897
2898        /**
2899         * This function builds the backward state table from the forward state
2900         * table and any additional rules (identified by the ! on the front)
2901         * supplied in the description
2902         */

2903        private void buildBackwardsStateTable(Vector JavaDoc tempRuleList) {
2904
2905            // create the temporary state table and seed it with two rows (row 0
2906
// isn't used for anything, and we have to create row 1 (the initial
2907
// state) before we can do anything else
2908
tempStateTable = new Vector JavaDoc();
2909            tempStateTable.addElement(new short[numCategories + 1]);
2910            tempStateTable.addElement(new short[numCategories + 1]);
2911
2912            // although the backwards state table is built automatically from the forward
2913
// state table, there are some situations (the default sentence-break rules,
2914
// for example) where this doesn't yield enough stop states, causing a dramatic
2915
// drop in performance. To help with these cases, the user may supply
2916
// supplemental rules that are added to the backward state table. These have
2917
// the same syntax as the normal break rules, but begin with '!' to distinguish
2918
// them from normal break rules
2919
for (int i = 0; i < tempRuleList.size(); i++) {
2920                String JavaDoc rule = (String JavaDoc)tempRuleList.elementAt(i);
2921                if (rule.charAt(0) == '!') {
2922                    parseRule(rule.substring(1), false);
2923                }
2924            }
2925            backfillLoopingStates();
2926
2927            // Backwards iteration is qualitatively different from forwards iteration.
2928
// This is because backwards iteration has to be made to operate from no context
2929
// at all-- the user should be able to ask BreakIterator for the break position
2930
// immediately on either side of some arbitrary offset in the text. The
2931
// forward iteration table doesn't let us do that-- it assumes complete
2932
// information on the context, which means starting from the beginning of the
2933
// document.
2934
// The way we do backward and random-access iteration is to back up from the
2935
// current (or user-specified) position until we see something we're sure is
2936
// a break position (it may not be the last break position immediately
2937
// preceding our starting point, however). Then we roll forward from there to
2938
// locate the actual break position we're after.
2939
// This means that the backwards state table doesn't have to identify every
2940
// break position, allowing the building algorithm to be much simpler. Here,
2941
// we use a "pairs" approach, scanning the forward-iteration state table for
2942
// pairs of character categories we ALWAYS break between, and building a state
2943
// table from that information. No context is required-- all this state table
2944
// looks at is a pair of adjacent characters.
2945

2946            // It's possible that the user has supplied supplementary rules (see above).
2947
// This has to be done first to keep parseRule() and friends from becoming
2948
// EVEN MORE complicated. The automatically-generated states are appended
2949
// onto the end of the state table, and then the two sets of rules are
2950
// stitched together at the end. Take note of the row number of the
2951
// first row of the auromatically-generated part.
2952
int backTableOffset = tempStateTable.size();
2953            if (backTableOffset > 2) {
2954                ++backTableOffset;
2955            }
2956
2957            // the automatically-generated part of the table models a two-dimensional
2958
// array where the two dimensions represent the two characters we're currently
2959
// looking at. To model this as a state table, we actually need one additional
2960
// row to represent the initial state. It gets populated with the row numbers
2961
// of the other rows (in order).
2962
for (int i = 0; i < numCategories + 1; i++)
2963                tempStateTable.addElement(new short[numCategories + 1]);
2964
2965            short[] state = (short[])tempStateTable.elementAt(backTableOffset - 1);
2966            for (int i = 0; i < numCategories; i++)
2967                state[i] = (short)(i + backTableOffset);
2968
2969            // scavenge the forward state table for pairs of character categories
2970
// that always have a break between them. The algorithm is as follows:
2971
// Look down each column in the state table. For each nonzero cell in
2972
// that column, look up the row it points to. For each nonzero cell in
2973
// that row, populate a cell in the backwards state table: the row number
2974
// of that cell is the number of the column we were scanning (plus the
2975
// offset that locates this sub-table), and the column number of that cell
2976
// is the column number of the nonzero cell we just found. This cell is
2977
// populated with its own column number (adjusted according to the actual
2978
// location of the sub-table). This process will produce a state table
2979
// whose behavior is the same as looking up successive pairs of characters
2980
// in an array of Booleans to determine whether there is a break.
2981
int numRows = stateTable.length / numCategories;
2982            for (int column = 0; column < numCategories; column++) {
2983                for (int row = 0; row < numRows; row++) {
2984                    int nextRow = lookupState(row, column);
2985                    if (nextRow != 0) {
2986                        for (int nextColumn = 0; nextColumn < numCategories; nextColumn++) {
2987                            int cellValue = lookupState(nextRow, nextColumn);
2988                            if (cellValue != 0) {
2989                                state = (short[])tempStateTable.elementAt(nextColumn +
2990                                                backTableOffset);
2991                                state[column] = (short)(column + backTableOffset);
2992                            }
2993                        }
2994                    }
2995                }
2996            }
2997
2998//debugPrintTempStateTable();
2999
// if the user specified some backward-iteration rules with the ! token,
3000
// we have to merge the resulting state table with the auto-generated one
3001
// above. First copy the populated cells from row 1 over the populated
3002
// cells in the auto-generated table. Then copy values from row 1 of the
3003
// auto-generated table into all of the the unpopulated cells of the
3004
// rule-based table.
3005
if (backTableOffset > 1) {
3006
3007                // for every row in the auto-generated sub-table, if a cell is
3008
// populated that is also populated in row 1 of the rule-based
3009
// sub-table, copy the value from row 1 over the value in the
3010
// auto-generated sub-table
3011
state = (short[])tempStateTable.elementAt(1);
3012                for (int i = backTableOffset - 1; i < tempStateTable.size(); i++) {
3013                    short[] state2 = (short[])tempStateTable.elementAt(i);
3014                    for (int j = 0; j < numCategories; j++) {
3015                        if (state[j] != 0 && state2[j] != 0) {
3016                            state2[j] = state[j];
3017                        }
3018                    }
3019                }
3020
3021                // now, for every row in the rule-based sub-table that is not
3022
// an end state, fill in all unpopulated cells with the values
3023
// of the corresponding cells in the first row of the auto-
3024
// generated sub-table.
3025
state = (short[])tempStateTable.elementAt(backTableOffset - 1);
3026                for (int i = 1; i < backTableOffset - 1; i++) {
3027                    short[] state2 = (short[])tempStateTable.elementAt(i);
3028                    if ((state2[numCategories] & END_STATE_FLAG) == 0) {
3029                        for (int j = 0; j < numCategories; j++) {
3030                            if (state2[j] == 0) {
3031                                state2[j] = state[j];
3032                            }
3033                        }
3034                    }
3035                }
3036            }
3037
3038//debugPrintTempStateTable();
3039

3040            // finally, clean everything up and copy it into the actual BreakIterator
3041
// by calling finishBuildingStateTable()
3042
finishBuildingStateTable(false);
3043/*
3044System.out.print("C:\t");
3045for (int i = 0; i < numCategories; i++)
3046System.out.print(Integer.toString(i) + "\t");
3047System.out.println(); System.out.print("=================================================");
3048for (int i = 0; i < backwardsStateTable.length; i++) {
3049if (i % numCategories == 0) {
3050System.out.println();
3051System.out.print(Integer.toString(i / numCategories) + ":\t");
3052}
3053if (backwardsStateTable[i] == 0) System.out.print(".\t"); else System.out.print(Integer.toString(backwardsStateTable[i]) + "\t");
3054}
3055System.out.println();
3056*/

3057        }
3058
3059        /**
3060         * Throws an IllegalArgumentException representing a syntax error in the rule
3061         * description. The exception's message contains some debugging information.
3062         * @param message A message describing the problem
3063         * @param position The position in the description where the problem was
3064         * discovered
3065         * @param context The string containing the error
3066         * @internal
3067         */

3068        protected void error(String JavaDoc message, int position, String JavaDoc context) {
3069            throw new IllegalArgumentException JavaDoc("Parse error: " + message + "\n" +
3070                    Utility.escape(context.substring(0, position)) + "\n\n" +
3071                    Utility.escape(context.substring(position)));
3072        }
3073
3074    ///CLOVER:OFF
3075
/**
3076         * @internal
3077         */

3078        protected void debugPrintVector(String JavaDoc label, Vector JavaDoc v) {
3079            System.out.print(label);
3080            for (int i = 0; i < v.size(); i++)
3081                System.out.print(v.elementAt(i).toString() + "\t");
3082            System.out.println();
3083        }
3084
3085        /**
3086         * @internal
3087         */

3088        protected void debugPrintVectorOfVectors(String JavaDoc label1, String JavaDoc label2, Vector JavaDoc v) {
3089            System.out.println(label1);
3090            for (int i = 0; i < v.size(); i++)
3091                debugPrintVector(label2, (Vector JavaDoc)v.elementAt(i));
3092        }
3093
3094        /**
3095         * @internal
3096         */

3097        protected void debugPrintTempStateTable() {
3098            System.out.println(" tempStateTable:");
3099            System.out.print(" C:\t");
3100            for (int i = 0; i <= numCategories; i++)
3101                System.out.print(Integer.toString(i) + "\t");
3102            System.out.println();
3103            for (int i = 1; i < tempStateTable.size(); i++) {
3104                short[] row = (short[])(tempStateTable.elementAt(i));
3105                System.out.print(" " + i + ":\t");
3106                for (int j = 0; j < row.length; j++) {
3107                    if (row[j] == 0) {
3108                        System.out.print(".\t");
3109                    }
3110                    else {
3111                        System.out.print(Integer.toString(row[j]) + "\t");
3112                    }
3113                }
3114                System.out.println();
3115            }
3116        }
3117
3118    }
3119    ///CLOVER:ON
3120

3121    /*
3122     * This class exists to work around a bug in HotJava's implementation
3123     * of CharacterIterator, which incorrectly handles setIndex(endIndex).
3124     * This iterator relies only on base.setIndex(n) where n is less than
3125     * endIndex.
3126     *
3127     * One caveat: if the base iterator's begin and end indices change
3128     * the change will not be reflected by this wrapper. Does that matter?
3129     */

3130    ///CLOVER:OFF
3131
// Only used for HotJava, so clover won't encounter it
3132
private static final class SafeCharIterator implements CharacterIterator JavaDoc,
3133                                                           Cloneable JavaDoc {
3134
3135        private CharacterIterator JavaDoc base;
3136        private int rangeStart;
3137        private int rangeLimit;
3138        private int currentIndex;
3139
3140        SafeCharIterator(CharacterIterator JavaDoc base) {
3141            this.base = base;
3142            this.rangeStart = base.getBeginIndex();
3143            this.rangeLimit = base.getEndIndex();
3144            this.currentIndex = base.getIndex();
3145        }
3146
3147        public char first() {
3148            return setIndex(rangeStart);
3149        }
3150
3151        public char last() {
3152            return setIndex(rangeLimit - 1);
3153        }
3154
3155        public char current() {
3156            if (currentIndex < rangeStart || currentIndex >= rangeLimit) {
3157                return DONE;
3158            }
3159            else {
3160                return base.setIndex(currentIndex);
3161            }
3162        }
3163
3164        public char next() {
3165
3166            currentIndex++;
3167            if (currentIndex >= rangeLimit) {
3168                currentIndex = rangeLimit;
3169                return DONE;
3170            }
3171            else {
3172                return base.setIndex(currentIndex);
3173            }
3174        }
3175
3176        public char previous() {
3177
3178            currentIndex--;
3179            if (currentIndex < rangeStart) {
3180                currentIndex = rangeStart;
3181                return DONE;
3182            }
3183            else {
3184                return base.setIndex(currentIndex);
3185            }
3186        }
3187
3188        public char setIndex(int i) {
3189
3190            if (i < rangeStart || i > rangeLimit) {
3191                throw new IllegalArgumentException JavaDoc("Invalid position");
3192            }
3193            currentIndex = i;
3194            return current();
3195        }
3196
3197        public int getBeginIndex() {
3198            return rangeStart;
3199        }
3200
3201        public int getEndIndex() {
3202            return rangeLimit;
3203        }
3204
3205        public int getIndex() {
3206            return currentIndex;
3207        }
3208
3209        public Object JavaDoc clone() {
3210
3211            SafeCharIterator copy = null;
3212            try {
3213                copy = (SafeCharIterator) super.clone();
3214            }
3215            catch(CloneNotSupportedException JavaDoc e) {
3216                throw new Error JavaDoc("Clone not supported: " + e);
3217            }
3218
3219            CharacterIterator JavaDoc copyOfBase = (CharacterIterator JavaDoc) base.clone();
3220            copy.base = copyOfBase;
3221            return copy;
3222        }
3223    }
3224    ///CLOVER:ON
3225

3226    ///CLOVER:OFF
3227
/**
3228     * @internal
3229     */

3230    public static void debugPrintln(String JavaDoc s) {
3231        final String JavaDoc zeros = "0000";
3232        String JavaDoc temp;
3233        StringBuffer JavaDoc out = new StringBuffer JavaDoc();
3234        for (int i = 0; i < s.length(); i++) {
3235            char c = s.charAt(i);
3236            if (c >= ' ' && c < '\u007f') {
3237                out.append(c);
3238            }
3239            else {
3240                out.append("\\u");
3241                temp = Integer.toHexString((int)c);
3242                out.append(zeros.substring(0, 4 - temp.length()));
3243                out.append(temp);
3244            }
3245        }
3246        System.out.println(out);
3247    }
3248    ///CLOVER:ON
3249
}
3250
3251
Popular Tags