RuleBasedBreakIterator_Old


1   /*
2    *******************************************************************************
3    * Copyright (C) 1996-2006, International Business Machines Corporation and    *
4    * others. All Rights Reserved.                                                *
5    *******************************************************************************
6    */
7   
8   
9   /*
10   * @(#)RuleBasedBreakIterator.java    1.3 99/04/07
11   *
12   */
13  
14  package com.ibm.icu.text;
15  
16  import com.ibm.icu.util.CompactByteArray;
17  import com.ibm.icu.impl.Utility;
18  import java.util.Vector  ;
19  import java.util.Stack  ;
20  import java.util.Hashtable  ;
21  import java.util.Enumeration  ;
22  import java.text.CharacterIterator  ;
23  import java.text.StringCharacterIterator  ;
24  
25  import java.io.*;
26  
27  /**
28   * <p>A subclass of BreakIterator whose behavior is specified using a list of rules.</p>
29   *
30   * <p>There are two kinds of rules, which are separated by semicolons: <i>substitutions</i>
31   * and <i>regular expressions.</i></p>
32   *
33   * <p>A substitution rule defines a name that can be used in place of an expression. It
34   * consists of a name, an equals sign, and an expression. (There can be no whitespace on
35   * either side of the equals sign.)  To keep its syntactic meaning intact, the expression
36   * must be enclosed in parentheses or square brackets. A substitution is visible after its
37   * definition, and is filled in using simple textual substitution (when a substitution is
38   * used, its name is enclosed in curly braces.  The curly braces are optional in the
39   * substition's definition). Substitution definitions can contain other substitutions, as
40   * long as those substitutions have been defined first. Substitutions are generally used to
41   * make the regular expressions (which can get quite complex) shorter and easier to read.
42   * They typically define either character categories or commonly-used subexpressions.</p>
43   *
44   * <p>There is one special substitution.&nbsp; If the description defines a substitution
45   * called &quot;_ignore_&quot;, the expression must be a [] expression, and the
46   * expression defines a set of characters (the &quot;<em>ignore characters</em>&quot;) that
47   * will be transparent to the BreakIterator.&nbsp; A sequence of characters will break the
48   * same way it would if any ignore characters it contains are taken out.&nbsp; Break
49   * positions never occur before ignore characters, except when the character before the
50   * ignore characters is a line or paragraph terminator.</p>
51   *
52   * <p>A regular expression uses a syntax similar to the normal Unix regular-expression
53   * syntax, and defines a sequence of characters to be kept together. With one significant
54   * exception, the iterator uses a longest-possible-match algorithm when matching text to regular
55   * expressions. The iterator also treats descriptions containing multiple regular expressions
56   * as if they were ORed together (i.e., as if they were separated by |).</p>
57   *
58   * <p>The special characters recognized by the regular-expression parser are as follows:</p>
59   *
60   * <blockquote>
61   *   <table border="1" width="100%">
62   *     <tr>
63   *       <td width="6%">*</td>
64   *       <td width="94%">Specifies that the expression preceding the asterisk may occur any number
65   *       of times (including not at all).</td>
66   *     </tr>
67   *     <tr>
68   *       <td width="6%">+</td>
69   *       <td width="94%">Specifies that the expression preceding the asterisk may occur one or
70   *       more times, but must occur at least once.</td>
71   *     </tr>
72   *     <tr>
73   *       <td width="6%">?</td>
74   *       <td width="94%">Specifies that the expression preceding the asterisk may occur once
75   *       or not at all (i.e., it makes the preceding expression optional).</td>
76   *     </tr>
77   *     <tr>
78   *       <td width="6%">()</td>
79   *       <td width="94%">Encloses a sequence of characters.  If followed by * or +, the
80   *       sequence repeats.  If followed by ?, the sequence is optional.  Otherwise, the
81   *       parentheses are just a grouping device and a way to delimit the ends of expressions
82   *       containing |.</td>
83   *     </tr>
84   *     <tr>
85   *       <td width="6%">|</td>
86   *       <td width="94%">Separates two alternative sequences of characters.&nbsp; Either one
87   *       sequence or the other, but not both, matches this expression.&nbsp; The | character can
88   *       only occur inside ().</td>
89   *     </tr>
90   *     <tr>
91   *       <td width="6%">.</td>
92   *       <td width="94%">Matches any character.</td>
93   *     </tr>
94   *     <tr>
95   *       <td width="6%">*?</td>
96   *       <td width="94%">Specifies a non-greedy asterisk.&nbsp; *? works the same way as *, except
97   *       when there is overlap between the last group of characters in the expression preceding the
98   *       * and the first group of characters following the *.&nbsp; When there is this kind of
99   *       overlap, * will match the longest sequence of characters that match the expression before
100  *       the *, and *? will match the shortest sequence of characters matching the expression
101  *       before the *?.&nbsp; For example, if you have &quot;xxyxyyyxyxyxxyxyxyy&quot; in the text,
102  *       &quot;x[xy]*x&quot; will match through to the last x (i.e., &quot;<strong>xxyxyyyxyxyxxyxyx</strong>yy&quot;,
103  *       but &quot;x[xy]*?x&quot; will only match the first two xes (&quot;<strong>xx</strong>yxyyyxyxyxxyxyxyy&quot;).</td>
104  *     </tr>
105  *     <tr>
106  *       <td width="6%">[]</td>
107  *       <td width="94%">Specifies a group of alternative characters.&nbsp; A [] expression will
108  *       match any single character that is specified in the [] expression.&nbsp; For more on the
109  *       syntax of [] expressions, see below.</td>
110  *     </tr>
111  *     <tr>
112  *       <td width="6%">/</td>
113  *       <td width="94%">Specifies where the break position should go if text matches this
114  *       expression.&nbsp; (e.g., &quot;[a-z]&#42;/[:Zs:]*[1-0]&quot; will match if the iterator sees a run
115  *       of letters, followed by a run of whitespace, followed by a digit, but the break position
116  *       will actually go before the whitespace).&nbsp; Expressions that don't contain / put the
117  *       break position at the end of the matching text.</td>
118  *     </tr>
119  *     <tr>
120  *       <td width="6%">\</td>
121  *       <td width="94%">Escape character.&nbsp; The \ itself is ignored, but causes the next
122  *       character to be treated as literal character.&nbsp; This has no effect for many
123  *       characters, but for the characters listed above, this deprives them of their special
124  *       meaning.&nbsp; (There are no special escape sequences for Unicode characters, or tabs and
125  *       newlines; these are all handled by a higher-level protocol.&nbsp; In a Java string,
126  *       &quot;\n&quot; will be converted to a literal newline character by the time the
127  *       regular-expression parser sees it.&nbsp; Of course, this means that \ sequences that are
128  *       visible to the regexp parser must be written as \\ when inside a Java string.)&nbsp; All
129  *       characters in the ASCII range except for letters, digits, and control characters are
130  *       reserved characters to the parser and must be preceded by \ even if they currently don't
131  *       mean anything.</td>
132  *     </tr>
133  *     <tr>
134  *       <td width="6%">!</td>
135  *       <td width="94%">If ! appears at the beginning of a regular expression, it tells the regexp
136  *       parser that this expression specifies the backwards-iteration behavior of the iterator,
137  *       and not its normal iteration behavior.&nbsp; This is generally only used in situations
138  *       where the automatically-generated backwards-iteration behavior doesn't produce
139  *       satisfactory results and must be supplemented with extra client-specified rules.</td>
140  *     </tr>
141  *     <tr>
142  *       <td width="6%"><em>(all others)</em></td>
143  *       <td width="94%">All other characters are treated as literal characters, which must match
144  *       the corresponding character(s) in the text exactly.</td>
145  *     </tr>
146  *   </table>
147  * </blockquote>
148  *
149  * <p>Within a [] expression, a number of other special characters can be used to specify
150  * groups of characters:</p>
151  *
152  * <blockquote>
153  *   <table border="1" width="100%">
154  *     <tr>
155  *       <td width="6%">-</td>
156  *       <td width="94%">Specifies a range of matching characters.&nbsp; For example
157  *       &quot;[a-p]&quot; matches all lowercase Latin letters from a to p (inclusive).&nbsp; The -
158  *       sign specifies ranges of continuous Unicode numeric values, not ranges of characters in a
159  *       language's alphabetical order: &quot;[a-z]&quot; doesn't include capital letters, nor does
160  *       it include accented letters such as a-umlaut.</td>
161  *     </tr>
162  *     <tr>
163  *       <td width="6%">^</td>
164  *       <td width="94%">Inverts the expression.  All characters the expression includes are
165  *       excluded, and vice versa.  (i.e., it has the effect of saying "all Unicode characters
166  *       except...")  This character only has its special meaning when it's the first character
167  *       in the [] expression.  (Generally, you only see the ^ character inside a nested []
168  *       expression used in conjunction with the syntax below.)</td>
169  *     </tr>
170  *     <tr>
171  *       <td width="6%"><em>(all others)</em></td>
172  *       <td width="94%">All other characters are treated as literal characters.&nbsp; (For
173  *       example, &quot;[aeiou]&quot; specifies just the letters a, e, i, o, and u.)</td>
174  *     </tr>
175  *   </table>
176  * </blockquote>
177  *
178  * <p>[] expressions can nest.  There are some other characters that have special meaning only
179  * when used in conjunction with nester [] expressions:</p>
180  *
181  * <blockquote>
182  *   <table border="1" width="100%">
183  *     <tr>
184  *       <td width="6%">::</td>
185  *       <td width="94%">Within a nested [] expression, a pair of colons containing a one- or
186  *       two-letter code matches all characters in the corresponding Unicode category.&nbsp;
187  *       The :: expression has to be the only thing inside the [] expression. The two-letter codes
188  *       are the same as the two-letter codes in the Unicode database (for example,
189  *       &quot;[[:Sc:][:Sm:]]&quot; matches all currency symbols and all math symbols).&nbsp;
190  *       Specifying a one-letter code is the same as specifying all two-letter codes that begin
191  *       with that letter (for example, &quot;[[:L:]]&quot; matches all letters, and is equivalent
192  *       to &quot;[[:Lu:][:Ll:][:Lo:][:Lm:][:Lt:]]&quot;).&nbsp; Anything other than a valid
193  *       two-letter Unicode category code or a single letter that begins a valide Unicode category
194  *       code is illegal within the colons.</td>
195  *     </tr>
196  *     <tr>
197  *       <td width="6%">|</td>
198  *       <td width="94%">Two nested [] expressions juxtaposed or separated only by a | character
199  *       are merged together into a single [] expression matching all the characters in either
200  *       of the original [] expressions.  (e.g., "[[ab][bc]]" is equivalent to "[abc]", and so
201  *       is "[[ab]|[bc]]". <b>NOTE:</b>  "[ab][bc]" is NOT the same thing as "[[ab][bc]]".
202  *       The first expression will match two characters: an a or b followed by either another
203  *       b or a c.  The second expression will match a single character, which may be a, b, or c.
204  *       The nesting is <em>required</em> for the expressions to merge together.</td>
205  *     </tr>
206  *     <tr>
207  *       <td width="6%">&</td>
208  *       <td width="94%">Two nested [] expressions with only & between them will match any
209  *       character that appears in both nested [] expressions (this is a set intersection).
210  *       (e.g., "[[ab]&[bc]]" will only match the letter b.)</td>
211  *     </tr>
212  *     <tr>
213  *       <td width="6%">-</td>
214  *       <td width="94%">Two nested [] expressions with - between them will match any
215  *       character that appears in the first nested [] expression <em>but not</em> the
216  *       second one (this is an asymmetrical set difference).  (e.g., "[[:Sc:]-[$]]"
217  *       matches any currency symbol except the dollar sign.  "[[ab]-[bc]] will match
218  *       only the letter a.  This has exactly the same effect as "[[ab]&[^bc]]".)</td>
219  *     </tr>
220  *
221  * <p>For a more complete explanation, see <a
222  * HREF="http://icu.sourceforge.net/docs/papers/text_boundary_analysis_in_java/">http://icu.sourceforge.net/docs/papers/text_boundary_analysis_in_java/</a>.
223  * &nbsp; For examples, see the resource data (which is annotated).</p>
224  *
225  * @author Richard Gillam
226  * @internal ICU 2.0
227  */
228 public class RuleBasedBreakIterator_Old extends RuleBasedBreakIterator {
229 
230     /**
231      * A token used as a character-category value to identify ignore characters
232      * @stable ICU 2.0
233      */
234     protected static final byte IGNORE = -1;
235 
236     /**
237      * Special variable used to define ignore characters
238      * @stable ICU 2.0
239      */
240     private static final String   IGNORE_VAR = "_ignore_";
241 
242     /**
243      * The state number of the starting state
244      */
245     private static final short START_STATE = 1;
246 
247     /**
248      * The state-transition value indicating "stop"
249      */
250     private static final short STOP_STATE = 0;
251 
252     /**
253      * The textual description this iterator was created from
254      */
255     private String   description;
256 
257     /**
258      * A table that indexes from character values to character category numbers
259      */
260     private CompactByteArray charCategoryTable = null;
261 
262     /**
263      * The table of state transitions used for forward iteration
264      */
265     private short[] stateTable = null;
266 
267     /**
268      * The table of state transitions used to sync up the iterator with the
269      * text in backwards and random-access iteration
270      */
271     private short[] backwardsStateTable = null;
272 
273     /**
274      * A list of flags indicating which states in the state table are accepting
275      * ("end") states
276      */
277     private boolean[] endStates = null;
278 
279     /**
280      * A list of flags indicating which states in the state table are
281      * lookahead states (states which turn lookahead on and off)
282      */
283     private boolean[] lookaheadStates = null;
284 
285     /**
286      * The number of character categories (and, thus, the number of columns in
287      * the state tables)
288      */
289     private int numCategories;
290 
291     /**
292      * The character iterator through which this BreakIterator accesses the text
293      */
294     private CharacterIterator   text = null;
295 
296     //=======================================================================
297     // constructors
298     //=======================================================================
299 
300     /**
301      * Constructs a RuleBasedBreakIterator_Old according to the description
302      * provided.  If the description is malformed, throws an
303      * IllegalArgumentException.  Normally, instead of constructing a
304      * RuleBasedBreakIterator_Old directory, you'll use the factory methods
305      * on BreakIterator to create one indirectly from a description
306      * in the framework's resource files.  You'd use this when you want
307      * special behavior not provided by the built-in iterators.
308      * @stable ICU 2.0
309      */
310     public RuleBasedBreakIterator_Old(String   description) {
311 //System.out.println(">>>RBBI constructor");
312         this.description = description;
313 
314         // the actual work is done by the Builder class
315         Builder builder = makeBuilder();
316         builder.buildBreakIterator();
317 //System.out.println("<<<RBBI constructor");
318     }
319 
320     /**
321      * Creates a Builder.
322      * @stable ICU 2.0
323      */
324     protected Builder makeBuilder() {
325         return new Builder();
326     }
327 
328     //=======================================================================
329     // boilerplate
330     //=======================================================================
331     /**
332      * Clones this iterator.
333      * @return A newly-constructed RuleBasedBreakIterator_Old with the same
334      * behavior as this one.
335      * @stable ICU 2.0
336      */
337     public Object   clone()
338     {
339         RuleBasedBreakIterator_Old result = (RuleBasedBreakIterator_Old) super.clone();
340         if (text != null) {
341             result.text = (CharacterIterator  ) text.clone();
342         }
343         return result;
344     }
345 
346     /**
347      * Returns true if both BreakIterators are of the same class, have the same
348      * rules, and iterate over the same text.
349      * @stable ICU 2.0
350      */
351     public boolean equals(Object   that) {
352         try {
353             RuleBasedBreakIterator_Old other = (RuleBasedBreakIterator_Old) that;
354             if (!description.equals(other.description)) {
355                 return false;
356             }
357             return getText().equals(other.getText());
358         }
359         catch(ClassCastException   e) {
360             return false;
361         }
362     }
363 
364     /**
365      * Returns the description used to create this iterator
366      * @stable ICU 2.0
367      */
368     public String   toString() {
369         return description;
370     }
371 
372     /**
373      * Compute a hashcode for this BreakIterator
374      * @return A hash code
375      * @stable ICU 2.0
376      */
377     public int hashCode()
378     {
379         return description.hashCode();
380     }
381 
382 /**
383  *   Dump out a more-or-less human readable form of the
384  *   complete state table and character class definitions
385  *   @internal
386  */
387     ///CLOVER:OFF
388 public void debugDumpTables() {
389     System.out.println("Character Classes:");
390     int currentCharClass = 257;
391     int startCurrentRange = 0;
392     int initialStringLength = 0;
393 
394     StringBuffer  [] charClassRanges = new StringBuffer  [numCategories];
395     for (int i=0; i<numCategories; i++) {
396         charClassRanges[i] = new StringBuffer  ();
397     }
398 
399     for (int i = 0; i < 0xffff; i++) {
400         if ((int)charCategoryTable.elementAt((char)i) != currentCharClass) {
401             if (currentCharClass != 257) {
402                 // Complete the output of the previous range.
403                 if (i != startCurrentRange+1) {
404                     charClassRanges[currentCharClass].append("-"+ Integer.toHexString(i-1));
405                 }
406                 if (charClassRanges[currentCharClass].length() % 72 < initialStringLength % 72) {
407                     charClassRanges[currentCharClass].append("\n     ");
408                 }
409             }
410 
411             // Output the start of the new range.
412             currentCharClass = (int)charCategoryTable.elementAt((char)i);
413             startCurrentRange = i;
414             initialStringLength = charClassRanges[currentCharClass].length();
415             if (charClassRanges[currentCharClass].length() > 0)
416                 charClassRanges[currentCharClass].append(", ");
417             charClassRanges[currentCharClass].append(Integer.toHexString(i));
418         }
419     }
420 
421     for (int i=0; i<numCategories; i++) {
422         System.out.println(i + ":     " + charClassRanges[i]);
423     }
424 
425 
426     System.out.println("\n\nState Table.   *: end state     %: look ahead state");
427     System.out.print("C:\t");
428     for (int i = 0; i < numCategories; i++)
429         System.out.print(Integer.toString(i) + "\t");
430     System.out.println(); System.out.print("=================================================");
431     for (int i = 0; i < stateTable.length; i++) {
432         if (i % numCategories == 0) {
433             System.out.println();
434             if (endStates[i / numCategories])
435                 System.out.print("*");
436             else
437                 System.out.print(" ");
438             if (lookaheadStates[i / numCategories]) {
439                 System.out.print("%");
440             }
441             else
442                 System.out.print(" ");
443             System.out.print(Integer.toString(i / numCategories) + ":\t");
444         }
445         if (stateTable[i] == 0) {
446             System.out.print(".\t");
447         } else {
448             System.out.print(Integer.toString(stateTable[i]) + "\t");
449         }
450     }
451     System.out.println();
452 }
453     ///CLOVER:ON
454 
455     ///CLOVER:OFF
456 // DELETE ME BEFORE RELEASE!!!
457     /**
458      * Write the RBBI runtime engine state transition tables to a file.
459      *  Formerly used to export the tables to the C++ RBBI Implementation.
460      *  Now obsolete, as C++ builds its own tables.
461      * @internal
462      */
463 public void writeTablesToFile(FileOutputStream file, boolean littleEndian) throws IOException {
464     // NOTE: The format being written here is designed to be compatible with
465     // the ICU udata interfaces and may not be useful for much else
466     DataOutputStream out = new DataOutputStream(file);
467     
468 //    --- write the file header ---
469     byte[] comment = "Copyright (C) 1999, International Business Machines Corp. and others. All Rights Reserved.".getBytes("US-ASCII");
470 //    write the size of the header (rounded up to the next 16-byte boundary)
471     short headerSize = (short)(comment.length + 1 // length of comment
472             + 24); // size of static header data
473     short realHeaderSize = (short)(headerSize + ((headerSize % 16 == 0) ? 0 : 16 - (headerSize % 16)));
474     writeSwappedShort(realHeaderSize, out, littleEndian);
475 //    write magic byte values
476     out.write(0xda);
477     out.write(0x27);
478 //    write size of core header data
479     writeSwappedShort((short)20, out, littleEndian);
480 //    write reserved bytes
481     writeSwappedShort((short)0, out, littleEndian);
482     
483 //    write flag indicating whether we're big-endian
484     if (littleEndian) {
485         out.write(0);
486     } else {
487         out.write(1);
488     }
489     
490 //    write character set family code (0 means ASCII)
491     out.write(0);
492 //    write size of UChar in this file
493     out.write(2);
494 //    write reserved byte
495     out.write(0);
496 //    write data format identifier (this is an array of bytes in ICU, so the value is NOT swapped!)
497     out.writeInt(0x42524b53);   // ("BRKS")
498 //    write file format version number (NOT swapped!)
499     out.writeInt(0);
500 //    write data version number (NOT swapped!)
501     out.writeInt(0);
502 //    write copyright notice
503     out.write(comment);
504     out.write(0);
505 //    fill in padding bytes
506     while (headerSize < realHeaderSize) {
507         out.write(0);
508         ++headerSize;
509     }
510     
511 //    --- write index to the file ---
512 //    write the number of columns in the state table
513     writeSwappedInt(numCategories, out, littleEndian);
514     int fileEnd = 36;
515 //    write the location in the file of the BreakIterator description string
516     writeSwappedInt(fileEnd, out, littleEndian);
517     fileEnd += (description.length() + 1) * 2;
518     fileEnd += (fileEnd % 4 == 0) ? 0 : 4 - (fileEnd % 4);
519 //    write the location of the character category table's index
520     writeSwappedInt(fileEnd, out, littleEndian);
521     fileEnd += charCategoryTable.getIndexArray().length * 2;
522     fileEnd += (fileEnd % 4 == 0) ? 0 : 4 - (fileEnd % 4);
523 //    write the location of the character category table's values array
524     writeSwappedInt(fileEnd, out, littleEndian);
525     fileEnd += charCategoryTable.getValueArray().length;
526     fileEnd += (fileEnd % 4 == 0) ? 0 : 4 - (fileEnd % 4);
527 //    write the location of the forward state table
528     writeSwappedInt(fileEnd, out, littleEndian);
529     fileEnd += stateTable.length * 2;
530     fileEnd += (fileEnd % 4 == 0) ? 0 : 4 - (fileEnd % 4);
531 //    write the location of the backward state table
532     writeSwappedInt(fileEnd, out, littleEndian);
533     fileEnd += backwardsStateTable.length * 2;
534     fileEnd += (fileEnd % 4 == 0) ? 0 : 4 - (fileEnd % 4);
535 //    write the location of the endStates flags
536     writeSwappedInt(fileEnd, out, littleEndian);
537     fileEnd += endStates.length;
538     fileEnd += (fileEnd % 4 == 0) ? 0 : 4 - (fileEnd % 4);
539 //    write the location of the lookaheadStates flags
540     writeSwappedInt(fileEnd, out, littleEndian);
541     fileEnd += lookaheadStates.length;
542     fileEnd += (fileEnd % 4 == 0) ? 0 : 4 - (fileEnd % 4);
543 //    write the length of the file
544     writeSwappedInt(fileEnd, out, littleEndian);
545     
546 //    --- write the actual data ---
547 //    write description string
548     for (int i = 0; i < description.length(); i++)
549         writeSwappedShort((short)description.charAt(i), out, littleEndian);
550     out.writeShort(0);
551     if ((description.length() + 1) % 2 == 1)
552         out.writeShort(0);
553 //    write character category table
554     char[] temp1 = charCategoryTable.getIndexArray();
555     for (int i = 0; i < temp1.length; i++)
556         writeSwappedShort((short)temp1[i], out, littleEndian);
557     if (temp1.length % 2 == 1)
558         out.writeShort(0);
559     byte[] temp2 = charCategoryTable.getValueArray();
560     out.write(temp2);
561     switch (temp2.length % 4) {
562         case 1: out.write(0);
563         case 2: out.write(0);
564         case 3: out.write(0);
565         default: break;
566     }
567 //    write the state transition tables
568     for (int i = 0; i < stateTable.length; i++)
569         writeSwappedShort(stateTable[i], out, littleEndian);
570     if (stateTable.length % 2 == 1)
571         out.writeShort(0);
572     for (int i = 0; i < backwardsStateTable.length; i++)
573         writeSwappedShort(backwardsStateTable[i], out, littleEndian);
574     if (backwardsStateTable.length % 2 == 1)
575         out.writeShort(0);
576 //    write the flag arrays
577     for (int i = 0; i < endStates.length; i++)
578         out.writeBoolean(endStates[i]);
579     switch (endStates.length % 4) {
580         case 1: out.write(0);
581         case 2: out.write(0);
582         case 3: out.write(0);
583         default: break;
584     }
585     for (int i = 0; i < lookaheadStates.length; i++)
586         out.writeBoolean(lookaheadStates[i]);
587     switch (lookaheadStates.length % 4) {
588         case 1: out.write(0);
589         case 2: out.write(0);
590         case 3: out.write(0);
591         default: break;
592     }
593 }
594 
595 /**
596  * @internal
597  */
598 protected void writeSwappedShort(short x, DataOutputStream out, boolean littleEndian)
599 throws IOException{
600     if (littleEndian) {
601         out.write((byte)(x & 0xff));
602         out.write((byte)((x >> 8) & 0xff));
603     }
604     else {
605         out.write((byte)((x >> 8) & 0xff));
606         out.write((byte)(x & 0xff));
607     }
608 }
609 
610 /**
611  * @internal
612  */
613 protected void writeSwappedInt(int x, DataOutputStream out, boolean littleEndian)
614 throws IOException {
615     if (littleEndian) {
616         out.write((byte)(x & 0xff));
617         out.write((byte)((x >> 8) & 0xff));
618         out.write((byte)((x >> 16) & 0xff));
619         out.write((byte)((x >> 24) & 0xff));
620     }
621     else {
622         out.write((byte)((x >> 24) & 0xff));
623         out.write((byte)((x >> 16) & 0xff));
624         out.write((byte)((x >> 8) & 0xff));
625         out.write((byte)(x & 0xff));
626     }
627 }
628     ///CLOVER:ON
629 
630     //=======================================================================
631     // BreakIterator overrides
632     //=======================================================================
633 
634     /**
635      * Sets the current iteration position to the beginning of the text.
636      * (i.e., the CharacterIterator's starting offset).
637      * @return The offset of the beginning of the text.
638      * @stable ICU 2.0
639      */
640     public int first() {
641         CharacterIterator   t = getText();
642 
643         t.first();
644         return t.getIndex();
645     }
646 
647     /**
648      * Sets the current iteration position to the end of the text.
649      * (i.e., the CharacterIterator's ending offset).
650      * @return The text's past-the-end offset.
651      * @stable ICU 2.0
652      */
653     public int last() {
654         CharacterIterator   t = getText();
655 
656         // I'm not sure why, but t.last() returns the offset of the last character,
657         // rather than the past-the-end offset
658         t.setIndex(t.getEndIndex());
659         return t.getIndex();
660     }
661 
662     /**
663      * Advances the iterator either forward or backward the specified number of steps.
664      * Negative values move backward, and positive values move forward.  This is
665      * equivalent to repeatedly calling next() or previous().
666      * @param n The number of steps to move.  The sign indicates the direction
667      * (negative is backwards, and positive is forwards).
668      * @return The character offset of the boundary position n boundaries away from
669      * the current one.
670      * @stable ICU 2.0
671      */
672     public int next(int n) {
673         int result = current();
674         while (n > 0) {
675             result = handleNext();
676             --n;
677         }
678         while (n < 0) {
679             result = previous();
680             ++n;
681         }
682         return result;
683     }
684 
685     /**
686      * Advances the iterator to the next boundary position.
687      * @return The position of the first boundary after this one.
688      * @stable ICU 2.0
689      */
690     public int next() {
691         return handleNext();
692     }
693 
694     /**
695      * Advances the iterator backwards, to the last boundary preceding this one.
696      * @return The position of the last boundary position preceding this one.
697      * @stable ICU 2.0
698      */
699     public int previous() {
700         // if we're already sitting at the beginning of the text, return DONE
701         CharacterIterator   text = getText();
702         if (current() == text.getBeginIndex()) {
703             return BreakIterator.DONE;
704         }
705 
706         // set things up.  handlePrevious() will back us up to some valid
707         // break position before the current position (we back our internal
708         // iterator up one step to prevent handlePrevious() from returning
709         // the current position), but not necessarily the last one before
710         // where we started
711         int start = current();
712         text.previous();
713         int lastResult = handlePrevious();
714         int result = lastResult;
715 
716         // iterate forward from the known break position until we pass our
717         // starting point.  The last break position before the starting
718         // point is our return value
719         while (result != BreakIterator.DONE && result < start) {
720             lastResult = result;
721             result = handleNext();
722         }
723 
724         // set the current iteration position to be the last break position
725         // before where we started, and then return that value
726         text.setIndex(lastResult);
727         return lastResult;
728     }
729 
730     /**
731      * Throw IllegalArgumentException unless begin <= offset < end.
732      * @stable ICU 2.0
733      */
734     protected static final void checkOffset(int offset, CharacterIterator   text) {
735         if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
736             throw new IllegalArgumentException  ("offset out of bounds");
737         }
738     }
739 
740     /**
741      * Sets the iterator to refer to the first boundary position following
742      * the specified position.
743      * @param offset The position from which to begin searching for a break position.
744      * @return The position of the first break after the current position.
745      * @stable ICU 2.0
746      */
747     public int following(int offset) {
748         // if the offset passed in is already past the end of the text,
749         // just return DONE
750         CharacterIterator   text = getText();
751         if (offset == text.getEndIndex()) {
752             return BreakIterator.DONE;
753         }
754         checkOffset(offset, text);
755 
756         // otherwise, set our internal iteration position (temporarily)
757         // to the position passed in.  If this is the _beginning_ position,
758         // then we can just use next() to get our return value
759         text.setIndex(offset);
760         if (offset == text.getBeginIndex()) {
761             return handleNext();
762         }
763 
764         // otherwise, we have to sync up first.  Use handlePrevious() to back
765         // us up to a known break position before the specified position (if
766         // we can determine that the specified position is a break position,
767         // we don't back up at all).  This may or may not be the last break
768         // position at or before our starting position.  Advance forward
769         // from here until we've passed the starting position.  The position
770         // we stop on will be the first break position after the specified one.
771         int result = handlePrevious();
772         while (result != BreakIterator.DONE && result <= offset) {
773             result = handleNext();
774         }
775         return result;
776     }
777 
778     /**
779      * Sets the iterator to refer to the last boundary position before the
780      * specified position.
781      * @param offset The position to begin searching for a break from.
782      * @return The position of the last boundary before the starting position.
783      * @stable ICU 2.0
784      */
785     public int preceding(int offset) {
786         // if we start by updating the current iteration position to the
787         // position specified by the caller, we can just use previous()
788         // to carry out this operation
789         CharacterIterator   text = getText();
790         checkOffset(offset, text);
791         text.setIndex(offset);
792         return previous();
793     }
794 
795     /**
796      * Returns true if the specfied position is a boundary position.  As a side
797      * effect, leaves the iterator pointing to the first boundary position at
798      * or after "offset".
799      * @param offset the offset to check.
800      * @return True if "offset" is a boundary position.
801      * @stable ICU 2.0
802      */
803     public boolean isBoundary(int offset) {
804         CharacterIterator   text = getText();
805         checkOffset(offset, text);
806         if (offset == text.getBeginIndex()) {
807             return true;
808         }
809 
810         // to check whether this is a boundary, we can use following() on the
811         // position before the specified one and return true if the position we
812         // get back is the one the user specified
813         else {
814             return following(offset - 1) == offset;
815         }
816     }
817 
818     /**
819      * Returns the current iteration position.
820      * @return The current iteration position.
821      * @stable ICU 2.0
822      */
823     public int current() {
824         return getText().getIndex();
825     }
826 
827     
828     /**
829      * Return the status tag from the break rule that determined the most recently
830      * returned break position.  The values appear in the rule source
831      * within brackets, {123}, for example.  For rules that do not specify a
832      * status, a default value of 0 is returned.  If more than one rule applies,
833      * the numerically largest of the possible status values is returned.
834      * <p>
835      * Note that for old style break iterators (implemented by this class), no
836      * status can be declared, and a status of zero is always assumed.
837      * <p>
838      *
839      * @draft ICU 3.0
840      * @provisional This API might change or be removed in a future release.
841      */
842     public int  getRuleStatus() {
843         return 0;
844     }
845 
846 
847 
848     /**
849      * Get the status (tag) values from the break rule(s) that determined the most 
850      * recently returned break position.  The values appear in the rule source
851      * within brackets, {123}, for example.  The default status value for rules
852      * that do not explicitly provide one is zero.
853      * <p>
854      * Note that for old style break iterators (implemented by this class), no
855      * status can be declared, and a status of zero is always assumed.
856      * <p>
857      * @draft ICU 3.0
858      * @provisional This API might change or be removed in a future release.
859      */
860      public int getRuleStatusVec(int[] fillInArray) {
861          if (fillInArray != null && fillInArray.length >= 1) {
862              fillInArray[0] = 0;
863          }
864          return 1;
865      }
866 
867 
868 
869     /**
870      * Return a CharacterIterator over the text being analyzed.  This version
871      * of this method returns the actual CharacterIterator we're using internally.
872      * Changing the state of this iterator can have undefined consequences.  If
873      * you need to change it, clone it first.
874      * @return An iterator over the text being analyzed.
875      * @stable ICU 2.0
876      */
877     public CharacterIterator   getText() {
878         // The iterator is initialized pointing to no text at all, so if this
879         // function is called while we're in that state, we have to fudge an
880         // an iterator to return.
881         if (text == null) {
882             text = new StringCharacterIterator("");
883         }
884         return text;
885     }
886 
887     /**
888      * Set the iterator to analyze a new piece of text.  This function resets
889      * the current iteration position to the beginning of the text.
890      * @param newText An iterator over the text to analyze.
891      * @stable ICU 2.0
892      */
893     public void setText(CharacterIterator   newText) {
894         // Test text to see if we need to wrap it in a SafeCharIterator:
895         int end = newText.getEndIndex();
896         newText.setIndex(end);
897         if (newText.getIndex() != end) {
898             // failed - wrap in correct implementation
899             text = new SafeCharIterator(newText);
900         }
901         else {
902             text = newText;
903         }
904         text.first();
905     }
906 
907 
908     //=======================================================================
909     // implementation
910     //=======================================================================
911 
912     /**
913      * This method is the actual implementation of the next() method.  All iteration
914      * vectors through here.  This method initializes the state machine to state 1
915      * and advances through the text character by character until we reach the end
916      * of the text or the state machine transitions to state 0.  We update our return
917      * value every time the state machine passes through a possible end state.
918      * @stable ICU 2.0
919      */
920     protected int handleNext() {
921         // if we're already at the end of the text, return DONE.
922         CharacterIterator   text = getText();
923         if (text.getIndex() == text.getEndIndex()) {
924             return BreakIterator.DONE;
925         }
926 
927         // no matter what, we always advance at least one character forward
928         int result = text.getIndex() + 1;
929         int lookaheadResult = 0;
930 
931         // begin in state 1
932         int state = START_STATE;
933         int category;
934         char c = text.current();
935         char lastC = c;
936         int lastCPos = 0;
937 
938         // if the first character in this segment is an ignore character (which can happen
939         // when it's either the first character in the file or follows a mandatory break
940         // character), and the first non-ignore character isn't a glue character, always
941         // put a break between the ignore characters and the rest of the text
942         if (lookupCategory(c) == IGNORE) {
943             while (lookupCategory(c) == IGNORE)
944                 c = text.next();
945 
946             if (Character.getType(c) == Character.NON_SPACING_MARK || Character.getType(c)
947                     == Character.ENCLOSING_MARK) {
948                 return text.getIndex();
949             }
950         }
951 
952         // loop until we reach the end of the text or transition to state 0
953         while (c != CharacterIterator.DONE && state != STOP_STATE) {
954 
955             // look up the current character's character category (which tells us
956             // which column in the state table to look at)
957             category = lookupCategory(c);
958 
959             // if the character isn't an ignore character, look up a state
960             // transition in the state table
961             if (category != IGNORE) {
962                 state = lookupState(state, category);
963             }
964 
965             // if the state we've just transitioned to is a lookahead state,
966             // (but not also an end state), save its position.  If it's
967             // both a lookahead state and an end state, update the break position
968             // to the last saved lookup-state position
969             if (lookaheadStates[state]) {
970                 if (endStates[state]) {
971                     if (lookaheadResult > 0) {
972                         result = lookaheadResult;
973                     }
974                     else {
975                         result = text.getIndex() + 1;
976                     }
977                 }
978                 else {
979                     lookaheadResult = text.getIndex() + 1;
980                 }
981             }
982 
983             // otherwise, if the state we've just transitioned to is an accepting
984             // state, update the break position to be the current iteration position
985             else {
986                 if (endStates[state]) {
987                     result = text.getIndex() + 1;
988                 }
989             }
990 
991             // keep track of the last "real" character we saw.  If this character isn't an
992             // ignore character, take note of it and its position in the text
993             if (category != IGNORE && state != STOP_STATE) {
994                 lastC = c;
995                 lastCPos = text.getIndex();
996             }
997             c = text.next();
998         }
999 
1000        // if we've run off the end of the text, and the very last character took us into
1001        // a lookahead state, advance the break position to the lookahead position
1002        // (the theory here is that if there are no characters at all after the lookahead
1003        // position, that always matches the lookahead criteria)
1004        if (c == CharacterIterator.DONE && lookaheadResult == text.getEndIndex()) {
1005            result = lookaheadResult;
1006        }
1007
1008        // if the last character we saw before the one that took us into the stop state
1009        // was a mandatory breaking character, then the break position goes right after it
1010        // (this is here so that breaks come before, rather than after, a string of
1011        // ignore characters when they follow a mandatory break character)
1012        else if ("\n\r\f\u2028\u2029".indexOf(lastC) != -1) {
1013            result = lastCPos + 1;
1014        }
1015
1016        text.setIndex(result);
1017        return result;
1018    }
1019
1020    /**
1021     * This method backs the iterator back up to a "safe position" in the text.
1022     * This is a position that we know, without any context, must be a break position.
1023     * The various calling methods then iterate forward from this safe position to
1024     * the appropriate position to return.  (For more information, see the description
1025     * of buildBackwardsStateTable() in RuleBasedBreakIterator_Old.Builder.)
1026     * @stable ICU 2.0
1027     */
1028    protected int handlePrevious() {
1029        CharacterIterator   text = getText();
1030        int state = START_STATE;
1031        int category = 0;
1032        int lastCategory = 0;
1033        char c = text.current();
1034
1035        // loop until we reach the beginning of the text or transition to state 0
1036        while (c != CharacterIterator.DONE && state != STOP_STATE) {
1037//System.out.print(" " + text.getIndex());
1038
1039            // save the last character's category and look up the current
1040            // character's category
1041            lastCategory = category;
1042            category = lookupCategory(c);
1043
1044            // if the current character isn't an ignore character, look up a
1045            // state transition in the backwards state table
1046            if (category != IGNORE) {
1047                state = lookupBackwardState(state, category);
1048            }
1049
1050            // then advance one character backwards
1051            c = text.previous();
1052        }
1053
1054        // if we didn't march off the beginning of the text, we're either one or two
1055        // positions away from the real break position.  (One because of the call to
1056        // previous() at the end of the loop above, and another because the character
1057        // that takes us into the stop state will always be the character BEFORE
1058        // the break position.)
1059        if (c != CharacterIterator.DONE) {
1060            if (lastCategory != IGNORE) {
1061                text.setIndex(text.getIndex() + 2);
1062            }
1063            else {
1064                text.next();
1065            }
1066        }
1067//System.out.print(",");
1068        return text.getIndex();
1069    }
1070
1071//static int visitedChars = 0;
1072    /**
1073     * Looks up a character's category (i.e., its category for breaking purposes,
1074     * not its Unicode category)
1075     * @internal
1076     */
1077    protected int lookupCategory(char c) {
1078//++visitedChars;
1079        return charCategoryTable.elementAt(c);
1080    }
1081
1082/*
1083static void printVisitedCharCount() {
1084System.out.println("Total number of characters visited = " + visitedChars);
1085visitedChars = 0;
1086}
1087*/
1088
1089    /**
1090     * Given a current state and a character category, looks up the
1091     * next state to transition to in the state table.
1092     * @internal
1093     */
1094    protected int lookupState(int state, int category) {
1095        return stateTable[state * numCategories + category];
1096    }
1097
1098    /**
1099     * Given a current state and a character category, looks up the
1100     * next state to transition to in the backwards state table.
1101     * @internal
1102     */
1103    protected int lookupBackwardState(int state, int category) {
1104        return backwardsStateTable[state * numCategories + category];
1105    }
1106
1107    /**
1108     * This is a helper function for computing the intersection of
1109     * two <code>UnicodeSet</code> objects.
1110     * @param a, b the two <code>UnicodeSet</code>s to intersect
1111     * @return a new <code>UnicodeSet</code> which is the intersection of a and b
1112     */
1113    private static UnicodeSet intersection(UnicodeSet a, UnicodeSet b)
1114    {
1115        UnicodeSet result = new UnicodeSet(a);
1116
1117        result.retainAll(b);
1118
1119        return result;
1120    }
1121
1122    //=======================================================================
1123    // RuleBasedBreakIterator.Builder
1124    //=======================================================================
1125    /**
1126     * The Builder class has the job of constructing a RuleBasedBreakIterator_Old from a
1127     * textual description.  A Builder is constructed by RuleBasedBreakIterator_Old's
1128     * constructor, which uses it to construct the iterator itself and then throws it
1129     * away.
1130     * <p>The construction logic is separated out into its own class for two primary
1131     * reasons:
1132     * <ul><li>The construction logic is quite sophisticated and large.  Separating it
1133     * out into its own class means the code must only be loaded into memory while a
1134     * RuleBasedBreakIterator_Old is being constructed, and can be purged after that.
1135     * <li>There is a fair amount of state that must be maintained throughout the
1136     * construction process that is not needed by the iterator after construction.
1137     * Separating this state out into another class prevents all of the functions that
1138     * construct the iterator from having to have really long parameter lists,
1139     * (hopefully) contributing to readability and maintainability.</ul>
1140     * <p>It'd be really nice if this could be an independent class rather than an
1141     * inner class, because that would shorten the source file considerably, but
1142     * making Builder an inner class of RuleBasedBreakIterator_Old allows it direct access
1143     * to RuleBasedBreakIterator_Old's private members, which saves us from having to
1144     * provide some kind of "back door" to the Builder class that could then also be
1145     * used by other classes.
1146     * @internal
1147     */
1148    protected class Builder {
1149        /**
1150         * A temporary holding place used for calculating the character categories.
1151         * This object contains UnicodeSet objects.
1152         * @internal
1153         */
1154        protected Vector   categories = null;
1155
1156        /**
1157         * A table used to map parts of regexp text to lists of character categories,
1158         * rather than having to figure them out from scratch each time
1159         * @internal
1160         */
1161        protected Hashtable   expressions = null;
1162
1163        /**
1164         * A temporary holding place for the list of ignore characters
1165         * @internal
1166         */
1167        protected UnicodeSet ignoreChars = null;
1168
1169        /**
1170         * A temporary holding place where the forward state table is built
1171         * @internal
1172         */
1173        protected Vector   tempStateTable = null;
1174
1175        /**
1176         * A list of all the states that have to be filled in with transitions to the
1177         * next state that is created.  Used when building the state table from the
1178         * regular expressions.
1179         * @internal
1180         */
1181        protected Vector   decisionPointList = null;
1182
1183        /**
1184         * A stack for holding decision point lists.  This is used to handle nested
1185         * parentheses and braces in regexps.
1186         * @internal
1187         */
1188        protected Stack   decisionPointStack = null;
1189
1190        /**
1191         * A list of states that loop back on themselves.  Used to handle .*?
1192         * @internal
1193         */
1194        protected Vector   loopingStates = null;
1195
1196        /**
1197         * Looping states actually have to be backfilled later in the process
1198         * than everything else.  This is where a the list of states to backfill
1199         * is accumulated.  This is also used to handle .*?
1200         * @internal
1201         */
1202        protected Vector   statesToBackfill = null;
1203
1204        /**
1205         * A list mapping pairs of state numbers for states that are to be combined
1206         * to the state number of the state representing their combination.  Used
1207         * in the process of making the state table deterministic to prevent
1208         * infinite recursion.
1209         * @internal
1210         */
1211        protected Vector   mergeList = null;
1212
1213        /**
1214         * A flag that is used to indicate when the list of looping states can
1215         * be reset.
1216         * @internal
1217         */
1218        protected boolean clearLoopingStates = false;
1219
1220        /**
1221         * A bit mask used to indicate a bit in the table's flags column that marks a
1222         * state as an accepting state.
1223         * @internal
1224         */
1225        protected static final int END_STATE_FLAG = 0x8000;
1226
1227        /**
1228         * A bit mask used to indicate a bit in the table's flags column that marks a
1229         * state as one the builder shouldn't loop to any looping states
1230         * @internal
1231         */
1232        protected static final int DONT_LOOP_FLAG = 0x4000;
1233
1234        /**
1235         * A bit mask used to indicate a bit in the table's flags column that marks a
1236         * state as a lookahead state.
1237         * @internal
1238         */
1239        protected static final int LOOKAHEAD_STATE_FLAG = 0x2000;
1240
1241        /**
1242         * A bit mask representing the union of the mask values listed above.
1243         * Used for clearing or masking off the flag bits.
1244         * @internal
1245         */
1246        protected static final int ALL_FLAGS = END_STATE_FLAG | LOOKAHEAD_STATE_FLAG
1247                | DONT_LOOP_FLAG;
1248
1249        /**
1250         * No special construction is required for the Builder.
1251         * @internal
1252         */
1253        public Builder() {
1254        }
1255
1256        /**
1257         * This is the main function for setting up the BreakIterator's tables.  It
1258         * just vectors different parts of the job off to other functions.
1259         * @internal
1260         */
1261        public void buildBreakIterator() {
1262            Vector   tempRuleList = buildRuleList(description);
1263            buildCharCategories(tempRuleList);
1264            buildStateTable(tempRuleList);
1265            buildBackwardsStateTable(tempRuleList);
1266        }
1267
1268        /**
1269         * Thus function has three main purposes:
1270         * <ul><li>Perform general syntax checking on the description, so the rest of the
1271         * build code can assume that it's parsing a legal description.
1272         * <li>Split the description into separate rules
1273         * <li>Perform variable-name substitutions (so that no one else sees variable names)
1274         * </ul>
1275         */
1276        private Vector   buildRuleList(String   description) {
1277            // invariants:
1278            // - parentheses must be balanced: ()[]{}<>
1279            // - nothing can be nested inside {}
1280            // - nothing can be nested inside [] except more []s
1281            // - pairs of ()[]{}<> must not be empty
1282            // - ; can only occur at the outer level
1283            // - | can only appear inside ()
1284            // - only one = or / can occur in a single rule
1285            // - = and / cannot both occur in the same rule
1286            // - the right-hand side of a = expression must be enclosed in [] or ()
1287            // - * may not occur at the beginning of a rule, nor may it follow
1288            //   =, /, (, (, |, }, ;, or *
1289            // - ? may only follow *
1290            // - the rule list must contain at least one / rule
1291            // - no rule may be empty
1292            // - all printing characters in the ASCII range except letters and digits
1293            //   are reserved and must be preceded by \
1294            // - ! may only occur at the beginning of a rule
1295
1296            // set up a vector to contain the broken-up description (each entry in the
1297            // vector is a separate rule) and a stack for keeping track of opening
1298            // punctuation
1299            Vector   tempRuleList = new Vector  ();
1300            Stack   parenStack = new Stack  ();
1301
1302            int p = 0;
1303            int ruleStart = 0;
1304            char c = '\u0000';
1305            char lastC = '\u0000';
1306            char lastOpen = '\u0000';
1307            boolean haveEquals = false;
1308            boolean havePipe = false;
1309            boolean sawVarName = false;
1310            boolean sawIllegalChar = false;
1311            int illegalCharPos = 0;
1312            final String   charsThatCantPrecedeAsterisk = "=/<(|>*+?;\u0000";
1313
1314            // if the description doesn't end with a semicolon, tack a semicolon onto the end
1315            if (description.length() != 0 && description.charAt(description.length() - 1) != ';') {
1316                description = description + ";";
1317            }
1318
1319            // for each character, do...
1320            while (p < description.length()) {
1321                c = description.charAt(p);
1322                switch (c) {
1323                    // if the character is opening punctuation, verify that no nesting
1324                    // rules are broken, and push the character onto the stack
1325                    case '{':
1326                    case '[':
1327                    case '(':
1328                        if (lastOpen == '{') {
1329                            error("Can't nest brackets inside {}", p, description);
1330                        }
1331                        if (lastOpen == '[' && c != '[') {
1332                            error("Can't nest anything in [] but []", p, description);
1333                        }
1334
1335                        // if we see { anywhere except on the left-hand side of =,
1336                        // we must be seeing a variable name that was never defined
1337                        if (c == '{' && (haveEquals || havePipe)) {
1338                            error("Unknown variable name", p, description);
1339                        }
1340
1341                        lastOpen = c;
1342                        parenStack.push(new Character  (c));
1343                        if (c == '{') {
1344                            sawVarName = true;
1345                        }
1346                        break;
1347
1348                    // if the character is closing punctuation, verify that it matches the
1349                    // last opening punctuation we saw, and that the brackets contain
1350                    // something, then pop the stack
1351                    case '}':
1352                    case ']':
1353                    case ')':
1354                        char expectedClose = '\u0000';
1355                        switch (lastOpen) {
1356                            case '{':
1357                                expectedClose = '}';
1358                                break;
1359                            case '[':
1360                                expectedClose = ']';
1361                                break;
1362                            case '(':
1363                                expectedClose = ')';
1364                                break;
1365                        }
1366                        if (c != expectedClose) {
1367                            error("Unbalanced parentheses", p, description);
1368                        }
1369                        if (lastC == lastOpen) {
1370                            error("Parens don't contain anything", p, description);
1371                        }
1372                        parenStack.pop();
1373                        if (!parenStack.empty()) {
1374                            lastOpen = ((Character  )(parenStack.peek())).charValue();
1375                        }
1376                        else {
1377                            lastOpen = '\u0000';
1378                        }
1379
1380                        break;
1381
1382                    // if the character is an asterisk, make sure it occurs in a place
1383                    // where an asterisk can legally go
1384                    case '*': case '+': case '?':
1385                        if (charsThatCantPrecedeAsterisk.indexOf(lastC) != -1
1386                                && (c != '?' || lastC != '*')) {
1387                            error("Misplaced *, +, or ?", p, description);
1388                        }
1389                        break;
1390
1391                    // if the character is an equals sign, make sure we haven't seen another
1392                    // equals sign or a slash yet
1393                    case '=':
1394                        if (haveEquals || havePipe) {
1395                            error("More than one = or / in rule", p, description);
1396                        }
1397                        haveEquals = true;
1398                        sawIllegalChar = false;
1399                        break;
1400
1401                    // if the character is a slash, make sure we haven't seen another slash
1402                    // or an equals sign yet
1403                    case '/':
1404                        if (haveEquals || havePipe) {
1405                            error("More than one = or / in rule", p, description);
1406                        }
1407                        if (sawVarName) {
1408                            error("Unknown variable name", p, description);
1409                        }
1410                        havePipe = true;
1411                        break;
1412
1413                    // if the character is an exclamation point, make sure it occurs only
1414                    // at the beginning of a rule
1415                    case '!':
1416                        if (lastC != ';' && lastC != '\u0000') {
1417                            error("! can only occur at the beginning of a rule", p, description);
1418                        }
1419                        break;
1420
1421                    // if the character is a backslash, skip the character that follows it
1422                    // (it'll get treated as a literal character)
1423                    case '\\':
1424                        ++p;
1425                        break;
1426
1427                    // we don't have to do anything special on a period
1428                    case '.':
1429                        break;
1430
1431                    // if the character is a syntax character that can only occur
1432                    // inside [], make sure that it does in fact only occur inside []
1433                    // (or in a variable name)
1434                    case '^':
1435                    case '-':
1436                    case ':':
1437                    case '&':
1438                        if (lastOpen != '[' && lastOpen != '{' && !sawIllegalChar) {
1439                            sawIllegalChar = true;
1440                            illegalCharPos = p;
1441                        }
1442                        break;
1443
1444                    // if the character is a semicolon, do the following...
1445                    case ';':
1446                        // if we saw any illegal characters along the way, throw
1447                        // an error
1448                        if (sawIllegalChar) {
1449                            error("Illegal character", illegalCharPos, description);
1450                        }
1451
1452                        // make sure the rule contains something and that there are no
1453                        // unbalanced parentheses or brackets
1454                        if (lastC == ';' || lastC == '\u0000') {
1455                            error("Empty rule", p, description);
1456                        }
1457                        if (!parenStack.empty()) {
1458                            error("Unbalanced parenheses", p, description);
1459                        }
1460
1461                        if (parenStack.empty()) {
1462                            // if the rule contained an = sign, call processSubstitution()
1463                            // to replace the substitution name with the substitution text
1464                            // wherever it appears in the description
1465                            if (haveEquals) {
1466                                description = processSubstitution(description.substring(ruleStart,
1467                                                p), description, p + 1);
1468                            }
1469                            else {
1470                                // otherwise, check to make sure the rule doesn't reference
1471                                // any undefined substitutions
1472                                if (sawVarName) {
1473                                    error("Unknown variable name", p, description);
1474                                }
1475
1476                                // then add it to tempRuleList
1477                                tempRuleList.addElement(description.substring(ruleStart, p));
1478                            }
1479
1480                            // and reset everything to process the next rule
1481                            ruleStart = p + 1;
1482                            haveEquals = havePipe = sawVarName = sawIllegalChar = false;
1483                        }
1484                        break;
1485
1486                    // if the character is a vertical bar, check to make sure that it
1487                    // occurs inside a () expression and that the character that precedes
1488                    // it isn't also a vertical bar
1489                    case '|':
1490                        if (lastC == '|') {
1491                            error("Empty alternative", p, description);
1492                        }
1493                        if (parenStack.empty() || lastOpen != '(') {
1494                            error("Misplaced |", p, description);
1495                        }
1496                        break;
1497
1498                    // if the character is anything else (escaped characters are
1499                    // skipped and don't make it here), it's an error
1500                    default:
1501                        if (c >= ' ' && c < '\u007f' && !Character.isLetter(c)
1502                                && !Character.isDigit(c) && !sawIllegalChar) {
1503                            sawIllegalChar = true;
1504                            illegalCharPos = p;
1505                        }
1506                        break;
1507                }
1508                lastC = c;
1509                ++p;
1510            }
1511            if (tempRuleList.size() == 0) {
1512                error("No valid rules in description", p, description);
1513            }
1514            return tempRuleList;
1515        }
1516
1517        /**
1518         * This function performs variable-name substitutions.  First it does syntax
1519         * checking on the variable-name definition.  If it's syntactically valid, it
1520         * then goes through the remainder of the description and does a simple
1521         * find-and-replace of the variable name with its text.  (The variable text
1522         * must be enclosed in either [] or () for this to work.)
1523         * @internal
1524         */
1525        protected String   processSubstitution(String   substitutionRule, String   description,
1526                        int startPos) {
1527            // isolate out the text on either side of the equals sign
1528            String   replace;
1529            String   replaceWith;
1530            int equalPos = substitutionRule.indexOf('=');
1531            if (substitutionRule.charAt(0) != '$') {
1532                error("Missing '$' on left-hand side of =", startPos, description);
1533            }
1534            replace = substitutionRule.substring(1, equalPos);
1535            replaceWith = substitutionRule.substring(equalPos + 1);
1536
1537            // check to see whether the substitution name is something we've declared
1538            // to be "special".  For RuleBasedBreakIterator itself, this is IGNORE_VAR.
1539            // This function takes care of any extra processing that has to be done
1540            // with "special" substitution names.
1541            handleSpecialSubstitution(replace, replaceWith, startPos, description);
1542
1543            // perform various other syntax checks on the rule
1544            if (replaceWith.length() == 0) {
1545                error("Nothing on right-hand side of =", startPos, description);
1546            }
1547            if (replace.length() == 0) {
1548                error("Nothing on left-hand side of =", startPos, description);
1549            }
1550            if (!(replaceWith.charAt(0) == '[' && replaceWith.charAt(replaceWith.length() - 1)
1551                    == ']') && !(replaceWith.charAt(0) == '(' && replaceWith.charAt(
1552                    replaceWith.length() - 1) == ')')) {
1553                error("Illegal right-hand side for =", startPos, description);
1554            }
1555
1556            // now go through the rest of the description (which hasn't been broken up
1557            // into separate rules yet) and replace every occurrence of the
1558            // substitution name with the substitution body
1559            replace = "$" + replace;
1560            StringBuffer   result = new StringBuffer  ();
1561            result.append(description.substring(0, startPos));
1562            int lastPos = startPos;
1563            int pos = description.indexOf(replace, startPos);
1564            while (pos != -1) {
1565                // [liu] Check that the string we've found isn't a redefinition
1566                // of the variable.
1567                if (description.charAt(pos-1) == ';' &&
1568                    description.charAt(pos + replace.length()) == '=') {
1569                    error("Attempt to redefine " + replace, pos, description);
1570                }
1571                result.append(description.substring(lastPos, pos));
1572                result.append(replaceWith);
1573                lastPos = pos + replace.length();
1574                pos = description.indexOf(replace, lastPos);
1575            }
1576            result.append(description.substring(lastPos));
1577            return result.toString();
1578        }
1579
1580        /**
1581         * This function defines a protocol for handling substitution names that
1582         * are "special," i.e., that have some property beyond just being
1583         * substitutions.  At the RuleBasedBreakIterator_Old level, we have one
1584         * special substitution name, IGNORE_VAR.  Subclasses can override this
1585         * function to add more.  Any special processing that has to go on beyond
1586         * that which is done by the normal substitution-processing code is done
1587         * here.
1588         * @internal
1589         */
1590        protected void handleSpecialSubstitution(String   replace, String   replaceWith,
1591                    int startPos, String   description) {
1592            // if we get a definition for a substitution called IGNORE_VAR, it defines
1593            // the ignore characters for the iterator.  Check to make sure the expression
1594            // is a [] expression, and if it is, parse it and store the characters off
1595            // to the side.
1596            if (replace.equals(IGNORE_VAR)) {
1597                if (replaceWith.charAt(0) == '(') {
1598                    error("Ignore group can't be enclosed in (", startPos, description);
1599                }
1600                ignoreChars = new UnicodeSet(replaceWith, false);
1601            }
1602        }
1603
1604        /**
1605         * This function builds the character category table.  On entry,
1606         * tempRuleList is a vector of break rules that has had variable names substituted.
1607         * On exit, the charCategoryTable data member has been initialized to hold the
1608         * character category table, and tempRuleList's rules have been munged to contain
1609         * character category numbers everywhere a literal character or a [] expression
1610         * originally occurred.
1611         * @internal
1612         */
1613        protected void buildCharCategories(Vector   tempRuleList) {
1614            int bracketLevel = 0;
1615            int p = 0;
1616            int lineNum = 0;
1617
1618            // build hash table of every literal character or [] expression in the rule list
1619            // and derive a UnicodeSet object representing the characters each refers to
1620            expressions = new Hashtable  ();
1621            while (lineNum < tempRuleList.size()) {
1622                String   line = (String  )(tempRuleList.elementAt(lineNum));
1623                p = 0;
1624                while (p < line.length()) {
1625                    char c = line.charAt(p);
1626                    switch (c) {
1627                        // skip over all syntax characters except [
1628                        case '(': case ')': case '*': case '.': case '/':
1629                        case '|': case ';': case '?': case '!': case '+':
1630                            break;
1631
1632                        // for [, find the matching ] (taking nested [] pairs into account)
1633                        // and add the whole expression to the expression list
1634                        case '[':
1635                            int q = p + 1;
1636                            ++bracketLevel;
1637                            while (q < line.length() && bracketLevel != 0) {
1638                                c = line.charAt(q);
1639                                if (c == '[') {
1640                                    ++bracketLevel;
1641                                }
1642                                else if (c == ']') {
1643                                    --bracketLevel;
1644                                }
1645                                ++q;
1646                            }
1647                            if (expressions.get(line.substring(p, q)) == null) {
1648                                expressions.put(line.substring(p, q), new UnicodeSet(line.
1649                                                substring(p, q), false));
1650//Test.debugPrintln("1. Adding expression: " + line.substring(p, q));
1651                            }
1652                            p = q - 1;
1653                            break;
1654
1655                        // for \ sequences, just move to the next character and treat
1656                        // it as a single character
1657                        case '\\':
1658                            ++p;
1659                            c = line.charAt(p);
1660                            // DON'T break; fall through into "default" clause
1661
1662                        // for an isolated single character, add it to the expression list
1663                        default:
1664                            UnicodeSet s = new UnicodeSet();
1665                            s.add(line.charAt(p));
1666                            expressions.put(line.substring(p, p + 1), s);
1667//Test.debugPrintln("2. Adding expression: " + line.substring(p, p + 1));
1668                            break;
1669                    }
1670                    ++p;
1671                }
1672                ++lineNum;
1673            }
1674
1675            // create the temporary category table (which is a vector of UnicodeSet objects)
1676            categories = new Vector  ();
1677            if (ignoreChars != null) {
1678                categories.addElement(ignoreChars);
1679            }
1680            else {
1681                categories.addElement(new UnicodeSet());
1682            }
1683            ignoreChars = null;
1684
1685            // this is a hook to allow subclasses to add categories on their own
1686            mungeExpressionList(expressions);
1687
1688            // Derive the character categories.  Go through the existing character categories
1689            // looking for overlap.  Any time there's overlap, we create a new character
1690            // category for the characters that overlapped and remove them from their original
1691            // category.  At the end, any characters that are left in the expression haven't
1692            // been mentioned in any category, so another new category is created for them.
1693            // For example, if the first expression is [abc], then a, b, and c will be placed
1694            // into a single character category.  If the next expression is [bcd], we will first
1695            // remove b and c from their existing category (leaving a behind), create a new
1696            // category for b and c, and then create another new category for d (which hadn't
1697            // been mentioned in the previous expression).
1698            // At no time should a character ever occur in more than one character category.
1699
1700            // for each expression in the expressions list, do...
1701            Enumeration   iter = expressions.elements();
1702            while (iter.hasMoreElements()) {
1703                // initialize the working char set to the chars in the current expression
1704                UnicodeSet work = new UnicodeSet((UnicodeSet)iter.nextElement());
1705
1706                // for each category in the category list, do...
1707                for (int j = categories.size() - 1; !work.isEmpty() && j > 0; j--) {
1708
1709                    // if there's overlap between the current working set of chars
1710                    // and the current category...
1711                    UnicodeSet cat = (UnicodeSet)(categories.elementAt(j));
1712                    UnicodeSet overlap = intersection(work, cat);
1713
1714                    if (!overlap.isEmpty()) {
1715                        // if the current category is not a subset of the current
1716                        // working set of characters, then remove the overlapping
1717                        // characters from the current category and create a new
1718                        // category for them
1719                        if (!overlap.equals(cat)) {
1720                            cat.removeAll(overlap);
1721                            categories.addElement(overlap);
1722                        }
1723
1724                        // and always remove the overlapping characters from the current
1725                        // working set of characters
1726                        work.removeAll(overlap);
1727                    }
1728                }
1729
1730                // if there are still characters left in the working char set,
1731                // add a new category containing them
1732                if (!work.isEmpty()) {
1733                    categories.addElement(work);
1734                }
1735            }
1736
1737            // we have the ignore characters stored in position 0.  Make an extra pass through
1738            // the character category list and remove anything from the ignore list that shows
1739            // up in some other category
1740            UnicodeSet allChars = new UnicodeSet();
1741            for (int i = 1; i < categories.size(); i++)
1742                allChars.addAll((UnicodeSet)(categories.elementAt(i)));
1743            UnicodeSet ignoreChars = (UnicodeSet)(categories.elementAt(0));
1744            ignoreChars.removeAll(allChars);
1745
1746            // now that we've derived the character categories, go back through the expression
1747            // list and replace each UnicodeSet object with a String that represents the
1748            // character categories that expression refers to.  The String is encoded: each
1749            // character is a character category number (plus 0x100 to avoid confusing them
1750            // with syntax characters in the rule grammar)
1751            iter = expressions.keys();
1752            while (iter.hasMoreElements()) {
1753                String   key = (String  )iter.nextElement();
1754                UnicodeSet cs = (UnicodeSet)expressions.get(key);
1755                StringBuffer   cats = new StringBuffer  ();
1756
1757                // for each category...
1758                for (int j = 1; j < categories.size(); j++) {
1759                    UnicodeSet cat = new UnicodeSet((UnicodeSet) categories.elementAt(j));
1760
1761                    // if the current expression contains characters in that category...
1762                    if (cs.containsAll(cat)) {
1763
1764                        // then add the encoded category number to the String for this
1765                        // expression
1766                        cats.append((char)(0x100 + j));
1767                        if (cs.equals(cat)) {
1768                            break;
1769                        }
1770                    }
1771                }
1772
1773                // once we've finished building the encoded String for this expression,
1774                // replace the UnicodeSet object with it
1775                expressions.put(key, cats.toString());
1776            }
1777
1778            // and finally, we turn the temporary category table into a permanent category
1779            // table, which is a CompactByteArray. (we skip category 0, which by definition
1780            // refers to all characters not mentioned specifically in the rules)
1781            charCategoryTable = new CompactByteArray((byte)0);
1782
1783            // for each category...
1784            for (int i = 0; i < categories.size(); i++) {
1785                UnicodeSet chars = (UnicodeSet)(categories.elementAt(i));
1786                int n = chars.getRangeCount();
1787
1788                // go through the character ranges in the category one by one...
1789                for (int j = 0; j < n; ++j) {
1790                    int rangeStart = chars.getRangeStart(j);
1791
1792                    // (ignore anything above the BMP for now...)
1793                    if (rangeStart >= 0x10000) {
1794                        break;
1795                    }
1796
1797                    // and set the corresponding elements in the CompactArray accordingly
1798                    if (i != 0) {
1799                        charCategoryTable.setElementAt((char)rangeStart,
1800                            (char)chars.getRangeEnd(j), (byte)i);
1801                    }
1802
1803                    // (category 0 is special-- it's the hiding place for the ignore
1804                    // characters, whose real category number in the CompactArray is
1805                    // -1 [this is because category 0 contains all characters not
1806                    // specifically mentioned anywhere in the rules] )
1807                    else {
1808                        charCategoryTable.setElementAt((char)rangeStart,
1809                            (char)chars.getRangeEnd(j), IGNORE);
1810                    }
1811                }
1812            }
1813
1814            // once we've populated the CompactArray, compact it
1815            charCategoryTable.compact();
1816
1817            // initialize numCategories
1818            numCategories = categories.size();
1819        }
1820
1821        /**   @internal */
1822        protected void mungeExpressionList(Hashtable   expressions) {
1823            // empty in the parent class.  This function provides a hook for subclasses
1824            // to mess with the character category table.
1825        }
1826
1827        /**
1828         * This is the function that builds the forward state table.  Most of the real
1829         * work is done in parseRule(), which is called once for each rule in the
1830         * description.
1831         */
1832        private void buildStateTable(Vector   tempRuleList) {
1833            // initialize our temporary state table, and fill it with two states:
1834            // state 0 is a dummy state that allows state 1 to be the starting state
1835            // and 0 to represent "stop".  State 1 is added here to seed things
1836            // before we start parsing
1837            tempStateTable = new Vector  ();
1838            tempStateTable.addElement(new short[numCategories + 1]);
1839            tempStateTable.addElement(new short[numCategories + 1]);
1840
1841            // call parseRule() for every rule in the rule list (except those which
1842            // start with !, which are actually backwards-iteration rules)
1843            // variable not used int n = tempRuleList.size();
1844            for (int i = 0; i < tempRuleList.size(); i++) {
1845                String   rule = (String  )tempRuleList.elementAt(i);
1846                if (rule.charAt(0) != '!') {
1847                    parseRule(rule, true);
1848                }
1849            }
1850
1851            // finally, use finishBuildingStateTable() to minimize the number of
1852            // states in the table and perform some other cleanup work
1853            finishBuildingStateTable(true);
1854/*
1855System.out.print("C:\t");
1856for (int i = 0; i < numCategories; i++)
1857System.out.print(Integer.toString(i) + "\t");
1858System.out.println(); System.out.print("=================================================");
1859for (int i = 0; i < stateTable.length; i++) {
1860if (i % numCategories == 0) {
1861System.out.println();
1862if (endStates[i / numCategories])
1863System.out.print("*");
1864else
1865System.out.print(" ");
1866if (lookaheadStates[i / numCategories]) {
1867System.out.print("%");
1868}
1869else
1870System.out.print(" ");
1871System.out.print(Integer.toString(i / numCategories) + ":\t");
1872}
1873if (stateTable[i] == 0) System.out.print(".\t"); else System.out.print(Integer.toString(stateTable[i]) + "\t");
1874}
1875System.out.println();
1876*/
1877    }
1878
1879        /**
1880         * This is where most of the work really happens.  This routine parses a single
1881         * rule in the rule description, adding and modifying states in the state
1882         * table according to the new expression.  The state table is kept deterministic
1883         * throughout the whole operation, although some ugly postprocessing is needed
1884         * to handle the *? token.
1885         */
1886        private void parseRule(String   rule, boolean forward) {
1887            // algorithm notes:
1888            //   - The basic idea here is to read successive character-category groups
1889            //   from the input string.  For each group, you create a state and point
1890            //   the appropriate entries in the previous state to it.  This produces a
1891            //   straight line from the start state to the end state.  The ?, +, *, and (|)
1892            //   idioms produce branches in this straight line.  These branches (states
1893            //   that can transition to more than one other state) are called "decision
1894            //   points."  A list of decision points is kept.  This contains a list of
1895            //   all states that can transition to the next state to be created.  For a
1896            //   straight line progression, the only thing in the decision-point list is
1897            //   the current state.  But if there's a branch, the decision-point list
1898            //   will contain all of the beginning points of the branch when the next
1899            //   state to be created represents the end point of the branch.  A stack is
1900            //   used to save decision point lists in the presence of nested parentheses
1901            //   and the like.  For example, when a { is encountered, the current decision
1902            //   point list is saved on the stack and restored when the corresponding }
1903            //   is encountered.  This way, after the } is read, the decision point list
1904            //   will contain both the state right before the } _and_ the state before
1905            //   the whole {} expression.  Both of these states can transition to the next
1906            //   state after the {} expression.
1907            //   - one complication arises when we have to stamp a transition value into
1908            //   an array cell that already contains one.  The updateStateTable() and
1909            //   mergeStates() functions handle this case.  Their basic approach is to
1910            //   create a new state that combines the two states that conflict and point
1911            //   at it when necessary.  This happens recursively, so if the merged states
1912            //   also conflict, they're resolved in the same way, and so on.  There are
1913            //   a number of tests aimed at preventing infinite recursion.
1914            //   - another complication arises with repeating characters.  It's somewhat
1915            //   ambiguous whether the user wants a greedy or non-greedy match in these cases.
1916            //   (e.g., whether "[a-z]*abc" means the SHORTEST sequence of letters ending in
1917            //   "abc" or the LONGEST sequence of letters ending in "abc".  We've adopted
1918            //   the *? to mean "shortest" and * by itself to mean "longest".  (You get the
1919            //   same result with both if there's no overlap between the repeating character
1920            //   group and the group immediately following it.)  Handling the *? token is
1921            //   rather complicated and involves keeping track of whether a state needs to
1922            //   be merged (as described above) or merely overwritten when you update one of
1923            //   its cells, and copying the contents of a state that loops with a *? token
1924            //   into some of the states that follow it after the rest of the table-building
1925            //   process is complete ("backfilling").
1926            // implementation notes:
1927            //   - This function assumes syntax checking has been performed on the input string
1928            //   prior to its being passed in here.  It assumes that parentheses are
1929            //   balanced, all literal characters are enclosed in [] and turned into category
1930            //   numbers, that there are no illegal characters or character sequences, and so
1931            //   on.  Violation of these invariants will lead to undefined behavior.
1932            //   - It'd probably be better to use linked lists rather than Vector and Stack
1933            //   to maintain the decision point list and stack.  I went for simplicity in
1934            //   this initial implementation.  If performance is critical enough, we can go
1935            //   back and fix this later.
1936            //   -There are a number of important limitations on the *? token.  It does not work
1937            //   right when followed by a repeating character sequence (e.g., ".*?(abc)*")
1938            //   (although it does work right when followed by a single repeating character).
1939            //   It will not always work right when nested in parentheses or braces (although
1940            //   sometimes it will).  It also will not work right if the group of repeating
1941            //   characters and the group of characters that follows overlap partially
1942            //   (e.g., "[a-g]*?[e-j]").  None of these capabilites was deemed necessary for
1943            //   describing breaking rules we know about, so we left them out for
1944            //   expeditiousness.
1945            //   - Rules such as "[a-z]*?abc;" will be treated the same as "[a-z]*?aa*bc;"--
1946            //   that is, if the string ends in "aaaabc", the break will go before the first
1947            //   "a" rather than the last one.  Both of these are limitations in the design
1948            //   of RuleBasedBreakIterator and not limitations of the rule parser.
1949
1950            int p = 0;
1951            int currentState = 1;   // don't use state number 0; 0 means "stop"
1952            int lastState = currentState;
1953            String   pendingChars = "";
1954
1955            decisionPointStack = new Stack  ();
1956            decisionPointList = new Vector  ();
1957            loopingStates = new Vector  ();
1958            statesToBackfill = new Vector  ();
1959
1960            short[] state;
1961            boolean sawEarlyBreak = false;
1962
1963            // if we're adding rules to the backward state table, mark the initial state
1964            // as a looping state
1965            if (!forward) {
1966                loopingStates.addElement(new Integer  (1));
1967            }
1968
1969            // put the current state on the decision point list before we start
1970            decisionPointList.addElement(new Integer  (currentState)); // we want currentState to
1971                                                                     // be 1 here...
1972            currentState = tempStateTable.size() - 1;   // but after that, we want it to be
1973                                                        // 1 less than the state number of the next state
1974            while (p < rule.length()) {
1975                char c = rule.charAt(p);
1976                clearLoopingStates = false;
1977
1978                // this section handles literal characters, escaped characters (which are
1979                // effectively literal characters too), the . token, and [] expressions
1980                if (c == '['
1981                    || c == '\\'
1982                    || Character.isLetter(c)
1983                    || Character.isDigit(c)
1984                    || c < ' '
1985                    || c == '.'
1986                    || c >= '\u007f') {
1987
1988                    // if we're not on a period, isolate the expression and look up
1989                    // the corresponding category list
1990                    if (c != '.') {
1991                        int q = p;
1992
1993                        // if we're on a backslash, the expression is the character
1994                        // after the backslash
1995                        if (c == '\\') {
1996                            q = p + 2;
1997                            ++p;
1998                        }
1999
2000                        // if we're on an opening bracket, scan to the closing bracket
2001                        // to isolate the expression
2002                        else if (c == '[') {
2003                            int bracketLevel = 1;
2004                            while (bracketLevel > 0) {
2005                                ++q;
2006                                c = rule.charAt(q);
2007                                if (c == '[') {
2008                                    ++bracketLevel;
2009                                }
2010                                else if (c == ']') {
2011                                    --bracketLevel;
2012                                }
2013                                else if (c == '\\') {
2014                                    ++q;
2015                                }
2016                            }
2017                            ++q;
2018                        }
2019
2020                        // otherwise, the expression is just the character itself
2021                        else {
2022                            q = p + 1;
2023                        }
2024
2025                        // look up the category list for the expression and store it
2026                        // in pendingChars
2027                        pendingChars = (String  )expressions.get(rule.substring(p, q));
2028
2029                        // advance the current position past the expression
2030                        p = q - 1;
2031                    }
2032
2033                    // if the character we're on is a period, we end up down here
2034                    else {
2035                        int rowNum = ((Integer  )decisionPointList.lastElement()).intValue();
2036                        state = (short[])tempStateTable.elementAt(rowNum);
2037
2038                        // if the period is followed by an asterisk, then just set the current
2039                        // state to loop back on itself
2040                        if (p + 1 < rule.length() && rule.charAt(p + 1) == '*' && state[0] != 0) {
2041                            decisionPointList.addElement(new Integer  (state[0]));
2042                            pendingChars = "";
2043                            ++p;
2044                            if (p + 1 < rule.length() && rule.charAt(p + 1) == '?') {
2045//System.out.println("Saw *?");
2046                                setLoopingStates(decisionPointList, decisionPointList);
2047                                ++p;
2048                            }
2049//System.out.println("Saw .*");
2050                        }
2051
2052                        // otherwise, fabricate a category list ("pendingChars") with
2053                        // every category in it
2054                        else {
2055                            StringBuffer   temp = new StringBuffer  ();
2056                            for (int i = 0; i < numCategories; i++)
2057                                temp.append((char)(i + 0x100));
2058                            pendingChars = temp.toString();
2059                        }
2060                    }
2061
2062                    // we'll end up in here for all expressions except for .*, which is
2063                    // special-cased above
2064                    if (pendingChars.length() != 0) {
2065
2066                        // if the expression is followed by an asterisk or a question mark,
2067                        //  then push a copy of the current decision point list onto the stack
2068                        if (p + 1 < rule.length() && (
2069                            rule.charAt(p + 1) == '*' ||
2070                            rule.charAt(p + 1) == '?'
2071                        )) {
2072                            decisionPointStack.push(decisionPointList.clone());
2073                        }
2074
2075                        // create a new state, add it to the list of states to backfill
2076                        // if we have looping states to worry about, set its "don't make
2077                        // me an accepting state" flag if we've seen a slash, and add
2078                        // it to the end of the state table
2079                        int newState = tempStateTable.size();
2080                        if (loopingStates.size() != 0) {
2081                            statesToBackfill.addElement(new Integer  (newState));
2082                        }
2083                        state = new short[numCategories + 1];
2084                        if (sawEarlyBreak) {
2085                            state[numCategories] = DONT_LOOP_FLAG;
2086                        }
2087                        tempStateTable.addElement(state);
2088
2089                        // update everybody in the decision point list to point to
2090                        // the new state (this also performs all the reconciliation
2091                        // needed to make the table deterministic), then clear the
2092                        // decision point list
2093                        updateStateTable(decisionPointList, pendingChars, (short)newState);
2094                        decisionPointList.removeAllElements();
2095
2096                        // add all states created since the last literal character we've
2097                        // seen to the decision point list
2098                        lastState = currentState;
2099                        do {
2100                            ++currentState;
2101                            decisionPointList.addElement(new Integer  (currentState));
2102                        } while (currentState + 1 < tempStateTable.size());
2103                    }
2104                }
2105
2106                // a * or a + denotes a repeating character or group, and a ? denotes an
2107                // optional character group. (*, + and ? after () are handled separately below.)
2108                if (c == '+' || c == '*' || c == '?') {
2109                    // when there's a * or a +, update the current state to loop back on itself
2110                    // on the character categories that caused us to enter this state
2111                    if (c == '*' || c == '+') {
2112                        // Note: we process one state at a time because updateStateTable
2113                        // may add new states, and we want to process them as well.
2114                        for (int i = lastState + 1; i < tempStateTable.size(); i++) {
2115                            Vector   temp = new Vector  ();
2116                            temp.addElement(new Integer  (i));
2117                            updateStateTable(temp, pendingChars, (short)(lastState + 1));
2118                        }
2119
2120                        // If we just added any new states, add them to the decison point list
2121                        // Note: it might be a good idea to avoid adding new states to the
2122                        // decision point list in more than one place...
2123                        while (currentState + 1 < tempStateTable.size()) {
2124                            decisionPointList.addElement(new Integer  (++currentState));
2125                        }
2126                    }
2127
2128                    // for * and ? pop the top element off the decision point stack and merge
2129                    // it with the current decision point list (this causes the divergent
2130                    // paths through the state table to come together again on the next
2131                    // new state)
2132                    if (c == '*' || c == '?') {
2133                        Vector   temp = (Vector  )decisionPointStack.pop();
2134                        for (int i = 0; i < decisionPointList.size(); i++)
2135                            temp.addElement(decisionPointList.elementAt(i));
2136                        decisionPointList = temp;
2137
2138                        // a ? after a * modifies the behavior of * in cases where there is overlap
2139                        // between the set of characters that repeat and the characters which follow.
2140                        // Without the ?, all states following the repeating state, up to a state which
2141                        // is reached by a character that doesn't overlap, will loop back into the
2142                        // repeating state.  With the ?, the mark states following the *? DON'T loop
2143                        // back into the repeating state.  Thus, "[a-z]*xyz" will match the longest
2144                        // sequence of letters that ends in "xyz," while "[a-z]*? will match the
2145                        // _shortest_ sequence of letters that ends in "xyz".
2146                        // We use extra bookkeeping to achieve this effect, since everything else works
2147                        // according to the "longest possible match" principle.  The basic principle
2148                        // is that transitions out of a looping state are written in over the looping
2149                        // value instead of being reconciled, and that we copy the contents of the
2150                        // looping state into empty cells of all non-terminal states that follow the
2151                        // looping state.
2152//System.out.println("c = " + c + ", p = " + p + ", rule.length() = " + rule.length());
2153                        if (c == '*' && p + 1 < rule.length() && rule.charAt(p + 1) == '?') {
2154//System.out.println("Saw *?");
2155                            setLoopingStates(decisionPointList, decisionPointList);
2156                            ++p;
2157                        }
2158                    }
2159                }
2160
2161                // a ( marks the beginning of a sequence of characters.  Parentheses can either
2162                // contain several alternative character sequences (i.e., "(ab|cd|ef)"), or
2163                // they can contain a sequence of characters that can repeat (i.e., "(abc)*").  Thus,
2164                // A () group can have multiple entry and exit points.  To keep track of this,
2165                // we reserve TWO spots on the decision-point stack.  The top of the stack is
2166                // the list of exit points, which becomes the current decision point list when
2167                // the ) is reached.  The next entry down is the decision point list at the
2168                // beginning of the (), which becomes the current decision point list at every
2169                // entry point.
2170                // In addition to keeping track of the exit points and the active decision
2171                // points before the ( (i.e., the places from which the () can be entered),
2172                // we need to keep track of the entry points in case the expression loops
2173                // (i.e., is followed by *).  We do that by creating a dummy state in the
2174                // state table and adding it to the decision point list (BEFORE it's duplicated
2175                // on the stack).  Nobody points to this state, so it'll get optimized out
2176                // at the end.  It exists only to hold the entry points in case the ()
2177                // expression loops.
2178                if (c == '(') {
2179
2180                    // add a new state to the state table to hold the entry points into
2181                    // the () expression
2182                    tempStateTable.addElement(new short[numCategories + 1]);
2183
2184                    // we have to adjust lastState and currentState to account for the
2185                    // new dummy state
2186                    lastState = currentState;
2187                    ++currentState;
2188
2189                    // add the current state to the decision point list (add it at the
2190                    // BEGINNING so we can find it later)
2191                    decisionPointList.insertElementAt(new Integer  (currentState), 0);
2192
2193                    // finally, push a copy of the current decision point list onto the
2194                    // stack (this keeps track of the active decision point list before
2195                    // the () expression), followed by an empty decision point list
2196                    // (this will hold the exit points)
2197                    decisionPointStack.push(decisionPointList.clone());
2198                    decisionPointStack.push(new Vector  ());
2199                }
2200
2201                // a | separates alternative character sequences in a () expression.  When
2202                // a | is encountered, we add the current decision point list to the exit-point
2203                // list, and restore the decision point list to its state prior to the (.
2204                if (c == '|') {
2205
2206                    // pick out the top two decision point lists on the stack
2207                    Vector   oneDown = (Vector  )decisionPointStack.pop();
2208                    Vector   twoDown = (Vector  )decisionPointStack.peek();
2209                    decisionPointStack.push(oneDown);
2210
2211                    // append the current decision point list to the list below it
2212                    // on the stack (the list of exit points), and restore the
2213                    // current decision point list to its state before the () expression
2214                    for (int i = 0; i < decisionPointList.size(); i++)
2215                        oneDown.addElement(decisionPointList.elementAt(i));
2216                    decisionPointList = (Vector  )twoDown.clone();
2217                }
2218
2219                // a ) marks the end of a sequence of characters.  We do one of two things
2220                // depending on whether the sequence repeats (i.e., whether the ) is followed
2221                // by *):  If the sequence doesn't repeat, then the exit-point list is merged
2222                // with the current decision point list and the decision point list from before
2223                // the () is thrown away.  If the sequence does repeat, then we fish out the
2224                // state we were in before the ( and copy all of its forward transitions
2225                // (i.e., every transition added by the () expression) into every state in the
2226                // exit-point list and the current decision point list.  The current decision
2227                // point list is then merged with both the exit-point list AND the saved version
2228                // of the decision point list from before the ().  Then we throw out the *.
2229                if (c == ')') {
2230
2231                    // pull the exit point list off the stack, merge it with the current
2232                    // decision point list, and make the merged version the current
2233                    // decision point list
2234                    Vector   exitPoints = (Vector  )decisionPointStack.pop();
2235                    for (int i = 0; i < decisionPointList.size(); i++)
2236                        exitPoints.addElement(decisionPointList.elementAt(i));
2237                    decisionPointList = exitPoints;
2238
2239                    // if the ) isn't followed by a *, + or ?, then all we have to do is throw
2240                    // away the other list on the decision point stack, and we're done
2241                    if (p + 1 >= rule.length() || (
2242                            rule.charAt(p + 1) != '*' &&
2243                            rule.charAt(p + 1) != '+' &&
2244                            rule.charAt(p + 1) != '?')
2245                    ) {
2246                        decisionPointStack.pop();
2247                    }
2248
2249                    // but if the sequence is conditional or it repeats,
2250                    // we have a lot more work to do...
2251                    else {
2252
2253                        // now exitPoints and decisionPointList have to point to equivalent
2254                        // vectors, but not the SAME vector
2255                        exitPoints = (Vector  )decisionPointList.clone();
2256
2257                        // pop the original decision point list off the stack
2258                        Vector   temp = (Vector  )decisionPointStack.pop();
2259
2260                        // we squirreled away the row number of our entry point list
2261                        // at the beginning of the original decision point list.  Fish
2262                        // that state number out and retrieve the entry point list
2263                        int tempStateNum = ((Integer  )temp.firstElement()).intValue();
2264                        short[] tempState = (short[])tempStateTable.elementAt(tempStateNum);
2265
2266                        // merge the original decision point list with the current
2267                        // decision point list
2268                        if (rule.charAt(p + 1) == '?' || rule.charAt(p + 1) == '*') {
2269                            for (int i = 0; i < decisionPointList.size(); i++)
2270                                temp.addElement(decisionPointList.elementAt(i));
2271                            decisionPointList = temp;
2272                        }
2273
2274                        // finally, for * and + copy every forward reference from the entry point
2275                        // list into every state in the new decision point list
2276                        if (rule.charAt(p + 1) == '+' || rule.charAt(p + 1) == '*') {
2277                            for (int i = 0; i < tempState.length; i++) {
2278                                if (tempState[i] > tempStateNum) {
2279                                    updateStateTable(exitPoints,
2280                                                     new Character  ((char)(i + 0x100)).toString(),
2281                                                     tempState[i]);
2282                                }
2283                            }
2284                        }
2285
2286                        // update lastState and currentState, and throw away the *, +, or ?
2287                        lastState = currentState;
2288                        currentState = tempStateTable.size() - 1;
2289                        ++p;
2290                    }
2291                }
2292
2293                // a / marks the position where the break is to go if the character sequence
2294                // matches this rule.  We update the flag word of every state on the decision
2295                // point list to mark them as ending states, and take note of the fact that
2296                // we've seen the slash
2297                if (c == '/') {
2298                    sawEarlyBreak = true;
2299                    for (int i = 0; i < decisionPointList.size(); i++) {
2300                        state = (short[])tempStateTable.elementAt(((Integer  )decisionPointList.
2301                                        elementAt(i)).intValue());
2302                        state[numCategories] |= LOOKAHEAD_STATE_FLAG;
2303                    }
2304                }
2305
2306                // if we get here without executing any of the above clauses, we have a
2307                // syntax error.  However, for now we just ignore the offending character
2308                // and move on
2309/*
2310debugPrintln("====Parsed \"" + rule.substring(0, p + 1) + "\"...");
2311System.out.println("      currentState = " + currentState);
2312debugPrintVectorOfVectors("      decisionPointStack:", "        ", decisionPointStack);
2313debugPrintVector("        ", decisionPointList);
2314debugPrintVector("      loopingStates = ", loopingStates);
2315debugPrintVector("      statesToBackfill = ", statesToBackfill);
2316System.out.println("      sawEarlyBreak = " + sawEarlyBreak);
2317debugPrintTempStateTable();
2318*/
2319
2320                // clearLoopingStates is a signal back from updateStateTable() that we've
2321                // transitioned to a state that won't loop back to the current looping
2322                // state.  (In other words, we've gotten to a point where we can no longer
2323                // go back into a *? we saw earlier.)  Clear out the list of looping states
2324                // and backfill any states that need to be backfilled.
2325                if (clearLoopingStates) {
2326                    setLoopingStates(null, decisionPointList);
2327                }
2328
2329                // advance to the next character, now that we've processed the current
2330                // character
2331                ++p;
2332            }
2333
2334            // this takes care of backfilling any states that still need to be backfilled
2335            setLoopingStates(null, decisionPointList);
2336
2337            // when we reach the end of the string, we do a postprocessing step to mark the
2338            // end states.  The decision point list contains every state that can transition
2339            // to the end state-- that is, every state that is the last state in a sequence
2340            // that matches the rule.  All of these states are considered "mark states"
2341            // or "accepting states"-- that is, states that cause the position returned from
2342            // next() to be updated.  A mark state represents a possible break position.
2343            // This allows us to look ahead and remember how far the rule matched
2344            // before following the new branch (see next() for more information).
2345            // The temporary state table has an extra "flag column" at the end where this
2346            // information is stored.  We mark the end states by setting a flag in their
2347            // flag column.
2348            // Now if we saw the / in the rule, then everything after it is lookahead
2349            // material and the break really goes where the slash is.  In this case,
2350            // we mark these states as BOTH accepting states and lookahead states.  This
2351            // signals that these states cause the break position to be updated to the
2352            // position of the slash rather than the current break position.
2353            for (int i = 0; i < decisionPointList.size(); i++) {
2354                int rowNum = ((Integer  )decisionPointList.elementAt(i)).intValue();
2355                state = (short[])tempStateTable.elementAt(rowNum);
2356                state[numCategories] |= END_STATE_FLAG;
2357                if (sawEarlyBreak) {
2358                    state[numCategories] |= LOOKAHEAD_STATE_FLAG;
2359                }
2360            }
2361/*
2362debugPrintln("====Parsed \"" + rule + ";");
2363System.out.println();
2364System.out.println("      currentState = " + currentState);
2365debugPrintVectorOfVectors("      decisionPointStack:", "        ", decisionPointStack);
2366debugPrintVector("        ", decisionPointList);
2367debugPrintVector("      loopingStates = ", loopingStates);
2368debugPrintVector("      statesToBackfill = ", statesToBackfill);
2369System.out.println("      sawEarlyBreak = " + sawEarlyBreak);
2370debugPrintTempStateTable();
2371*/
2372        }
2373
2374
2375        /**
2376         * Update entries in the state table, and merge states when necessary to keep
2377         * the table deterministic.
2378         * @param rows The list of rows that need updating (the decision point list)
2379         * @param pendingChars A character category list, encoded in a String.  This is the
2380         * list of the columns that need updating.
2381         * @param newValue Update the cells specfied above to contain this value
2382         */
2383        private void updateStateTable(Vector   rows,
2384                                      String   pendingChars,
2385                                      short newValue) {
2386            // create a dummy state that has the specified row number (newValue) in
2387            // the cells that need to be updated (those specified by pendingChars)
2388            // and 0 in the other cells
2389            short[] newValues = new short[numCategories + 1];
2390            for (int i = 0; i < pendingChars.length(); i++)
2391                newValues[(int)(pendingChars.charAt(i)) - 0x100] = newValue;
2392
2393            // go through the list of rows to update, and update them by calling
2394            // mergeStates() to merge them the the dummy state we created
2395            for (int i = 0; i < rows.size(); i++) {
2396                mergeStates(((Integer  )rows.elementAt(i)).intValue(), newValues, rows);
2397            }
2398        }
2399
2400        /**
2401         * The real work of making the state table deterministic happens here.  This function
2402         * merges a state in the state table (specified by rowNum) with a state that is
2403         * passed in (newValues).  The basic process is to copy the nonzero cells in newStates
2404         * into the state in the state table (we'll call that oldValues).  If there's a
2405         * collision (i.e., if the same cell has a nonzero value in both states, and it's
2406         * not the SAME value), then we have to reconcile the collision.  We do this by
2407         * creating a new state, adding it to the end of the state table, and using this
2408         * function recursively to merge the original two states into a single, combined
2409         * state.  This process may happen recursively (i.e., each successive level may
2410         * involve collisions).  To prevent infinite recursion, we keep a log of merge
2411         * operations.  Any time we're merging two states we've merged before, we can just
2412         * supply the row number for the result of that merge operation rather than creating
2413         * a new state just like it.
2414         * @param rowNum The row number in the state table of the state to be updated
2415         * @param newValues The state to merge it with.
2416         * @param rowsBeingUpdated A copy of the list of rows passed to updateStateTable()
2417         * (itself a copy of the decision point list from parseRule()).  Newly-created
2418         * states get added to the decision point list if their "parents" were on it.
2419         */
2420        private void mergeStates(int rowNum,
2421                                 short[] newValues,
2422                                 Vector   rowsBeingUpdated) {
2423            short[] oldValues = (short[])(tempStateTable.elementAt(rowNum));
2424/*
2425System.out.print("***Merging " + rowNum + ":");
2426for (int i = 0; i < oldValues.length; i++) System.out.print("\t" + oldValues[i]);
2427System.out.println();
2428System.out.print("    with   \t");
2429for (int i = 0; i < newValues.length; i++) System.out.print("\t" + newValues[i]);
2430System.out.println();
2431*/
2432
2433            boolean isLoopingState = loopingStates.contains(new Integer  (rowNum));
2434
2435            // for each of the cells in the rows we're reconciling, do...
2436            for (int i = 0; i < oldValues.length; i++) {
2437
2438                // if they contain the same value, we don't have to do anything
2439                if (oldValues[i] == newValues[i]) {
2440                    continue;
2441                }
2442
2443                // if oldValues is a looping state and the state the current cell points to
2444                // is too, then we can just stomp over the current value of that cell (and
2445                // set the clear-looping-states flag if necessary)
2446                else if (isLoopingState && loopingStates.contains(new Integer  (oldValues[i]))) {
2447                    if (newValues[i] != 0) {
2448                        if (oldValues[i] == 0) {
2449                            clearLoopingStates = true;
2450                        }
2451                        oldValues[i] = newValues[i];
2452                    }
2453                }
2454
2455                // if the current cell in oldValues is 0, copy in the corresponding value
2456                // from newValues
2457                else if (oldValues[i] == 0) {
2458                    oldValues[i] = newValues[i];
2459                }
2460
2461                // the last column of each row is the flag column.  Take care to merge the
2462                // flag words correctly
2463                else if (i == numCategories) {
2464                    oldValues[i] = (short)((newValues[i] & ALL_FLAGS) | oldValues[i]);
2465                }
2466
2467                // if both newValues and oldValues have a nonzero value in the current
2468                // cell, and it isn't the same value both places...
2469                else if (oldValues[i] != 0 && newValues[i] != 0) {
2470
2471                    // look up this pair of cell values in the merge list.  If it's
2472                    // found, update the cell in oldValues to point to the merged state
2473                    int combinedRowNum = searchMergeList(oldValues[i], newValues[i]);
2474                    if (combinedRowNum != 0) {
2475                        oldValues[i] = (short)combinedRowNum;
2476                    }
2477
2478                    // otherwise, we have to reconcile them...
2479                    else {
2480                        // copy our row numbers into variables to make things easier
2481                        int oldRowNum = oldValues[i];
2482                        int newRowNum = newValues[i];
2483                        combinedRowNum = tempStateTable.size();
2484
2485                        // add this pair of row numbers to the merge list (create it first
2486                        // if we haven't created the merge list yet)
2487                        if (mergeList == null) {
2488                            mergeList = new Vector  ();
2489                        }
2490                        mergeList.addElement(new int[] { oldRowNum, newRowNum, combinedRowNum });
2491
2492//System.out.println("***At " + rowNum + ", merging " + oldRowNum + " and " + newRowNum + " into " + combinedRowNum);
2493
2494                        // create a new row to represent the merged state, and copy the
2495                        // contents of oldRow into it, then add it to the end of the
2496                        // state table and update the original row (oldValues) to point
2497                        // to the new, merged, state
2498                        short[] newRow = new short[numCategories + 1];
2499                        short[] oldRow = (short[])(tempStateTable.elementAt(oldRowNum));
2500                        System.arraycopy(oldRow, 0, newRow, 0, numCategories + 1);
2501                        tempStateTable.addElement(newRow);
2502                        oldValues[i] = (short)combinedRowNum;
2503
2504
2505//System.out.println("lastOldRowNum = " + lastOldRowNum);
2506//System.out.println("lastCombinedRowNum = " + lastCombinedRowNum);
2507//System.out.println("decisionPointList.contains(lastOldRowNum) = " + decisionPointList.contains(new Integer(lastOldRowNum)));
2508//System.out.println("decisionPointList.contains(lastCombinedRowNum) = " + decisionPointList.contains(new Integer(lastCombinedRowNum)));
2509
2510                        // if the decision point list contains either of the parent rows,
2511                        // update it to include the new row as well
2512                        if ((decisionPointList.contains(new Integer  (oldRowNum))
2513                                || decisionPointList.contains(new Integer  (newRowNum)))
2514                            && !decisionPointList.contains(new Integer  (combinedRowNum))
2515                        ) {
2516                            decisionPointList.addElement(new Integer  (combinedRowNum));
2517                        }
2518
2519                        // do the same thing with the list of rows being updated
2520                        if ((rowsBeingUpdated.contains(new Integer  (oldRowNum))
2521                                || rowsBeingUpdated.contains(new Integer  (newRowNum)))
2522                            && !rowsBeingUpdated.contains(new Integer  (combinedRowNum))
2523                        ) {
2524                            decisionPointList.addElement(new Integer  (combinedRowNum));
2525                        }
2526                        // now (groan) do the same thing for all the entries on the
2527                        // decision point stack
2528                        for (int k = 0; k < decisionPointStack.size(); k++) {
2529                            Vector   dpl = (Vector  )decisionPointStack.elementAt(k);
2530                            if ((dpl.contains(new Integer  (oldRowNum))
2531                                    || dpl.contains(new Integer  (newRowNum)))
2532                                && !dpl.contains(new Integer  (combinedRowNum))
2533                            ) {
2534                                dpl.addElement(new Integer  (combinedRowNum));
2535                            }
2536                        }
2537
2538                        // FINALLY (puff puff puff), call mergeStates() recursively to copy
2539                        // the row referred to by newValues into the new row and resolve any
2540                        // conflicts that come up at that level
2541                        mergeStates(combinedRowNum, (short[])(tempStateTable.elementAt(
2542                                        newValues[i])), rowsBeingUpdated);
2543                    }
2544                }
2545            }
2546            return;
2547        }
2548
2549        /**
2550         * The merge list is a list of pairs of rows that have been merged somewhere in
2551         * the process of building this state table, along with the row number of the
2552         * row containing the merged state.  This function looks up a pair of row numbers
2553         * and returns the row number of the row they combine into.  (It returns 0 if
2554         * this pair of rows isn't in the merge list.)
2555         */
2556        private int searchMergeList(int a, int b) {
2557            // if there is no merge list, there obviously isn't anything in it
2558            if (mergeList == null) {
2559                return 0;
2560            }
2561
2562            // otherwise, for each element in the merge list...
2563            else {
2564                int[] entry;
2565                for (int i = 0; i < mergeList.size(); i++) {
2566                    entry = (int[])(mergeList.elementAt(i));
2567
2568                    // we have a hit if the two row numbers match the two row numbers
2569                    // in the beginning of the entry (the two that combine), in either
2570                    // order
2571                    if ((entry[0] == a && entry[1] == b) || (entry[0] == b && entry[1] == a)) {
2572                        return entry[2];
2573                    }
2574
2575                    // we also have a hit if one of the two row numbers matches the marged
2576                    // row number and the other one matches one of the original row numbers
2577                    if ((entry[2] == a && (entry[0] == b || entry[1] == b))) {
2578                        return entry[2];
2579                    }
2580                    if ((entry[2] == b && (entry[0] == a || entry[1] == a))) {
2581                        return entry[2];
2582                    }
2583                }
2584                return 0;
2585            }
2586        }
2587
2588        /**
2589         * This function is used to update the list of current loooping states (i.e.,
2590         * states that are controlled by a *? construct).  It backfills values from
2591         * the looping states into unpopulated cells of the states that are currently
2592         * marked for backfilling, and then updates the list of looping states to be
2593         * the new list
2594         * @param newLoopingStates The list of new looping states
2595         * @param endStates The list of states to treat as end states (states that
2596         * can exit the loop).
2597         */
2598        private void setLoopingStates(Vector   newLoopingStates, Vector   endStates) {
2599
2600            // if the current list of looping states isn't empty, we have to backfill
2601            // values from the looping states into the states that are waiting to be
2602            // backfilled
2603            if (!loopingStates.isEmpty()) {
2604                int loopingState = ((Integer  )loopingStates.lastElement()).intValue();
2605                int rowNum;
2606
2607                // don't backfill into an end state OR any state reachable from an end state
2608                // (since the search for reachable states is recursive, it's split out into
2609                // a separate function, eliminateBackfillStates(), below)
2610                for (int i = 0; i < endStates.size(); i++) {
2611                    eliminateBackfillStates(((Integer  )endStates.elementAt(i)).intValue());
2612                }
2613
2614                // we DON'T actually backfill the states that need to be backfilled here.
2615                // Instead, we MARK them for backfilling.  The reason for this is that if
2616                // there are multiple rules in the state-table description, the looping
2617                // states may have some of their values changed by a succeeding rule, and
2618                // this wouldn't be reflected in the backfilled states.  We mark a state
2619                // for backfilling by putting the row number of the state to copy from
2620                // into the flag cell at the end of the row
2621                for (int i = 0; i < statesToBackfill.size(); i++) {
2622                    rowNum = ((Integer  )statesToBackfill.elementAt(i)).intValue();
2623                    short[] state = (short[])tempStateTable.elementAt(rowNum);
2624                    state[numCategories] =
2625                        (short)((state[numCategories] & ALL_FLAGS) | loopingState);
2626                }
2627                statesToBackfill.removeAllElements();
2628                loopingStates.removeAllElements();
2629            }
2630
2631            if (newLoopingStates != null) {
2632                loopingStates = (Vector  )newLoopingStates.clone();
2633            }
2634        }
2635
2636        /**
2637         * This removes "ending states" and states reachable from them from the
2638         * list of states to backfill.
2639         * @param The row number of the state to remove from the backfill list
2640         */
2641        private void eliminateBackfillStates(int baseState) {
2642
2643            // don't do anything unless this state is actually in the backfill list...
2644            if (statesToBackfill.contains(new Integer  (baseState))) {
2645
2646                // if it is, take it out
2647                statesToBackfill.removeElement(new Integer  (baseState));
2648
2649                // then go through and recursively call this function for every
2650                // state that the base state points to
2651                short[] state = (short[])tempStateTable.elementAt(baseState);
2652                for (int i = 0; i < numCategories; i++) {
2653                    if (state[i] != 0) {
2654                        eliminateBackfillStates(state[i]);
2655                    }
2656                }
2657            }
2658        }
2659
2660        /**
2661         * This function completes the backfilling process by actually doing the
2662         * backfilling on the states that are marked for it
2663         */
2664        private void backfillLoopingStates() {
2665            short[] state;
2666            short[] loopingState = null;
2667            int loopingStateRowNum = 0;
2668            int fromState;
2669
2670            // for each state in the state table...
2671            for (int i = 0; i < tempStateTable.size(); i++) {
2672                state = (short[])tempStateTable.elementAt(i);
2673
2674                // check the state's flag word to see if it's marked for backfilling
2675                // (it's marked for backfilling if any bits other than the two high-order
2676                // bits are set-- if they are, then the flag word, minus the two high bits,
2677                // is the row number to copy from)
2678                fromState = state[numCategories] & ~ALL_FLAGS;
2679                if (fromState > 0) {
2680
2681                    // load up the state to copy from (if we haven't already)
2682                    if (fromState != loopingStateRowNum) {
2683                        loopingStateRowNum = fromState;
2684                        loopingState = (short[])tempStateTable.elementAt(loopingStateRowNum);
2685                    }
2686
2687                    // clear out the backfill part of the flag word
2688                    state[numCategories] &= ALL_FLAGS;
2689
2690                    // then fill all zero cells in the current state with values
2691                    // from the corresponding cells of the fromState
2692                    for (int j = 0; j < state.length; j++) {
2693                        if (state[j] == 0) {
2694                            state[j] = loopingState[j];
2695                        }
2696                        else if (state[j] == DONT_LOOP_FLAG) {
2697                            state[j] = 0;
2698                        }
2699                    }
2700                }
2701            }
2702        }
2703
2704        /**
2705         * This function completes the state-table-building process by doing several
2706         * postprocessing steps and copying everything into its final resting place
2707         * in the iterator itself
2708         * @param forward True if we're working on the forward state table
2709         */
2710        private void finishBuildingStateTable(boolean forward) {
2711//debugPrintTempStateTable();
2712            // start by backfilling the looping states
2713            backfillLoopingStates();
2714//debugPrintTempStateTable();
2715
2716            int[] rowNumMap = new int[tempStateTable.size()];
2717            Stack   rowsToFollow = new Stack  ();
2718            rowsToFollow.push(new Integer  (1));
2719            rowNumMap[1] = 1;
2720
2721            // determine which states are no longer reachable from the start state
2722            // (the reachable states will have their row numbers in the row number
2723            // map, and the nonreachable states will have zero in the row number map)
2724            while (rowsToFollow.size() != 0) {
2725                int rowNum = ((Integer  )rowsToFollow.pop()).intValue();
2726                short[] row = (short[])(tempStateTable.elementAt(rowNum));
2727
2728                for (int i = 0; i < numCategories; i++) {
2729                    if (row[i] != 0) {
2730                        if (rowNumMap[row[i]] == 0) {
2731                            rowNumMap[row[i]] = row[i];
2732                            rowsToFollow.push(new Integer  (row[i]));
2733                        }
2734                    }
2735                }
2736            }
2737/*
2738System.out.println("The following rows are not reachable:");
2739for (int i = 1; i < rowNumMap.length; i++)
2740if (rowNumMap[i] == 0) System.out.print("\t" + i);
2741System.out.println();
2742*/
2743
2744            // variable not used boolean madeChange;
2745            int newRowNum;
2746
2747            // algorithm for minimizing the number of states in the table adapted from
2748            // Aho & Ullman, "Principles of Compiler Design"
2749            // The basic idea here is to organize the states into classes.  When we're done,
2750            // all states in the same class can be considered identical and all but one eliminated.
2751
2752            // initially assign states to classes based on the number of populated cells they
2753            // contain (the class number is the number of populated cells)
2754            int[] stateClasses = new int[tempStateTable.size()];
2755            int nextClass = numCategories + 1;
2756            short[] state1, state2;
2757            for (int i = 1; i < stateClasses.length; i++) {
2758                if (rowNumMap[i] == 0) {
2759                    continue;
2760                }
2761                state1 = (short[])tempStateTable.elementAt(i);
2762                for (int j = 0; j < numCategories; j++) {
2763                    if (state1[j] != 0) {
2764                        ++stateClasses[i];
2765                    }
2766                }
2767                if (stateClasses[i] == 0) {
2768                    stateClasses[i] = nextClass;
2769                }
2770            }
2771            ++nextClass;
2772
2773            // then, for each class, elect the first member of that class as that class's
2774            // "representative".  For each member of the class, compare it to the "representative."
2775            // If there's a column position where the state being tested transitions to a
2776            // state in a DIFFERENT class from the class where the "representative" transitions,
2777            // then move the state into a new class.  Repeat this process until no new classes
2778            // are created.
2779            int currentClass;
2780            int lastClass;
2781            boolean split;
2782
2783            do {
2784//System.out.println("Making a pass...");
2785                currentClass = 1;
2786                lastClass = nextClass;
2787                while (currentClass < nextClass) {
2788//System.out.print("States in class #" + currentClass +":");
2789                    split = false;
2790                    state1 = state2 = null;
2791                    for (int i = 0; i < stateClasses.length; i++) {
2792                        if (stateClasses[i] == currentClass) {
2793//System.out.print("\t" + i);
2794                            if (state1 == null) {
2795                                state1 = (short[])tempStateTable.elementAt(i);
2796                            }
2797                            else {
2798                                state2 = (short[])tempStateTable.elementAt(i);
2799                                for (int j = 0; j < state2.length; j++)
2800                                    if ((j == numCategories && state1[j] != state2[j] && forward)
2801                                            || (j != numCategories && stateClasses[state1[j]]
2802                                            != stateClasses[state2[j]])) {
2803                                        stateClasses[i] = nextClass;
2804                                        split = true;
2805                                        break;
2806                                    }
2807                            }
2808                        }
2809                    }
2810                    if (split) {
2811                        ++nextClass;
2812                    }
2813                    ++currentClass;
2814//System.out.println();
2815                }
2816            } while (lastClass != nextClass);
2817
2818            // at this point, all of the states in a class except the first one (the
2819            //"representative") can be eliminated, so update the row-number map accordingly
2820            int[] representatives = new int[nextClass];
2821            for (int i = 1; i < stateClasses.length; i++)
2822                if (representatives[stateClasses[i]] == 0) {
2823                    representatives[stateClasses[i]] = i;
2824                }
2825                else {
2826                    rowNumMap[i] = representatives[stateClasses[i]];
2827                }
2828//System.out.println("Renumbering...");
2829
2830            // renumber all remaining rows...
2831            // first drop all that are either unreferenced or not a class representative
2832            for (int i = 1; i < rowNumMap.length; i++) {
2833                if (rowNumMap[i] != i) {
2834                    tempStateTable.setElementAt(null, i);
2835                }
2836            }
2837
2838            // then calculate everybody's new row number and update the row
2839            // number map appropriately (the first pass updates the row numbers
2840            // of all the class representatives [the rows we're keeping], and the
2841            // second pass updates the cross references for all the rows that
2842            // are being deleted)
2843            newRowNum = 1;
2844            for (int i = 1; i < rowNumMap.length; i++) {
2845                if (tempStateTable.elementAt(i) != null) {
2846                    rowNumMap[i] = newRowNum++;
2847                }
2848            }
2849            for (int i = 1; i < rowNumMap.length; i++) {
2850                if (tempStateTable.elementAt(i) == null) {
2851                    rowNumMap[i] = rowNumMap[rowNumMap[i]];
2852                }
2853            }
2854//for (int i = 1; i < rowNumMap.length; i++) rowNumMap[i] = i; int newRowNum = rowNumMap.length;
2855
2856            // allocate the permanent state table, and copy the remaining rows into it
2857            // (adjusting all the cell values, of course)
2858
2859            // this section does that for the forward state table
2860            if (forward) {
2861                endStates = new boolean[newRowNum];
2862                lookaheadStates = new boolean[newRowNum];
2863                stateTable = new short[newRowNum * numCategories];
2864                int p = 0;
2865                int p2 = 0;
2866                for (int i = 0; i < tempStateTable.size(); i++) {
2867                    short[] row = (short[])(tempStateTable.elementAt(i));
2868                    if (row == null) {
2869                        continue;
2870                    }
2871                    for (int j = 0; j < numCategories; j++) {
2872                        stateTable[p] = (short)(rowNumMap[row[j]]);
2873                        ++p;
2874                    }
2875                    endStates[p2] = ((row[numCategories] & END_STATE_FLAG) != 0);
2876                    lookaheadStates[p2] = ((row[numCategories] & LOOKAHEAD_STATE_FLAG) != 0);
2877                    ++p2;
2878                }
2879            }
2880
2881            // and this section does it for the backward state table
2882            else {
2883                backwardsStateTable = new short[newRowNum * numCategories];
2884                int p = 0;
2885                for (int i = 0; i < tempStateTable.size(); i++) {
2886                    short[] row = (short[])(tempStateTable.elementAt(i));
2887                    if (row == null) {
2888                        continue;
2889                    }
2890                    for (int j = 0; j < numCategories; j++) {
2891                        backwardsStateTable[p] = (short)(rowNumMap[row[j]]);
2892                        ++p;
2893                    }
2894                }
2895            }
2896        }
2897
2898        /**
2899         * This function builds the backward state table from the forward state
2900         * table and any additional rules (identified by the ! on the front)
2901         * supplied in the description
2902         */
2903        private void buildBackwardsStateTable(Vector   tempRuleList) {
2904
2905            // create the temporary state table and seed it with two rows (row 0
2906            // isn't used for anything, and we have to create row 1 (the initial
2907            // state) before we can do anything else
2908            tempStateTable = new Vector  ();
2909            tempStateTable.addElement(new short[numCategories + 1]);
2910            tempStateTable.addElement(new short[numCategories + 1]);
2911
2912            // although the backwards state table is built automatically from the forward
2913            // state table, there are some situations (the default sentence-break rules,
2914            // for example) where this doesn't yield enough stop states, causing a dramatic
2915            // drop in performance.  To help with these cases, the user may supply
2916            // supplemental rules that are added to the backward state table.  These have
2917            // the same syntax as the normal break rules, but begin with '!' to distinguish
2918            // them from normal break rules
2919            for (int i = 0; i < tempRuleList.size(); i++) {
2920                String   rule = (String  )tempRuleList.elementAt(i);
2921                if (rule.charAt(0) == '!') {
2922                    parseRule(rule.substring(1), false);
2923                }
2924            }
2925            backfillLoopingStates();
2926
2927            // Backwards iteration is qualitatively different from forwards iteration.
2928            // This is because backwards iteration has to be made to operate from no context
2929            // at all-- the user should be able to ask BreakIterator for the break position
2930            // immediately on either side of some arbitrary offset in the text.  The
2931            // forward iteration table doesn't let us do that-- it assumes complete
2932            // information on the context, which means starting from the beginning of the
2933            // document.
2934            // The way we do backward and random-access iteration is to back up from the
2935            // current (or user-specified) position until we see something we're sure is
2936            // a break position (it may not be the last break position immediately
2937            // preceding our starting point, however).  Then we roll forward from there to
2938            // locate the actual break position we're after.
2939            // This means that the backwards state table doesn't have to identify every
2940            // break position, allowing the building algorithm to be much simpler.  Here,
2941            // we use a "pairs" approach, scanning the forward-iteration state table for
2942            // pairs of character categories we ALWAYS break between, and building a state
2943            // table from that information.  No context is required-- all this state table
2944            // looks at is a pair of adjacent characters.
2945
2946            // It's possible that the user has supplied supplementary rules (see above).
2947            // This has to be done first to keep parseRule() and friends from becoming
2948            // EVEN MORE complicated.  The automatically-generated states are appended
2949            // onto the end of the state table, and then the two sets of rules are
2950            // stitched together at the end.  Take note of the row number of the
2951            // first row of the auromatically-generated part.
2952            int backTableOffset = tempStateTable.size();
2953            if (backTableOffset > 2) {
2954                ++backTableOffset;
2955            }
2956
2957            // the automatically-generated part of the table models a two-dimensional
2958            // array where the two dimensions represent the two characters we're currently
2959            // looking at.  To model this as a state table, we actually need one additional
2960            // row to represent the initial state.  It gets populated with the row numbers
2961            // of the other rows (in order).
2962            for (int i = 0; i < numCategories + 1; i++)
2963                tempStateTable.addElement(new short[numCategories + 1]);
2964
2965            short[] state = (short[])tempStateTable.elementAt(backTableOffset - 1);
2966            for (int i = 0; i < numCategories; i++)
2967                state[i] = (short)(i + backTableOffset);
2968
2969            // scavenge the forward state table for pairs of character categories
2970            // that always have a break between them.  The algorithm is as follows:
2971            // Look down each column in the state table.  For each nonzero cell in
2972            // that column, look up the row it points to.  For each nonzero cell in
2973            // that row, populate a cell in the backwards state table: the row number
2974            // of that cell is the number of the column we were scanning (plus the
2975            // offset that locates this sub-table), and the column number of that cell
2976            // is the column number of the nonzero cell we just found.  This cell is
2977            // populated with its own column number (adjusted according to the actual
2978            // location of the sub-table).  This process will produce a state table
2979            // whose behavior is the same as looking up successive pairs of characters
2980            // in an array of Booleans to determine whether there is a break.
2981            int numRows = stateTable.length / numCategories;
2982            for (int column = 0; column < numCategories; column++) {
2983                for (int row = 0; row < numRows; row++) {
2984                    int nextRow = lookupState(row, column);
2985                    if (nextRow != 0) {
2986                        for (int nextColumn = 0; nextColumn < numCategories; nextColumn++) {
2987                            int cellValue = lookupState(nextRow, nextColumn);
2988                            if (cellValue != 0) {
2989                                state = (short[])tempStateTable.elementAt(nextColumn +
2990                                                backTableOffset);
2991                                state[column] = (short)(column + backTableOffset);
2992                            }
2993                        }
2994                    }
2995                }
2996            }
2997
2998//debugPrintTempStateTable();
2999            // if the user specified some backward-iteration rules with the ! token,
3000            // we have to merge the resulting state table with the auto-generated one
3001            // above.  First copy the populated cells from row 1 over the populated
3002            // cells in the auto-generated table.  Then copy values from row 1 of the
3003            // auto-generated table into all of the the unpopulated cells of the
3004            // rule-based table.
3005            if (backTableOffset > 1) {
3006
3007                // for every row in the auto-generated sub-table, if a cell is
3008                // populated that is also populated in row 1 of the rule-based
3009                // sub-table, copy the value from row 1 over the value in the
3010                // auto-generated sub-table
3011                state = (short[])tempStateTable.elementAt(1);
3012                for (int i = backTableOffset - 1; i < tempStateTable.size(); i++) {
3013                    short[] state2 = (short[])tempStateTable.elementAt(i);
3014                    for (int j = 0; j < numCategories; j++) {
3015                        if (state[j] != 0 && state2[j] != 0) {
3016                            state2[j] = state[j];
3017                        }
3018                    }
3019                }
3020
3021                // now, for every row in the rule-based sub-table that is not
3022                // an end state, fill in all unpopulated cells with the values
3023                // of the corresponding cells in the first row of the auto-
3024                // generated sub-table.
3025                state = (short[])tempStateTable.elementAt(backTableOffset - 1);
3026                for (int i = 1; i < backTableOffset - 1; i++) {
3027                    short[] state2 = (short[])tempStateTable.elementAt(i);
3028                    if ((state2[numCategories] & END_STATE_FLAG) == 0) {
3029                        for (int j = 0; j < numCategories; j++) {
3030                            if (state2[j] == 0) {
3031                                state2[j] = state[j];
3032                            }
3033                        }
3034                    }
3035                }
3036            }
3037
3038//debugPrintTempStateTable();
3039
3040            // finally, clean everything up and copy it into the actual BreakIterator
3041            // by calling finishBuildingStateTable()
3042            finishBuildingStateTable(false);
3043/*
3044System.out.print("C:\t");
3045for (int i = 0; i < numCategories; i++)
3046System.out.print(Integer.toString(i) + "\t");
3047System.out.println(); System.out.print("=================================================");
3048for (int i = 0; i < backwardsStateTable.length; i++) {
3049if (i % numCategories == 0) {
3050System.out.println();
3051System.out.print(Integer.toString(i / numCategories) + ":\t");
3052}
3053if (backwardsStateTable[i] == 0) System.out.print(".\t"); else System.out.print(Integer.toString(backwardsStateTable[i]) + "\t");
3054}
3055System.out.println();
3056*/
3057        }
3058
3059        /**
3060         * Throws an IllegalArgumentException representing a syntax error in the rule
3061         * description.  The exception's message contains some debugging information.
3062         * @param message A message describing the problem
3063         * @param position The position in the description where the problem was
3064         * discovered
3065         * @param context The string containing the error
3066         * @internal
3067         */
3068        protected void error(String   message, int position, String   context) {
3069            throw new IllegalArgumentException  ("Parse error: " + message + "\n" +
3070                    Utility.escape(context.substring(0, position)) + "\n\n" +
3071                    Utility.escape(context.substring(position)));
3072        }
3073
3074    ///CLOVER:OFF
3075        /**
3076         * @internal
3077         */
3078        protected void debugPrintVector(String   label, Vector   v) {
3079            System.out.print(label);
3080            for (int i = 0; i < v.size(); i++)
3081                System.out.print(v.elementAt(i).toString() + "\t");
3082            System.out.println();
3083        }
3084
3085        /**
3086         * @internal
3087         */
3088        protected void debugPrintVectorOfVectors(String   label1, String   label2, Vector   v) {
3089            System.out.println(label1);
3090            for (int i = 0; i < v.size(); i++)
3091                debugPrintVector(label2, (Vector  )v.elementAt(i));
3092        }
3093
3094        /**
3095         * @internal
3096         */
3097        protected void debugPrintTempStateTable() {
3098            System.out.println("      tempStateTable:");
3099            System.out.print("        C:\t");
3100            for (int i = 0; i <= numCategories; i++)
3101                System.out.print(Integer.toString(i) + "\t");
3102            System.out.println();
3103            for (int i = 1; i < tempStateTable.size(); i++) {
3104                short[] row = (short[])(tempStateTable.elementAt(i));
3105                System.out.print("        " + i + ":\t");
3106                for (int j = 0; j < row.length; j++) {
3107                    if (row[j] == 0) {
3108                        System.out.print(".\t");
3109                    }
3110                    else {
3111                        System.out.print(Integer.toString(row[j]) + "\t");
3112                    }
3113                }
3114                System.out.println();
3115            }
3116        }
3117
3118    }
3119    ///CLOVER:ON
3120
3121    /*
3122     * This class exists to work around a bug in HotJava's implementation
3123     * of CharacterIterator, which incorrectly handles setIndex(endIndex).
3124     * This iterator relies only on base.setIndex(n) where n is less than
3125     * endIndex.
3126     *
3127     * One caveat:  if the base iterator's begin and end indices change
3128     * the change will not be reflected by this wrapper.  Does that matter?
3129     */
3130    ///CLOVER:OFF
3131    // Only used for HotJava, so clover won't encounter it
3132    private static final class SafeCharIterator implements CharacterIterator  ,
3133                                                           Cloneable   {
3134
3135        private CharacterIterator   base;
3136        private int rangeStart;
3137        private int rangeLimit;
3138        private int currentIndex;
3139
3140        SafeCharIterator(CharacterIterator   base) {
3141            this.base = base;
3142            this.rangeStart = base.getBeginIndex();
3143            this.rangeLimit = base.getEndIndex();
3144            this.currentIndex = base.getIndex();
3145        }
3146
3147        public char first() {
3148            return setIndex(rangeStart);
3149        }
3150
3151        public char last() {
3152            return setIndex(rangeLimit - 1);
3153        }
3154
3155        public char current() {
3156            if (currentIndex < rangeStart || currentIndex >= rangeLimit) {
3157                return DONE;
3158            }
3159            else {
3160                return base.setIndex(currentIndex);
3161            }
3162        }
3163
3164        public char next() {
3165
3166            currentIndex++;
3167            if (currentIndex >= rangeLimit) {
3168                currentIndex = rangeLimit;
3169                return DONE;
3170            }
3171            else {
3172                return base.setIndex(currentIndex);
3173            }
3174        }
3175
3176        public char previous() {
3177
3178            currentIndex--;
3179            if (currentIndex < rangeStart) {
3180                currentIndex = rangeStart;
3181                return DONE;
3182            }
3183            else {
3184                return base.setIndex(currentIndex);
3185            }
3186        }
3187
3188        public char setIndex(int i) {
3189
3190            if (i < rangeStart || i > rangeLimit) {
3191                throw new IllegalArgumentException  ("Invalid position");
3192            }
3193            currentIndex = i;
3194            return current();
3195        }
3196
3197        public int getBeginIndex() {
3198            return rangeStart;
3199        }
3200
3201        public int getEndIndex() {
3202            return rangeLimit;
3203        }
3204
3205        public int getIndex() {
3206            return currentIndex;
3207        }
3208
3209        public Object   clone() {
3210
3211            SafeCharIterator copy = null;
3212            try {
3213                copy = (SafeCharIterator) super.clone();
3214            }
3215            catch(CloneNotSupportedException   e) {
3216                throw new Error  ("Clone not supported: " + e);
3217            }
3218
3219            CharacterIterator   copyOfBase = (CharacterIterator  ) base.clone();
3220            copy.base = copyOfBase;
3221            return copy;
3222        }
3223    }
3224    ///CLOVER:ON
3225
3226    ///CLOVER:OFF
3227   /**
3228     * @internal
3229     */
3230    public static void debugPrintln(String   s) {
3231        final String   zeros = "0000";
3232        String   temp;
3233        StringBuffer   out = new StringBuffer  ();
3234        for (int i = 0; i < s.length(); i++) {
3235            char c = s.charAt(i);
3236            if (c >= ' ' && c < '\u007f') {
3237                out.append(c);
3238            }
3239            else {
3240                out.append("\\u");
3241                temp = Integer.toHexString((int)c);
3242                out.append(zeros.substring(0, 4 - temp.length()));
3243                out.append(temp);
3244            }
3245        }
3246        System.out.println(out);
3247    }
3248    ///CLOVER:ON
3249}
3250
3251
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags