KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > ibm > icu > text > TransliteratorParser


1 /*
2 **********************************************************************
3 * Copyright (c) 2001-2006, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */

7 package com.ibm.icu.text;
8
9 import com.ibm.icu.impl.data.ResourceReader;
10 import com.ibm.icu.impl.Utility;
11 import java.util.Vector JavaDoc;
12 import java.util.Hashtable JavaDoc;
13 import java.text.ParsePosition JavaDoc;
14 import com.ibm.icu.lang.*;
15 import com.ibm.icu.impl.UCharacterProperty;
16
17 class TransliteratorParser {
18
19     //----------------------------------------------------------------------
20
// Data members
21
//----------------------------------------------------------------------
22

23     /**
24      * PUBLIC data member.
25      * A Vector of RuleBasedTransliterator.Data objects, one for each discrete group
26      * of rules in the rule set
27      */

28     public Vector JavaDoc dataVector;
29
30     /**
31      * PUBLIC data member.
32      * A Vector of Strings containing all of the ID blocks in the rule set
33      */

34     public Vector JavaDoc idBlockVector;
35
36     /**
37      * The current data object for which we are parsing rules
38      */

39     private RuleBasedTransliterator.Data curData;
40
41     /**
42      * PUBLIC data member containing the parsed compound filter, if any.
43      */

44     public UnicodeSet compoundFilter;
45
46
47     private int direction;
48
49     /**
50      * Temporary symbol table used during parsing.
51      */

52     private ParseData parseData;
53
54     /**
55      * Temporary vector of set variables. When parsing is complete, this
56      * is copied into the array data.variables. As with data.variables,
57      * element 0 corresponds to character data.variablesBase.
58      */

59     private Vector JavaDoc variablesVector;
60
61     /**
62      * Temporary table of variable names. When parsing is complete, this is
63      * copied into data.variableNames.
64      */

65     private Hashtable JavaDoc variableNames;
66
67     /**
68      * String of standins for segments. Used during the parsing of a single
69      * rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds
70      * to StringMatcher object segmentObjects.elementAt(0), etc.
71      */

72     private StringBuffer JavaDoc segmentStandins;
73
74     /**
75      * Vector of StringMatcher objects for segments. Used during the
76      * parsing of a single rule.
77      * segmentStandins.charAt(0) is the standin for "$1" and corresponds
78      * to StringMatcher object segmentObjects.elementAt(0), etc.
79      */

80     private Vector JavaDoc segmentObjects;
81
82     /**
83      * The next available stand-in for variables. This starts at some point in
84      * the private use area (discovered dynamically) and increments up toward
85      * <code>variableLimit</code>. At any point during parsing, available
86      * variables are <code>variableNext..variableLimit-1</code>.
87      */

88     private char variableNext;
89
90     /**
91      * The last available stand-in for variables. This is discovered
92      * dynamically. At any point during parsing, available variables are
93      * <code>variableNext..variableLimit-1</code>. During variable definition
94      * we use the special value variableLimit-1 as a placeholder.
95      */

96     private char variableLimit;
97
98     /**
99      * When we encounter an undefined variable, we do not immediately signal
100      * an error, in case we are defining this variable, e.g., "$a = [a-z];".
101      * Instead, we save the name of the undefined variable, and substitute
102      * in the placeholder char variableLimit - 1, and decrement
103      * variableLimit.
104      */

105     private String JavaDoc undefinedVariableName;
106
107     /**
108      * The stand-in character for the 'dot' set, represented by '.' in
109      * patterns. This is allocated the first time it is needed, and
110      * reused thereafter.
111      */

112     private int dotStandIn = -1;
113
114     //----------------------------------------------------------------------
115
// Constants
116
//----------------------------------------------------------------------
117

118     // Indicator for ID blocks
119
private static final String JavaDoc ID_TOKEN = "::";
120     private static final int ID_TOKEN_LEN = 2;
121
122 /*
123 (reserved for future expansion)
124     // markers for beginning and end of rule groups
125     private static final String BEGIN_TOKEN = "BEGIN";
126     private static final String END_TOKEN = "END";
127 */

128
129     // Operators
130
private static final char VARIABLE_DEF_OP = '=';
131     private static final char FORWARD_RULE_OP = '>';
132     private static final char REVERSE_RULE_OP = '<';
133     private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op
134

135     private static final String JavaDoc OPERATORS = "=><\u2190\u2192\u2194";
136     private static final String JavaDoc HALF_ENDERS = "=><\u2190\u2192\u2194;";
137
138     // Other special characters
139
private static final char QUOTE = '\'';
140     private static final char ESCAPE = '\\';
141     private static final char END_OF_RULE = ';';
142     private static final char RULE_COMMENT_CHAR = '#';
143
144     private static final char CONTEXT_ANTE = '{'; // ante{key
145
private static final char CONTEXT_POST = '}'; // key}post
146
private static final char CURSOR_POS = '|';
147     private static final char CURSOR_OFFSET = '@';
148     private static final char ANCHOR_START = '^';
149
150     private static final char KLEENE_STAR = '*';
151     private static final char ONE_OR_MORE = '+';
152     private static final char ZERO_OR_ONE = '?';
153
154     private static final char DOT = '.';
155     private static final String JavaDoc DOT_SET = "[^[:Zp:][:Zl:]\\r\\n$]";
156
157     // By definition, the ANCHOR_END special character is a
158
// trailing SymbolTable.SYMBOL_REF character.
159
// private static final char ANCHOR_END = '$';
160

161     // Segments of the input string are delimited by "(" and ")". In the
162
// output string these segments are referenced as "$1", "$2", etc.
163
private static final char SEGMENT_OPEN = '(';
164     private static final char SEGMENT_CLOSE = ')';
165
166     // A function is denoted &Source-Target/Variant(text)
167
private static final char FUNCTION = '&';
168
169     // Aliases for some of the syntax characters. These are provided so
170
// transliteration rules can be expressed in XML without clashing with
171
// XML syntax characters '<', '>', and '&'.
172
private static final char ALT_REVERSE_RULE_OP = '\u2190'; // Left Arrow
173
private static final char ALT_FORWARD_RULE_OP = '\u2192'; // Right Arrow
174
private static final char ALT_FWDREV_RULE_OP = '\u2194'; // Left Right Arrow
175
private static final char ALT_FUNCTION = '\u2206'; // Increment (~Greek Capital Delta)
176

177     // Special characters disallowed at the top level
178
private static UnicodeSet ILLEGAL_TOP = new UnicodeSet("[\\)]");
179
180     // Special characters disallowed within a segment
181
private static UnicodeSet ILLEGAL_SEG = new UnicodeSet("[\\{\\}\\|\\@]");
182
183     // Special characters disallowed within a function argument
184
private static UnicodeSet ILLEGAL_FUNC = new UnicodeSet("[\\^\\(\\.\\*\\+\\?\\{\\}\\|\\@]");
185
186     //----------------------------------------------------------------------
187
// class ParseData
188
//----------------------------------------------------------------------
189

190     /**
191      * This class implements the SymbolTable interface. It is used
192      * during parsing to give UnicodeSet access to variables that
193      * have been defined so far. Note that it uses variablesVector,
194      * _not_ data.variables.
195      */

196     private class ParseData implements SymbolTable {
197
198         /**
199          * Implement SymbolTable API.
200          */

201         public char[] lookup(String JavaDoc name) {
202             return (char[]) variableNames.get(name);
203         }
204
205         /**
206          * Implement SymbolTable API.
207          */

208         public UnicodeMatcher lookupMatcher(int ch) {
209             // Note that we cannot use data.lookup() because the
210
// set array has not been constructed yet.
211
int i = ch - curData.variablesBase;
212             if (i >= 0 && i < variablesVector.size()) {
213                 return (UnicodeMatcher) variablesVector.elementAt(i);
214             }
215             return null;
216         }
217
218         /**
219          * Implement SymbolTable API. Parse out a symbol reference
220          * name.
221          */

222         public String JavaDoc parseReference(String JavaDoc text, ParsePosition JavaDoc pos, int limit) {
223             int start = pos.getIndex();
224             int i = start;
225             while (i < limit) {
226                 char c = text.charAt(i);
227                 if ((i==start && !Character.isUnicodeIdentifierStart(c)) ||
228                     !Character.isUnicodeIdentifierPart(c)) {
229                     break;
230                 }
231                 ++i;
232             }
233             if (i == start) { // No valid name chars
234
return null;
235             }
236             pos.setIndex(i);
237             return text.substring(start, i);
238         }
239
240         /**
241          * Return true if the given character is a matcher standin or a plain
242          * character (non standin).
243          */

244         public boolean isMatcher(int ch) {
245             // Note that we cannot use data.lookup() because the
246
// set array has not been constructed yet.
247
int i = ch - curData.variablesBase;
248             if (i >= 0 && i < variablesVector.size()) {
249                 return variablesVector.elementAt(i) instanceof UnicodeMatcher;
250             }
251             return true;
252         }
253
254         /**
255          * Return true if the given character is a replacer standin or a plain
256          * character (non standin).
257          */

258         public boolean isReplacer(int ch) {
259             // Note that we cannot use data.lookup() because the
260
// set array has not been constructed yet.
261
int i = ch - curData.variablesBase;
262             if (i >= 0 && i < variablesVector.size()) {
263                 return variablesVector.elementAt(i) instanceof UnicodeReplacer;
264             }
265             return true;
266         }
267     }
268
269     //----------------------------------------------------------------------
270
// classes RuleBody, RuleArray, and RuleReader
271
//----------------------------------------------------------------------
272

273     /**
274      * A private abstract class representing the interface to rule
275      * source code that is broken up into lines. Handles the
276      * folding of lines terminated by a backslash. This folding
277      * is limited; it does not account for comments, quotes, or
278      * escapes, so its use to be limited.
279      */

280     private static abstract class RuleBody {
281
282         /**
283          * Retrieve the next line of the source, or return null if
284          * none. Folds lines terminated by a backslash into the
285          * next line, without regard for comments, quotes, or
286          * escapes.
287          */

288         String JavaDoc nextLine() {
289             String JavaDoc s = handleNextLine();
290             if (s != null &&
291                 s.length() > 0 &&
292                 s.charAt(s.length() - 1) == '\\') {
293
294                 StringBuffer JavaDoc b = new StringBuffer JavaDoc(s);
295                 do {
296                     b.deleteCharAt(b.length()-1);
297                     s = handleNextLine();
298                     if (s == null) {
299                         break;
300                     }
301                     b.append(s);
302                 } while (s.length() > 0 &&
303                          s.charAt(s.length() - 1) == '\\');
304
305                 s = b.toString();
306             }
307             return s;
308         }
309
310         /**
311          * Reset to the first line of the source.
312          */

313         abstract void reset();
314
315         /**
316          * Subclass method to return the next line of the source.
317          */

318         abstract String JavaDoc handleNextLine();
319     }
320
321     /**
322      * RuleBody subclass for a String[] array.
323      */

324     private static class RuleArray extends RuleBody {
325         String JavaDoc[] array;
326         int i;
327         public RuleArray(String JavaDoc[] array) { this.array = array; i = 0; }
328         public String JavaDoc handleNextLine() {
329             return (i < array.length) ? array[i++] : null;
330         }
331         public void reset() {
332             i = 0;
333         }
334     }
335
336     /**
337      * RuleBody subclass for a ResourceReader.
338      */

339     private static class RuleReader extends RuleBody {
340         ResourceReader reader;
341         public RuleReader(ResourceReader reader) { this.reader = reader; }
342         public String JavaDoc handleNextLine() {
343             try {
344                 return reader.readLine();
345             } catch (java.io.IOException JavaDoc e) {}
346             return null;
347         }
348         public void reset() {
349             reader.reset();
350         }
351     }
352
353     //----------------------------------------------------------------------
354
// class RuleHalf
355
//----------------------------------------------------------------------
356

357     /**
358      * A class representing one side of a rule. This class knows how to
359      * parse half of a rule. It is tightly coupled to the method
360      * TransliteratorParser.parseRule().
361      */

362     private static class RuleHalf {
363
364         public String JavaDoc text;
365
366         public int cursor = -1; // position of cursor in text
367
public int ante = -1; // position of ante context marker '{' in text
368
public int post = -1; // position of post context marker '}' in text
369

370         // Record the offset to the cursor either to the left or to the
371
// right of the key. This is indicated by characters on the output
372
// side that allow the cursor to be positioned arbitrarily within
373
// the matching text. For example, abc{def} > | @@@ xyz; changes
374
// def to xyz and moves the cursor to before abc. Offset characters
375
// must be at the start or end, and they cannot move the cursor past
376
// the ante- or postcontext text. Placeholders are only valid in
377
// output text. The length of the ante and post context is
378
// determined at runtime, because of supplementals and quantifiers.
379
public int cursorOffset = 0; // only nonzero on output side
380

381         // Position of first CURSOR_OFFSET on _right_. This will be -1
382
// for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
383
private int cursorOffsetPos = 0;
384
385         public boolean anchorStart = false;
386         public boolean anchorEnd = false;
387
388         /**
389          * The segment number from 1..n of the next '(' we see
390          * during parsing; 1-based.
391          */

392         private int nextSegmentNumber = 1;
393
394         /**
395          * Parse one side of a rule, stopping at either the limit,
396          * the END_OF_RULE character, or an operator.
397          * @return the index after the terminating character, or
398          * if limit was reached, limit
399          */

400         public int parse(String JavaDoc rule, int pos, int limit,
401                          TransliteratorParser parser) {
402             int start = pos;
403             StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
404             pos = parseSection(rule, pos, limit, parser, buf, ILLEGAL_TOP, false);
405             text = buf.toString();
406
407             if (cursorOffset > 0 && cursor != cursorOffsetPos) {
408                 syntaxError("Misplaced " + CURSOR_POS, rule, start);
409             }
410
411             return pos;
412         }
413
414         /**
415          * Parse a section of one side of a rule, stopping at either
416          * the limit, the END_OF_RULE character, an operator, or a
417          * segment close character. This method parses both a
418          * top-level rule half and a segment within such a rule half.
419          * It calls itself recursively to parse segments and nested
420          * segments.
421          * @param buf buffer into which to accumulate the rule pattern
422          * characters, either literal characters from the rule or
423          * standins for UnicodeMatcher objects including segments.
424          * @param illegal the set of special characters that is illegal during
425          * this parse.
426          * @param isSegment if true, then we've already seen a '(' and
427          * pos on entry points right after it. Accumulate everything
428          * up to the closing ')', put it in a segment matcher object,
429          * generate a standin for it, and add the standin to buf. As
430          * a side effect, update the segments vector with a reference
431          * to the segment matcher. This works recursively for nested
432          * segments. If isSegment is false, just accumulate
433          * characters into buf.
434          * @return the index after the terminating character, or
435          * if limit was reached, limit
436          */

437         private int parseSection(String JavaDoc rule, int pos, int limit,
438                                  TransliteratorParser parser,
439                                  StringBuffer JavaDoc buf,
440                                  UnicodeSet illegal,
441                                  boolean isSegment) {
442             int start = pos;
443             ParsePosition JavaDoc pp = null;
444             int quoteStart = -1; // Most recent 'single quoted string'
445
int quoteLimit = -1;
446             int varStart = -1; // Most recent $variableReference
447
int varLimit = -1;
448             int[] iref = new int[1];
449             int bufStart = buf.length();
450
451         main:
452             while (pos < limit) {
453                 // Since all syntax characters are in the BMP, fetching
454
// 16-bit code units suffices here.
455
char c = rule.charAt(pos++);
456                 if (UCharacterProperty.isRuleWhiteSpace(c)) {
457                     continue;
458                 }
459                 // HALF_ENDERS is all chars that end a rule half: "<>=;"
460
if (HALF_ENDERS.indexOf(c) >= 0) {
461                     if (isSegment) {
462                         syntaxError("Unclosed segment", rule, start);
463                     }
464                     break main;
465                 }
466                 if (anchorEnd) {
467                     // Text after a presumed end anchor is a syntax err
468
syntaxError("Malformed variable reference", rule, start);
469                 }
470                 if (UnicodeSet.resemblesPattern(rule, pos-1)) {
471                     if (pp == null) {
472                         pp = new ParsePosition JavaDoc(0);
473                     }
474                     pp.setIndex(pos-1); // Backup to opening '['
475
buf.append(parser.parseSet(rule, pp));
476                     pos = pp.getIndex();
477                     continue;
478                 }
479                 // Handle escapes
480
if (c == ESCAPE) {
481                     if (pos == limit) {
482                         syntaxError("Trailing backslash", rule, start);
483                     }
484                     iref[0] = pos;
485                     int escaped = Utility.unescapeAt(rule, iref);
486                     pos = iref[0];
487                     if (escaped == -1) {
488                         syntaxError("Malformed escape", rule, start);
489                     }
490                     parser.checkVariableRange(escaped, rule, start);
491                     UTF16.append(buf, escaped);
492                     continue;
493                 }
494                 // Handle quoted matter
495
if (c == QUOTE) {
496                     int iq = rule.indexOf(QUOTE, pos);
497                     if (iq == pos) {
498                         buf.append(c); // Parse [''] outside quotes as [']
499
++pos;
500                     } else {
501                         /* This loop picks up a run of quoted text of the
502                          * form 'aaaa' each time through. If this run
503                          * hasn't really ended ('aaaa''bbbb') then it keeps
504                          * looping, each time adding on a new run. When it
505                          * reaches the final quote it breaks.
506                          */

507                         quoteStart = buf.length();
508                         for (;;) {
509                             if (iq < 0) {
510                                 syntaxError("Unterminated quote", rule, start);
511                             }
512                             buf.append(rule.substring(pos, iq));
513                             pos = iq+1;
514                             if (pos < limit && rule.charAt(pos) == QUOTE) {
515                             // Parse [''] inside quotes as [']
516
iq = rule.indexOf(QUOTE, pos+1);
517                             // Continue looping
518
} else {
519                                 break;
520                             }
521                         }
522                         quoteLimit = buf.length();
523                         
524                         for (iq=quoteStart; iq<quoteLimit; ++iq) {
525                             parser.checkVariableRange(buf.charAt(iq), rule, start);
526                         }
527                     }
528                     continue;
529                 }
530
531                 parser.checkVariableRange(c, rule, start);
532
533                 if (illegal.contains(c)) {
534                     syntaxError("Illegal character '" + c + '\'', rule, start);
535                 }
536
537                 switch (c) {
538                     
539                 //------------------------------------------------------
540
// Elements allowed within and out of segments
541
//------------------------------------------------------
542
case ANCHOR_START:
543                     if (buf.length() == 0 && !anchorStart) {
544                         anchorStart = true;
545                     } else {
546                         syntaxError("Misplaced anchor start",
547                                     rule, start);
548                     }
549                     break;
550                 case SEGMENT_OPEN:
551                     {
552                         // bufSegStart is the offset in buf to the first
553
// character of the segment we are parsing.
554
int bufSegStart = buf.length();
555
556                         // Record segment number now, since nextSegmentNumber
557
// will be incremented during the call to parseSection
558
// if there are nested segments.
559
int segmentNumber = nextSegmentNumber++; // 1-based
560

561                         // Parse the segment
562
pos = parseSection(rule, pos, limit, parser, buf, ILLEGAL_SEG, true);
563
564                         // After parsing a segment, the relevant characters are
565
// in buf, starting at offset bufSegStart. Extract them
566
// into a string matcher, and replace them with a
567
// standin for that matcher.
568
StringMatcher m =
569                             new StringMatcher(buf.substring(bufSegStart),
570                                               segmentNumber, parser.curData);
571
572                         // Record and associate object and segment number
573
parser.setSegmentObject(segmentNumber, m);
574                         buf.setLength(bufSegStart);
575                         buf.append(parser.getSegmentStandin(segmentNumber));
576                     }
577                     break;
578                 case FUNCTION:
579                 case ALT_FUNCTION:
580                     {
581                         iref[0] = pos;
582                         TransliteratorIDParser.SingleID single = TransliteratorIDParser.parseFilterID(rule, iref);
583                         // The next character MUST be a segment open
584
if (single == null ||
585                             !Utility.parseChar(rule, iref, SEGMENT_OPEN)) {
586                             syntaxError("Invalid function", rule, start);
587                         }
588
589                         Transliterator t = single.getInstance();
590                         if (t == null) {
591                             syntaxError("Invalid function ID", rule, start);
592                         }
593
594                         // bufSegStart is the offset in buf to the first
595
// character of the segment we are parsing.
596
int bufSegStart = buf.length();
597
598                         // Parse the segment
599
pos = parseSection(rule, iref[0], limit, parser, buf, ILLEGAL_FUNC, true);
600
601                         // After parsing a segment, the relevant characters are
602
// in buf, starting at offset bufSegStart.
603
FunctionReplacer r =
604                             new FunctionReplacer(t,
605                                 new StringReplacer(buf.substring(bufSegStart), parser.curData));
606
607                         // Replace the buffer contents with a stand-in
608
buf.setLength(bufSegStart);
609                         buf.append(parser.generateStandInFor(r));
610                     }
611                     break;
612                 case SymbolTable.SYMBOL_REF:
613                     // Handle variable references and segment references "$1" .. "$9"
614
{
615                         // A variable reference must be followed immediately
616
// by a Unicode identifier start and zero or more
617
// Unicode identifier part characters, or by a digit
618
// 1..9 if it is a segment reference.
619
if (pos == limit) {
620                             // A variable ref character at the end acts as
621
// an anchor to the context limit, as in perl.
622
anchorEnd = true;
623                             break;
624                         }
625                         // Parse "$1" "$2" .. "$9" .. (no upper limit)
626
c = rule.charAt(pos);
627                         int r = UCharacter.digit(c, 10);
628                         if (r >= 1 && r <= 9) {
629                             iref[0] = pos;
630                             r = Utility.parseNumber(rule, iref, 10);
631                             if (r < 0) {
632                                 syntaxError("Undefined segment reference",
633                                             rule, start);
634                             }
635                             pos = iref[0];
636                             buf.append(parser.getSegmentStandin(r));
637                         } else {
638                             if (pp == null) { // Lazy create
639
pp = new ParsePosition JavaDoc(0);
640                             }
641                             pp.setIndex(pos);
642                             String JavaDoc name = parser.parseData.
643                                 parseReference(rule, pp, limit);
644                             if (name == null) {
645                                 // This means the '$' was not followed by a
646
// valid name. Try to interpret it as an
647
// end anchor then. If this also doesn't work
648
// (if we see a following character) then signal
649
// an error.
650
anchorEnd = true;
651                                 break;
652                             }
653                             pos = pp.getIndex();
654                             // If this is a variable definition statement,
655
// then the LHS variable will be undefined. In
656
// that case appendVariableDef() will append the
657
// special placeholder char variableLimit-1.
658
varStart = buf.length();
659                             parser.appendVariableDef(name, buf);
660                             varLimit = buf.length();
661                         }
662                     }
663                     break;
664                 case DOT:
665                     buf.append(parser.getDotStandIn());
666                     break;
667                 case KLEENE_STAR:
668                 case ONE_OR_MORE:
669                 case ZERO_OR_ONE:
670                     // Quantifiers. We handle single characters, quoted strings,
671
// variable references, and segments.
672
// a+ matches aaa
673
// 'foo'+ matches foofoofoo
674
// $v+ matches xyxyxy if $v == xy
675
// (seg)+ matches segsegseg
676
{
677                         if (isSegment && buf.length() == bufStart) {
678                             // The */+ immediately follows '('
679
syntaxError("Misplaced quantifier", rule, start);
680                             break;
681                         }
682  
683                         int qstart, qlimit;
684                         // The */+ follows an isolated character or quote
685
// or variable reference
686
if (buf.length() == quoteLimit) {
687                             // The */+ follows a 'quoted string'
688
qstart = quoteStart;
689                             qlimit = quoteLimit;
690                         } else if (buf.length() == varLimit) {
691                             // The */+ follows a $variableReference
692
qstart = varStart;
693                             qlimit = varLimit;
694                         } else {
695                             // The */+ follows a single character, possibly
696
// a segment standin
697
qstart = buf.length() - 1;
698                             qlimit = qstart + 1;
699                         }
700
701                         UnicodeMatcher m;
702                         try {
703                             m = new StringMatcher(buf.toString(), qstart, qlimit,
704                                               0, parser.curData);
705                         } catch (RuntimeException JavaDoc e) {
706                             throw new IllegalArgumentException JavaDoc("Failure in rule: " + rule.substring(pos, limit));
707                         }
708                         int min = 0;
709                         int max = Quantifier.MAX;
710                         switch (c) {
711                         case ONE_OR_MORE:
712                             min = 1;
713                             break;
714                         case ZERO_OR_ONE:
715                             min = 0;
716                             max = 1;
717                             break;
718                             // case KLEENE_STAR:
719
// do nothing -- min, max already set
720
}
721                         m = new Quantifier(m, min, max);
722                         buf.setLength(qstart);
723                         buf.append(parser.generateStandInFor(m));
724                     }
725                     break;
726
727                 //------------------------------------------------------
728
// Elements allowed ONLY WITHIN segments
729
//------------------------------------------------------
730
case SEGMENT_CLOSE:
731                     // assert(isSegment);
732
// We're done parsing a segment.
733
break main;
734
735                 //------------------------------------------------------
736
// Elements allowed ONLY OUTSIDE segments
737
//------------------------------------------------------
738
case CONTEXT_ANTE:
739                     if (ante >= 0) {
740                         syntaxError("Multiple ante contexts", rule, start);
741                     }
742                     ante = buf.length();
743                     break;
744                 case CONTEXT_POST:
745                     if (post >= 0) {
746                         syntaxError("Multiple post contexts", rule, start);
747                     }
748                     post = buf.length();
749                     break;
750                 case CURSOR_POS:
751                     if (cursor >= 0) {
752                         syntaxError("Multiple cursors", rule, start);
753                     }
754                     cursor = buf.length();
755                     break;
756                 case CURSOR_OFFSET:
757                     if (cursorOffset < 0) {
758                         if (buf.length() > 0) {
759                             syntaxError("Misplaced " + c, rule, start);
760                         }
761                         --cursorOffset;
762                     } else if (cursorOffset > 0) {
763                         if (buf.length() != cursorOffsetPos || cursor >= 0) {
764                             syntaxError("Misplaced " + c, rule, start);
765                         }
766                         ++cursorOffset;
767                     } else {
768                         if (cursor == 0 && buf.length() == 0) {
769                             cursorOffset = -1;
770                         } else if (cursor < 0) {
771                             cursorOffsetPos = buf.length();
772                             cursorOffset = 1;
773                         } else {
774                             syntaxError("Misplaced " + c, rule, start);
775                         }
776                     }
777                     break;
778
779                 //------------------------------------------------------
780
// Non-special characters
781
//------------------------------------------------------
782
default:
783                     // Disallow unquoted characters other than [0-9A-Za-z]
784
// in the printable ASCII range. These characters are
785
// reserved for possible future use.
786
if (c >= 0x0021 && c <= 0x007E &&
787                         !((c >= '0' && c <= '9') ||
788                           (c >= 'A' && c <= 'Z') ||
789                           (c >= 'a' && c <= 'z'))) {
790                         syntaxError("Unquoted " + c, rule, start);
791                     }
792                     buf.append(c);
793                     break;
794                 }
795             }
796             return pos;
797         }
798
799         /**
800          * Remove context.
801          */

802         void removeContext() {
803             text = text.substring(ante < 0 ? 0 : ante,
804                                   post < 0 ? text.length() : post);
805             ante = post = -1;
806             anchorStart = anchorEnd = false;
807         }
808
809         /**
810          * Return true if this half looks like valid output, that is, does not
811          * contain quantifiers or other special input-only elements.
812          */

813         public boolean isValidOutput(TransliteratorParser parser) {
814             for (int i=0; i<text.length(); ) {
815                 int c = UTF16.charAt(text, i);
816                 i += UTF16.getCharCount(c);
817                 if (!parser.parseData.isReplacer(c)) {
818                     return false;
819                 }
820             }
821             return true;
822         }
823
824         /**
825          * Return true if this half looks like valid input, that is, does not
826          * contain functions or other special output-only elements.
827          */

828         public boolean isValidInput(TransliteratorParser parser) {
829             for (int i=0; i<text.length(); ) {
830                 int c = UTF16.charAt(text, i);
831                 i += UTF16.getCharCount(c);
832                 if (!parser.parseData.isMatcher(c)) {
833                     return false;
834                 }
835             }
836             return true;
837         }
838     }
839
840     //----------------------------------------------------------------------
841
// PUBLIC methods
842
//----------------------------------------------------------------------
843

844     /**
845      * Constructor. The body is intentionally empty: the public result
     * members (dataVector, idBlockVector, compoundFilter) and the
     * temporary parse state are (re)assigned by parseRules() every time
     * a rule set is parsed.
846      */

847     public TransliteratorParser() {
848     }
849
850     /**
851      * Parse a set of rules. After the parse completes, examine the public
852      * data members for results.
853      */

854     public void parse(String JavaDoc rules, int direction) {
855         parseRules(new RuleArray(new String JavaDoc[] { rules }), direction);
856     }
857    
858     /**
859      * Parse a set of rules. After the parse completes, examine the public
860      * data members for results.
861      */

862     public void parse(ResourceReader rules, int direction) {
863         parseRules(new RuleReader(rules), direction);
864     }
865
866     //----------------------------------------------------------------------
867
// PRIVATE methods
868
//----------------------------------------------------------------------
869

870     /**
871      * Parse an array of zero or more rules. The strings in the array are
872      * treated as if they were concatenated together, with rule terminators
873      * inserted between array elements if not present already.
874      *
875      * Any previous rules are discarded. Typically this method is called exactly
876      * once, during construction.
877      *
878      * The member this.data will be set to null if there are no rules.
879      *
880      * @exception IllegalArgumentException if there is a syntax error in the
881      * rules
882      */

883     void parseRules(RuleBody ruleArray, int dir) {
884         boolean parsingIDs = true;
885         boolean inBeginEndBlock = false;
886         int ruleCount = 0;
887
888         dataVector = new Vector JavaDoc();
889         idBlockVector = new Vector JavaDoc();
890         curData = null;
891         direction = dir;
892         compoundFilter = null;
893         variablesVector = new Vector JavaDoc();
894         variableNames = new Hashtable JavaDoc();
895         parseData = new ParseData();
896
897         StringBuffer JavaDoc errors = null;
898         int errorCount = 0;
899
900         ruleArray.reset();
901
902         StringBuffer JavaDoc idBlockResult = new StringBuffer JavaDoc();
903
904         // The compound filter offset is an index into idBlockResult.
905
// If it is 0, then the compound filter occurred at the start,
906
// and it is the offset to the _start_ of the compound filter
907
// pattern. Otherwise it is the offset to the _limit_ of the
908
// compound filter pattern within idBlockResult.
909
this.compoundFilter = null;
910         int compoundFilterOffset = -1;
911
912     main:
913         for (;;) {
914             String JavaDoc rule = ruleArray.nextLine();
915             if (rule == null) {
916                 break;
917             }
918             int pos = 0;
919             int limit = rule.length();
920             while (pos < limit) {
921                 char c = rule.charAt(pos++);
922                 if (UCharacterProperty.isRuleWhiteSpace(c)) {
923                     continue;
924                 }
925                 // Skip lines starting with the comment character
926
if (c == RULE_COMMENT_CHAR) {
927                     pos = rule.indexOf("\n", pos) + 1;
928                     if (pos == 0) {
929                         break; // No "\n" found; rest of rule is a commnet
930
}
931                     continue; // Either fall out or restart with next line
932
}
933
934                 // skip empty rules
935
if (c == END_OF_RULE)
936                     continue;
937
938                 // Often a rule file contains multiple errors. It's
939
// convenient to the rule author if these are all reported
940
// at once. We keep parsing rules even after a failure, up
941
// to a specified limit, and report all errors at once.
942
try {
943                     ++ruleCount;
944
945                     // We've found the start of a rule or ID. c is its first
946
// character, and pos points past c.
947
--pos;
948                     // Look for an ID token. Must have at least ID_TOKEN_LEN + 1
949
// chars left.
950
if ((pos + ID_TOKEN_LEN + 1) <= limit &&
951                             rule.regionMatches(pos, ID_TOKEN, 0, ID_TOKEN_LEN)) {
952                         pos += ID_TOKEN_LEN;
953                         c = rule.charAt(pos);
954                         while (UCharacterProperty.isRuleWhiteSpace(c) && pos < limit) {
955                             ++pos;
956                             c = rule.charAt(pos);
957                         }
958                         int[] p = new int[] { pos };
959
960                         if (!parsingIDs) {
961                             if (curData != null) {
962                                 if (direction == Transliterator.FORWARD)
963                                     dataVector.add(curData);
964                                 else
965                                     dataVector.insertElementAt(curData, 0);
966                                 curData = null;
967                             }
968                             parsingIDs = true;
969                         }
970
971                         TransliteratorIDParser.SingleID id =
972                             TransliteratorIDParser.parseSingleID(
973                                           rule, p, direction);
974                         if (p[0] != pos && Utility.parseChar(rule, p, END_OF_RULE)) {
975                             // Successful ::ID parse.
976

977                             if (direction == Transliterator.FORWARD) {
978                                 idBlockResult.append(id.canonID).append(END_OF_RULE);
979                             } else {
980                                 idBlockResult.insert(0, id.canonID + END_OF_RULE);
981                             }
982
983                         } else {
984                             // Couldn't parse an ID. Try to parse a global filter
985
int[] withParens = new int[] { -1 };
986                             UnicodeSet f = TransliteratorIDParser.parseGlobalFilter(rule, p, direction, withParens, null);
987                             if (f != null && Utility.parseChar(rule, p, END_OF_RULE)) {
988                                 if ((direction == Transliterator.FORWARD) ==
989                                     (withParens[0] == 0)) {
990                                     if (compoundFilter != null) {
991                                         // Multiple compound filters
992
syntaxError("Multiple global filters", rule, pos);
993                                     }
994                                     compoundFilter = f;
995                                     compoundFilterOffset = ruleCount;
996                                }
997                             } else {
998                                 // Invalid ::id
999
// Can be parsed as neither an ID nor a global filter
1000
syntaxError("Invalid ::ID", rule, pos);
1001                            }
1002                        }
1003
1004                        pos = p[0];
1005                    } else {
1006                        if (parsingIDs) {
1007                            if (direction == Transliterator.FORWARD)
1008                                idBlockVector.add(idBlockResult.toString());
1009                            else
1010                                idBlockVector.insertElementAt(idBlockResult.toString(), 0);
1011                            idBlockResult.delete(0, idBlockResult.length());
1012                            parsingIDs = false;
1013                            curData = new RuleBasedTransliterator.Data();
1014
1015                            // By default, rules use part of the private use area
1016
// E000..F8FF for variables and other stand-ins. Currently
1017
// the range F000..F8FF is typically sufficient. The 'use
1018
// variable range' pragma allows rule sets to modify this.
1019
setVariableRange(0xF000, 0xF8FF);
1020                        }
1021
1022                        if (resemblesPragma(rule, pos, limit)) {
1023                            int ppp = parsePragma(rule, pos, limit);
1024                            if (ppp < 0) {
1025                                syntaxError("Unrecognized pragma", rule, pos);
1026                            }
1027                            pos = ppp;
1028                        // Parse a rule
1029
} else {
1030                            pos = parseRule(rule, pos, limit);
1031                        }
1032                    }
1033                } catch (IllegalArgumentException JavaDoc e) {
1034                    if (errorCount == 30) {
1035                        errors.append("\nMore than 30 errors; further messages squelched");
1036                        break main;
1037                    }
1038                    if (errors == null) {
1039                        errors = new StringBuffer JavaDoc(e.getMessage());
1040                    } else {
1041                        errors.append("\n" + e.getMessage());
1042                    }
1043                    ++errorCount;
1044                    pos = ruleEnd(rule, pos, limit) + 1; // +1 advances past ';'
1045
}
1046            }
1047        }
1048        if (parsingIDs && idBlockResult.length() > 0) {
1049            if (direction == Transliterator.FORWARD)
1050                idBlockVector.add(idBlockResult.toString());
1051            else
1052                idBlockVector.insertElementAt(idBlockResult.toString(), 0);
1053        }
1054        else if (!parsingIDs && curData != null) {
1055            if (direction == Transliterator.FORWARD)
1056                dataVector.add(curData);
1057            else
1058                dataVector.insertElementAt(curData, 0);
1059        }
1060
1061        // Convert the set vector to an array
1062
for (int i = 0; i < dataVector.size(); i++) {
1063            RuleBasedTransliterator.Data data = (RuleBasedTransliterator.Data)dataVector.get(i);
1064            data.variables = new Object JavaDoc[variablesVector.size()];
1065            variablesVector.copyInto(data.variables);
1066            data.variableNames = new Hashtable JavaDoc();
1067            data.variableNames.putAll(variableNames);
1068        }
1069        variablesVector = null;
1070
1071        // Do more syntax checking and index the rules
1072
try {
1073            if (compoundFilter != null) {
1074                if ((direction == Transliterator.FORWARD &&
1075                     compoundFilterOffset != 1) ||
1076                    (direction == Transliterator.REVERSE &&
1077                     compoundFilterOffset != ruleCount)) {
1078                    throw new IllegalArgumentException JavaDoc("Compound filters misplaced");
1079                }
1080            }
1081
1082            for (int i = 0; i < dataVector.size(); i++) {
1083                RuleBasedTransliterator.Data data = (RuleBasedTransliterator.Data)dataVector.get(i);
1084                data.ruleSet.freeze();
1085            }
1086
1087            if (idBlockVector.size() == 1 && ((String JavaDoc)idBlockVector.get(0)).length() == 0)
1088                idBlockVector.remove(0);
1089
1090        } catch (IllegalArgumentException JavaDoc e) {
1091            if (errors == null) {
1092                errors = new StringBuffer JavaDoc(e.getMessage());
1093            } else {
1094                errors.append("\n").append(e.getMessage());
1095            }
1096        }
1097
1098        if (errors != null) {
1099            throw new IllegalArgumentException JavaDoc(errors.toString());
1100        }
1101    }
1102
1103    /**
1104     * MAIN PARSER. Parse the next rule in the given rule string, starting
1105     * at pos. Return the index after the last character parsed. Do not
1106     * parse characters at or after limit.
1107     *
1108     * Important: The character at pos must be a non-whitespace character
1109     * that is not the comment character.
1110     *
1111     * This method handles quoting, escaping, and whitespace removal. It
1112     * parses the end-of-rule character. It recognizes context and cursor
1113     * indicators. Once it does a lexical breakdown of the rule at pos, it
1114     * creates a rule object and adds it to our rule list.
1115     *
1116     * This method is tightly coupled to the inner class RuleHalf.
1117     */

1118    private int parseRule(String JavaDoc rule, int pos, int limit) {
1119        // Locate the left side, operator, and right side
1120
int start = pos;
1121        char operator = 0;
1122
1123        // Set up segments data
1124
segmentStandins = new StringBuffer JavaDoc();
1125        segmentObjects = new Vector JavaDoc();
1126
1127        RuleHalf left = new RuleHalf();
1128        RuleHalf right = new RuleHalf();
1129
1130        undefinedVariableName = null;
1131        pos = left.parse(rule, pos, limit, this);
1132
1133        if (pos == limit ||
1134            OPERATORS.indexOf(operator = rule.charAt(--pos)) < 0) {
1135            syntaxError("No operator pos=" + pos, rule, start);
1136        }
1137        ++pos;
1138
1139        // Found an operator char. Check for forward-reverse operator.
1140
if (operator == REVERSE_RULE_OP &&
1141            (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
1142            ++pos;
1143            operator = FWDREV_RULE_OP;
1144        }
1145
1146        // Translate alternate op characters.
1147
switch (operator) {
1148        case ALT_FORWARD_RULE_OP:
1149            operator = FORWARD_RULE_OP;
1150            break;
1151        case ALT_REVERSE_RULE_OP:
1152            operator = REVERSE_RULE_OP;
1153            break;
1154        case ALT_FWDREV_RULE_OP:
1155            operator = FWDREV_RULE_OP;
1156            break;
1157        }
1158
1159        pos = right.parse(rule, pos, limit, this);
1160
1161        if (pos < limit) {
1162            if (rule.charAt(--pos) == END_OF_RULE) {
1163                ++pos;
1164            } else {
1165                // RuleHalf parser must have terminated at an operator
1166
syntaxError("Unquoted operator", rule, start);
1167            }
1168        }
1169
1170        if (operator == VARIABLE_DEF_OP) {
1171            // LHS is the name. RHS is a single character, either a literal
1172
// or a set (already parsed). If RHS is longer than one
1173
// character, it is either a multi-character string, or multiple
1174
// sets, or a mixture of chars and sets -- syntax error.
1175

1176            // We expect to see a single undefined variable (the one being
1177
// defined).
1178
if (undefinedVariableName == null) {
1179                syntaxError("Missing '$' or duplicate definition", rule, start);
1180            }
1181            if (left.text.length() != 1 || left.text.charAt(0) != variableLimit) {
1182                syntaxError("Malformed LHS", rule, start);
1183            }
1184            if (left.anchorStart || left.anchorEnd ||
1185                right.anchorStart || right.anchorEnd) {
1186                syntaxError("Malformed variable def", rule, start);
1187            }
1188            // We allow anything on the right, including an empty string.
1189
int n = right.text.length();
1190            char[] value = new char[n];
1191            right.text.getChars(0, n, value, 0);
1192            variableNames.put(undefinedVariableName, value);
1193
1194            ++variableLimit;
1195            return pos;
1196        }
1197
1198        // If this is not a variable definition rule, we shouldn't have
1199
// any undefined variable names.
1200
if (undefinedVariableName != null) {
1201            syntaxError("Undefined variable $" + undefinedVariableName,
1202                        rule, start);
1203        }
1204
1205        // Verify segments
1206
if (segmentStandins.length() > segmentObjects.size()) {
1207            syntaxError("Undefined segment reference", rule, start);
1208        }
1209        for (int i=0; i<segmentStandins.length(); ++i) {
1210            if (segmentStandins.charAt(i) == 0) {
1211                syntaxError("Internal error", rule, start); // will never happen
1212
}
1213        }
1214        for (int i=0; i<segmentObjects.size(); ++i) {
1215            if (segmentObjects.elementAt(i) == null) {
1216                syntaxError("Internal error", rule, start); // will never happen
1217
}
1218        }
1219
1220        // If the direction we want doesn't match the rule
1221
// direction, do nothing.
1222
if (operator != FWDREV_RULE_OP &&
1223            ((direction == Transliterator.FORWARD) != (operator == FORWARD_RULE_OP))) {
1224            return pos;
1225        }
1226
1227        // Transform the rule into a forward rule by swapping the
1228
// sides if necessary.
1229
if (direction == Transliterator.REVERSE) {
1230            RuleHalf temp = left;
1231            left = right;
1232            right = temp;
1233        }
1234
1235        // Remove non-applicable elements in forward-reverse
1236
// rules. Bidirectional rules ignore elements that do not
1237
// apply.
1238
if (operator == FWDREV_RULE_OP) {
1239            right.removeContext();
1240            left.cursor = -1;
1241            left.cursorOffset = 0;
1242        }
1243
1244        // Normalize context
1245
if (left.ante < 0) {
1246            left.ante = 0;
1247        }
1248        if (left.post < 0) {
1249            left.post = left.text.length();
1250        }
1251
1252        // Context is only allowed on the input side. Cursors are only
1253
// allowed on the output side. Segment delimiters can only appear
1254
// on the left, and references on the right. Cursor offset
1255
// cannot appear without an explicit cursor. Cursor offset
1256
// cannot place the cursor outside the limits of the context.
1257
// Anchors are only allowed on the input side.
1258
if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 ||
1259            (right.cursorOffset != 0 && right.cursor < 0) ||
1260            // - The following two checks were used to ensure that the
1261
// - the cursor offset stayed within the ante- or postcontext.
1262
// - However, with the addition of quantifiers, we have to
1263
// - allow arbitrary cursor offsets and do runtime checking.
1264
//(right.cursorOffset > (left.text.length() - left.post)) ||
1265
//(-right.cursorOffset > left.ante) ||
1266
right.anchorStart || right.anchorEnd ||
1267            !left.isValidInput(this) || !right.isValidOutput(this) ||
1268            left.ante > left.post) {
1269            syntaxError("Malformed rule", rule, start);
1270        }
1271
1272        // Flatten segment objects vector to an array
1273
UnicodeMatcher[] segmentsArray = null;
1274        if (segmentObjects.size() > 0) {
1275            segmentsArray = new UnicodeMatcher[segmentObjects.size()];
1276            segmentObjects.toArray(segmentsArray);
1277        }
1278
1279        curData.ruleSet.addRule(new TransliterationRule(
1280                                     left.text, left.ante, left.post,
1281                                     right.text, right.cursor, right.cursorOffset,
1282                                     segmentsArray,
1283                                     left.anchorStart, left.anchorEnd,
1284                                     curData));
1285
1286        return pos;
1287    }
1288
1289    /**
1290     * Set the variable range to [start, end] (inclusive).
1291     */

1292    private void setVariableRange(int start, int end) {
1293        if (start > end || start < 0 || end > 0xFFFF) {
1294            throw new IllegalArgumentException JavaDoc("Invalid variable range " + start + ", " + end);
1295        }
1296        
1297        curData.variablesBase = (char) start; // first private use
1298

1299        if (dataVector.size() == 0) {
1300            variableNext = (char) start;
1301            variableLimit = (char) (end + 1);
1302        }
1303    }
1304
1305    /**
1306     * Assert that the given character is NOT within the variable range.
1307     * If it is, signal an error. This is neccesary to ensure that the
1308     * variable range does not overlap characters used in a rule.
1309     */

1310    private void checkVariableRange(int ch, String JavaDoc rule, int start) {
1311        if (ch >= curData.variablesBase && ch < variableLimit) {
1312            syntaxError("Variable range character in rule", rule, start);
1313        }
1314    }
1315
1316    // (The following method is part of an unimplemented feature.
1317
// Remove this clover pragma after the feature is implemented.
1318
// 2003-06-11 ICU 2.6 Alan)
1319
///CLOVER:OFF
1320
/**
1321     * Set the maximum backup to 'backup', in response to a pragma
1322     * statement.
1323     */

1324    private void pragmaMaximumBackup(int backup) {
1325        //TODO Finish
1326
throw new IllegalArgumentException JavaDoc("use maximum backup pragma not implemented yet");
1327    }
1328    ///CLOVER:ON
1329

1330    // (The following method is part of an unimplemented feature.
1331
// Remove this clover pragma after the feature is implemented.
1332
// 2003-06-11 ICU 2.6 Alan)
1333
///CLOVER:OFF
1334
/**
1335     * Begin normalizing all rules using the given mode, in response
1336     * to a pragma statement.
1337     */

1338    private void pragmaNormalizeRules(Normalizer.Mode mode) {
1339        //TODO Finish
1340
throw new IllegalArgumentException JavaDoc("use normalize rules pragma not implemented yet");
1341    }
1342    ///CLOVER:ON
1343

1344    /**
1345     * Return true if the given rule looks like a pragma.
1346     * @param pos offset to the first non-whitespace character
1347     * of the rule.
1348     * @param limit pointer past the last character of the rule.
1349     */

1350    static boolean resemblesPragma(String JavaDoc rule, int pos, int limit) {
1351        // Must start with /use\s/i
1352
return Utility.parsePattern(rule, pos, limit, "use ", null) >= 0;
1353    }
1354
1355    /**
1356     * Parse a pragma. This method assumes resemblesPragma() has
1357     * already returned true.
1358     * @param pos offset to the first non-whitespace character
1359     * of the rule.
1360     * @param limit pointer past the last character of the rule.
1361     * @return the position index after the final ';' of the pragma,
1362     * or -1 on failure.
1363     */

1364    private int parsePragma(String JavaDoc rule, int pos, int limit) {
1365        int[] array = new int[2];
1366
1367        // resemblesPragma() has already returned true, so we
1368
// know that pos points to /use\s/i; we can skip 4 characters
1369
// immediately
1370
pos += 4;
1371        
1372        // Here are the pragmas we recognize:
1373
// use variable range 0xE000 0xEFFF;
1374
// use maximum backup 16;
1375
// use nfd rules;
1376
int p = Utility.parsePattern(rule, pos, limit, "~variable range # #~;", array);
1377        if (p >= 0) {
1378            setVariableRange(array[0], array[1]);
1379            return p;
1380        }
1381
1382        p = Utility.parsePattern(rule, pos, limit, "~maximum backup #~;", array);
1383        if (p >= 0) {
1384            pragmaMaximumBackup(array[0]);
1385            return p;
1386        }
1387
1388        p = Utility.parsePattern(rule, pos, limit, "~nfd rules~;", null);
1389        if (p >= 0) {
1390            pragmaNormalizeRules(Normalizer.NFD);
1391            return p;
1392        }
1393
1394        p = Utility.parsePattern(rule, pos, limit, "~nfc rules~;", null);
1395        if (p >= 0) {
1396            pragmaNormalizeRules(Normalizer.NFC);
1397            return p;
1398        }
1399
1400        // Syntax error: unable to parse pragma
1401
return -1;
1402    }
1403
1404    /**
1405     * Throw an exception indicating a syntax error. Search the rule string
1406     * for the probable end of the rule. Of course, if the error is that
1407     * the end of rule marker is missing, then the rule end will not be found.
1408     * In any case the rule start will be correctly reported.
1409     * @param msg error description
1410     * @param rule pattern string
1411     * @param start position of first character of current rule
1412     */

1413    static final void syntaxError(String JavaDoc msg, String JavaDoc rule, int start) {
1414        int end = ruleEnd(rule, start, rule.length());
1415        throw new IllegalArgumentException JavaDoc(msg + " in \"" +
1416                                           Utility.escape(rule.substring(start, end)) + '"');
1417    }
1418
1419    static final int ruleEnd(String JavaDoc rule, int start, int limit) {
1420        int end = Utility.quotedIndexOf(rule, start, limit, ";");
1421        if (end < 0) {
1422            end = limit;
1423        }
1424        return end;
1425    }
1426
1427    /**
1428     * Parse a UnicodeSet out, store it, and return the stand-in character
1429     * used to represent it.
1430     */

1431    private final char parseSet(String JavaDoc rule, ParsePosition JavaDoc pos) {
1432        UnicodeSet set = new UnicodeSet(rule, pos, parseData);
1433        if (variableNext >= variableLimit) {
1434            throw new RuntimeException JavaDoc("Private use variables exhausted");
1435        }
1436        set.compact();
1437        return generateStandInFor(set);
1438    }
1439
1440    /**
1441     * Generate and return a stand-in for a new UnicodeMatcher or UnicodeReplacer.
1442     * Store the object.
1443     */

1444    char generateStandInFor(Object JavaDoc obj) {
1445        // assert(obj != null);
1446

1447        // Look up previous stand-in, if any. This is a short list
1448
// (typical n is 0, 1, or 2); linear search is optimal.
1449
for (int i=0; i<variablesVector.size(); ++i) {
1450            if (variablesVector.elementAt(i) == obj) { // [sic] pointer comparison
1451
return (char) (curData.variablesBase + i);
1452            }
1453        }
1454
1455        if (variableNext >= variableLimit) {
1456            throw new RuntimeException JavaDoc("Variable range exhausted");
1457        }
1458        variablesVector.addElement(obj);
1459        return variableNext++;
1460    }
1461
1462    /**
1463     * Return the standin for segment seg (1-based).
1464     */

1465    public char getSegmentStandin(int seg) {
1466        if (segmentStandins.length() < seg) {
1467            segmentStandins.setLength(seg);
1468        }
1469        char c = segmentStandins.charAt(seg-1);
1470        if (c == 0) {
1471            if (variableNext >= variableLimit) {
1472                throw new RuntimeException JavaDoc("Variable range exhausted");
1473            }
1474            c = variableNext++;
1475            // Set a placeholder in the master variables vector that will be
1476
// filled in later by setSegmentObject(). We know that we will get
1477
// called first because setSegmentObject() will call us.
1478
variablesVector.addElement(null);
1479            segmentStandins.setCharAt(seg-1, c);
1480        }
1481        return c;
1482    }
1483    
1484    /**
1485     * Set the object for segment seg (1-based).
1486     */

1487    public void setSegmentObject(int seg, StringMatcher obj) {
1488        // Since we call parseSection() recursively, nested
1489
// segments will result in segment i+1 getting parsed
1490
// and stored before segment i; be careful with the
1491
// vector handling here.
1492
if (segmentObjects.size() < seg) {
1493            segmentObjects.setSize(seg);
1494        }
1495        int index = getSegmentStandin(seg) - curData.variablesBase;
1496        if (segmentObjects.elementAt(seg-1) != null ||
1497            variablesVector.elementAt(index) != null) {
1498            throw new RuntimeException JavaDoc(); // should never happen
1499
}
1500        segmentObjects.setElementAt(obj, seg-1);
1501        variablesVector.setElementAt(obj, index);
1502    }
1503
/**
 * Return the stand-in for the dot set. It is allocated the first
 * time it is requested and reused thereafter.
 */

1508    char getDotStandIn() {
1509        if (dotStandIn == -1) {
1510            dotStandIn = generateStandInFor(new UnicodeSet(DOT_SET));
1511        }
1512        return (char) dotStandIn;
1513    }
1514
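/**
 * Append the value of the given variable name to the given
 * StringBuffer. The first undefined name encountered is tolerated so
 * that variable definition statements work: its name is recorded and
 * a placeholder character is appended in place of a value.
 * @exception IllegalArgumentException if the name is unknown and an
 * undefined variable name has already been recorded.
 */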

1520    private void appendVariableDef(String JavaDoc name, StringBuffer JavaDoc buf) {
1521        char[] ch = (char[]) variableNames.get(name);
1522        if (ch == null) {
1523            // We allow one undefined variable so that variable definition
1524
// statements work. For the first undefined variable we return
1525
// the special placeholder variableLimit-1, and save the variable
1526
// name.
1527
if (undefinedVariableName == null) {
1528                undefinedVariableName = name;
1529                if (variableNext >= variableLimit) {
1530                    throw new RuntimeException JavaDoc("Private use variables exhausted");
1531                }
1532                buf.append((char) --variableLimit);
1533            } else {
1534                throw new IllegalArgumentException JavaDoc("Undefined variable $"
1535                                                   + name);
1536            }
1537        } else {
1538            buf.append(ch);
1539        }
1540    }
1541}
1542
1543//eof
1544
Popular Tags