KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > ibm > icu > text > RBBIRuleScanner


1 /*
2  *******************************************************************************
3  * Copyright (C) 2003-2006,
4  * International Business Machines Corporation and others. All Rights Reserved.
5  *******************************************************************************
6  */

7  
8 package com.ibm.icu.text;
9
10 import java.util.HashMap JavaDoc;
11 import java.lang.IllegalArgumentException JavaDoc;
12 import java.text.ParsePosition JavaDoc;
13
14 import com.ibm.icu.lang.UCharacter;
15 import com.ibm.icu.impl.Assert;
16 import com.ibm.icu.impl.Utility;
17
18 /**
19   * This class is part of the Rule Based Break Iterator rule compiler.
20   * It scans the rules and builds the parse tree.
21   * There is no public API here.
22   * @internal
23   */

24 class RBBIRuleScanner {
25     
26     private final int kStackSize = 100; // The size of the state stack for
27
// rules parsing. Corresponds roughly
28
// to the depth of parentheses nesting
29
// that is allowed in the rules.
30

31     static class RBBIRuleChar {
32         int fChar;
33         boolean fEscaped;
34     }
35
36
37
38 RBBIRuleBuilder fRB; // The rule builder that we are part of.
39

40 int fScanIndex; // Index of current character being processed
41
// in the rule input string.
42
int fNextIndex; // Index of the next character, which
43
// is the first character not yet scanned.
44
boolean fQuoteMode; // Scan is in a 'quoted region'
45
int fLineNum; // Line number in input file.
46
int fCharNum; // Char position within the line.
47
int fLastChar; // Previous char, needed to count CR-LF
48
// as a single line, not two.
49

50 RBBIRuleChar fC = new RBBIRuleChar(); // Current char for parse state machine
51
// processing.
52
String JavaDoc fVarName; // $variableName, valid when we've just
53
// scanned one.
54

55
56 short fStack[] = new short[kStackSize]; // State stack, holds state pushes
57
int fStackPtr; // and pops as specified in the state
58
// transition rules.
59

60 RBBINode fNodeStack[] = new RBBINode[kStackSize]; // Node stack, holds nodes created
61
// during the parse of a rule
62
int fNodeStackPtr;
63
64
65 boolean fReverseRule; // True if the rule currently being scanned
66
// is a reverse direction rule (if it
67
// starts with a '!')
68

69 boolean fLookAheadRule; // True if the rule includes a '/'
70
// somewhere within it.
71

72 RBBISymbolTable fSymbolTable; // symbol table, holds definitions of
73
// $variable symbols.
74

75 HashMap JavaDoc fSetTable = new HashMap JavaDoc(); // UnicocodeSet hash table, holds indexes to
76
// the sets created while parsing rules.
77
// The key is the string used for creating
78
// the set.
79

80 UnicodeSet fRuleSets[] = new UnicodeSet[10]; // Unicode Sets that are needed during
81
// the scanning of RBBI rules. The
82
// indicies for these are assigned by the
83
// perl script that builds the state tables.
84
// See rbbirpt.h.
85

86 int fRuleNum; // Counts each rule as it is scanned.
87

88 int fOptionStart; // Input index of start of a !!option
89
// keyword, while being scanned.
90

91     
92
93    static private String JavaDoc gRuleSet_rule_char_pattern = "[^[\\p{Z}\\u0020-\\u007f]-[\\p{L}]-[\\p{N}]]";
94    static private String JavaDoc gRuleSet_name_char_pattern = "[_\\p{L}\\p{N}]";
95    static private String JavaDoc gRuleSet_digit_char_pattern = "[0-9]";
96    static private String JavaDoc gRuleSet_name_start_char_pattern = "[_\\p{L}]";
97    static private String JavaDoc gRuleSet_white_space_pattern = "[\\p{Pattern_White_Space}]";
98    static private String JavaDoc kAny = "any";
99
100     
101  
102
103 //----------------------------------------------------------------------------------------
104
//
105
// Constructor.
106
//
107
//----------------------------------------------------------------------------------------
108
RBBIRuleScanner(RBBIRuleBuilder rb)
109 {
110     fRB = rb;
111     fLineNum = 1;
112
113     //
114
// Set up the constant Unicode Sets.
115
// Note: These could be made static and shared among
116
// all instances of RBBIRuleScanners.
117
fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char-128] = new UnicodeSet(gRuleSet_rule_char_pattern);
118     fRuleSets[RBBIRuleParseTable.kRuleSet_white_space-128] = new UnicodeSet(gRuleSet_white_space_pattern);
119     fRuleSets[RBBIRuleParseTable.kRuleSet_name_char-128] = new UnicodeSet(gRuleSet_name_char_pattern);
120     fRuleSets[RBBIRuleParseTable.kRuleSet_name_start_char-128] = new UnicodeSet(gRuleSet_name_start_char_pattern);
121     fRuleSets[RBBIRuleParseTable.kRuleSet_digit_char-128] = new UnicodeSet(gRuleSet_digit_char_pattern);
122
123     fSymbolTable = new RBBISymbolTable(this, rb.fRules);
124  }
125
126
127
128
129 //----------------------------------------------------------------------------------------
130
//
131
// doParseAction Do some action during rule parsing.
132
// Called by the parse state machine.
133
// Actions build the parse tree and Unicode Sets,
134
// and maintain the parse stack for nested expressions.
135
//
136
//----------------------------------------------------------------------------------------
137
boolean doParseActions(int action)
138 {
139     RBBINode n = null;
140
141     boolean returnVal = true;
142
143     switch (action) {
144
145     case RBBIRuleParseTable.doExprStart:
146         pushNewNode(RBBINode.opStart);
147         fRuleNum++;
148         break;
149
150
151     case RBBIRuleParseTable.doExprOrOperator:
152         {
153             fixOpStack(RBBINode.precOpCat);
154             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
155             RBBINode orNode = pushNewNode(RBBINode.opOr);
156             orNode.fLeftChild = operandNode;
157             operandNode.fParent = orNode;
158         }
159         break;
160
161     case RBBIRuleParseTable.doExprCatOperator:
162         // concatenation operator.
163
// For the implicit concatenation of adjacent terms in an expression that are
164
// not separated by any other operator. Action is invoked between the
165
// actions for the two terms.
166
{
167             fixOpStack(RBBINode.precOpCat);
168             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
169             RBBINode catNode = pushNewNode(RBBINode.opCat);
170             catNode.fLeftChild = operandNode;
171             operandNode.fParent = catNode;
172         }
173         break;
174
175     case RBBIRuleParseTable.doLParen:
176         // Open Paren.
177
// The openParen node is a dummy operation type with a low precedence,
178
// which has the affect of ensuring that any real binary op that
179
// follows within the parens binds more tightly to the operands than
180
// stuff outside of the parens.
181
pushNewNode(RBBINode.opLParen);
182         break;
183
184     case RBBIRuleParseTable.doExprRParen:
185         fixOpStack(RBBINode.precLParen);
186         break;
187
188     case RBBIRuleParseTable.doNOP:
189         break;
190
191     case RBBIRuleParseTable.doStartAssign:
192         // We've just scanned "$variable = "
193
// The top of the node stack has the $variable ref node.
194

195         // Save the start position of the RHS text in the StartExpression node
196
// that precedes the $variableReference node on the stack.
197
// This will eventually be used when saving the full $variable replacement
198
// text as a string.
199
n = fNodeStack[fNodeStackPtr-1];
200         n.fFirstPos = fNextIndex; // move past the '='
201

202         // Push a new start-of-expression node; needed to keep parse of the
203
// RHS expression happy.
204
pushNewNode(RBBINode.opStart);
205         break;
206
207
208
209
210     case RBBIRuleParseTable.doEndAssign:
211         {
212             // We have reached the end of an assignement statement.
213
// Current scan char is the ';' that terminates the assignment.
214

215             // Terminate expression, leaves expression parse tree rooted in TOS node.
216
fixOpStack(RBBINode.precStart);
217
218             RBBINode startExprNode = fNodeStack[fNodeStackPtr-2];
219             RBBINode varRefNode = fNodeStack[fNodeStackPtr-1];
220             RBBINode RHSExprNode = fNodeStack[fNodeStackPtr];
221
222             // Save original text of right side of assignment, excluding the terminating ';'
223
// in the root of the node for the right-hand-side expression.
224
RHSExprNode.fFirstPos = startExprNode.fFirstPos;
225             RHSExprNode.fLastPos = fScanIndex;
226             // fRB.fRules.extractBetween(RHSExprNode.fFirstPos, RHSExprNode.fLastPos, RHSExprNode.fText);
227
RHSExprNode.fText = fRB.fRules.substring(RHSExprNode.fFirstPos, RHSExprNode.fLastPos);
228
229             // Expression parse tree becomes l. child of the $variable reference node.
230
varRefNode.fLeftChild = RHSExprNode;
231             RHSExprNode.fParent = varRefNode;
232
233             // Make a symbol table entry for the $variableRef node.
234
fSymbolTable.addEntry(varRefNode.fText, varRefNode);
235  
236             // Clean up the stack.
237
fNodeStackPtr-=3;
238             break;
239         }
240
241     case RBBIRuleParseTable.doEndOfRule:
242         {
243         fixOpStack(RBBINode.precStart); // Terminate expression, leaves expression
244

245         if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("rtree") >= 0) {printNodeStack("end of rule");}
246         Assert.assrt(fNodeStackPtr == 1);
247
248         // If this rule includes a look-ahead '/', add a endMark node to the
249
// expression tree.
250
if (fLookAheadRule) {
251             RBBINode thisRule = fNodeStack[fNodeStackPtr];
252             RBBINode endNode = pushNewNode(RBBINode.endMark);
253             RBBINode catNode = pushNewNode(RBBINode.opCat);
254             fNodeStackPtr -= 2;
255             catNode.fLeftChild = thisRule;
256             catNode.fRightChild = endNode;
257             fNodeStack[fNodeStackPtr] = catNode;
258             endNode.fVal = fRuleNum;
259             endNode.fLookAheadEnd = true;
260         }
261
262         // All rule expressions are ORed together.
263
// The ';' that terminates an expression really just functions as a '|' with
264
// a low operator prededence.
265
//
266
// Each of the four sets of rules are collected separately.
267
// (forward, reverse, safe_forward, safe_reverse)
268
// OR this rule into the appropriate group of them.
269
//
270

271         int destRules = (fReverseRule? fRB.fReverseTree : fRB.fDefaultTree);
272
273         if (fRB.fTreeRoots[destRules] != null) {
274             // This is not the first rule encounted.
275
// OR previous stuff (from *destRules)
276
// with the current rule expression (on the Node Stack)
277
// with the resulting OR expression going to *destRules
278
//
279
RBBINode thisRule = fNodeStack[fNodeStackPtr];
280             RBBINode prevRules = fRB.fTreeRoots[destRules];
281             RBBINode orNode = pushNewNode(RBBINode.opOr);
282             orNode.fLeftChild = prevRules;
283             prevRules.fParent = orNode;
284             orNode.fRightChild = thisRule;
285             thisRule.fParent = orNode;
286             fRB.fTreeRoots[destRules] = orNode;
287         }
288         else
289         {
290             // This is the first rule encountered (for this direction).
291
// Just move its parse tree from the stack to *destRules.
292
fRB.fTreeRoots[destRules] = fNodeStack[fNodeStackPtr];
293         }
294         fReverseRule = false; // in preparation for the next rule.
295
fLookAheadRule = false;
296         fNodeStackPtr = 0;
297         }
298         break;
299
300
301     case RBBIRuleParseTable.doRuleError:
302         error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
303         returnVal = false;
304         break;
305
306
307     case RBBIRuleParseTable.doVariableNameExpectedErr:
308         error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
309         break;
310
311
312     //
313
// Unary operands + ? *
314
// These all appear after the operand to which they apply.
315
// When we hit one, the operand (may be a whole sub expression)
316
// will be on the top of the stack.
317
// Unary Operator becomes TOS, with the old TOS as its one child.
318
case RBBIRuleParseTable.doUnaryOpPlus:
319         {
320             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
321             RBBINode plusNode = pushNewNode(RBBINode.opPlus);
322             plusNode.fLeftChild = operandNode;
323             operandNode.fParent = plusNode;
324         }
325         break;
326
327     case RBBIRuleParseTable.doUnaryOpQuestion:
328         {
329             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
330             RBBINode qNode = pushNewNode(RBBINode.opQuestion);
331             qNode.fLeftChild = operandNode;
332             operandNode.fParent = qNode;
333         }
334         break;
335
336     case RBBIRuleParseTable.doUnaryOpStar:
337         {
338             RBBINode operandNode = fNodeStack[fNodeStackPtr--];
339             RBBINode starNode = pushNewNode(RBBINode.opStar);
340             starNode.fLeftChild = operandNode;
341             operandNode.fParent = starNode;
342         }
343         break;
344
345     case RBBIRuleParseTable.doRuleChar:
346         // A "Rule Character" is any single character that is a literal part
347
// of the regular expression. Like a, b and c in the expression "(abc*) | [:L:]"
348
// These are pretty uncommon in break rules; the terms are more commonly
349
// sets. To keep things uniform, treat these characters like as
350
// sets that just happen to contain only one character.
351
{
352             n = pushNewNode(RBBINode.setRef);
353             String JavaDoc s = (new StringBuffer JavaDoc().append((char)fC.fChar)).toString();
354             findSetFor(s, n, null);
355             n.fFirstPos = fScanIndex;
356             n.fLastPos = fNextIndex;
357             n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
358             break;
359         }
360
361     case RBBIRuleParseTable.doDotAny:
362         // scanned a ".", meaning match any single character.
363
{
364             n = pushNewNode(RBBINode.setRef);
365             findSetFor(kAny, n, null);
366             n.fFirstPos = fScanIndex;
367             n.fLastPos = fNextIndex;
368             n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
369             break;
370         }
371
372     case RBBIRuleParseTable.doSlash:
373         // Scanned a '/', which identifies a look-ahead break position in a rule.
374
n = pushNewNode(RBBINode.lookAhead);
375         n.fVal = fRuleNum;
376         n.fFirstPos = fScanIndex;
377         n.fLastPos = fNextIndex;
378         n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
379         fLookAheadRule = true;
380         break;
381
382
383     case RBBIRuleParseTable.doStartTagValue:
384         // Scanned a '{', the opening delimiter for a tag value within a rule.
385
n = pushNewNode(RBBINode.tag);
386         n.fVal = 0;
387         n.fFirstPos = fScanIndex;
388         n.fLastPos = fNextIndex;
389         break;
390
391     case RBBIRuleParseTable.doTagDigit:
392         // Just scanned a decimal digit that's part of a tag value
393
{
394             n = fNodeStack[fNodeStackPtr];
395             int v = Character.digit((char)fC.fChar, 10);
396             n.fVal = n.fVal*10 + v;
397             break;
398         }
399
400     case RBBIRuleParseTable.doTagValue:
401         n = fNodeStack[fNodeStackPtr];
402         n.fLastPos = fNextIndex;
403         n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
404         break;
405
406     case RBBIRuleParseTable.doTagExpectedError:
407         error(RBBIRuleBuilder.U_BRK_MALFORMED_RULE_TAG);
408         returnVal = false;
409         break;
410
411     case RBBIRuleParseTable.doOptionStart:
412         // Scanning a !!option. At the start of string.
413
fOptionStart = fScanIndex;
414         break;
415
416     case RBBIRuleParseTable.doOptionEnd:
417         {
418             String JavaDoc opt = fRB.fRules.substring(fOptionStart, fScanIndex);
419             if (opt.equals("chain")) {
420                 fRB.fChainRules = true;
421             } else if (opt.equals("LBCMNoChain")) {
422                 fRB.fLBCMNoChain = true;
423             } else if (opt.equals("forward")) {
424                 fRB.fDefaultTree = fRB.fForwardTree;
425             } else if (opt.equals("reverse")) {
426                 fRB.fDefaultTree = fRB.fReverseTree;
427             } else if (opt.equals("safe_forward")) {
428                 fRB.fDefaultTree = fRB.fSafeFwdTree;
429             } else if (opt.equals("safe_reverse")) {
430                 fRB.fDefaultTree = fRB.fSafeRevTree;
431             } else if (opt.equals("lookAheadHardBreak")) {
432                 fRB.fLookAheadHardBreak = true;
433             } else {
434                 error(RBBIRuleBuilder.U_BRK_UNRECOGNIZED_OPTION);
435             }
436             break;
437         }
438
439     case RBBIRuleParseTable.doReverseDir:
440         fReverseRule = true;
441         break;
442
443     case RBBIRuleParseTable.doStartVariableName:
444         n = pushNewNode(RBBINode.varRef);
445         n.fFirstPos = fScanIndex;
446         break;
447
448     case RBBIRuleParseTable.doEndVariableName:
449         n = fNodeStack[fNodeStackPtr];
450         if (n==null || n.fType != RBBINode.varRef) {
451             error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
452             break;
453         }
454         n.fLastPos = fScanIndex;
455         n.fText = fRB.fRules.substring(n.fFirstPos+1, n.fLastPos);
456         // Look the newly scanned name up in the symbol table
457
// If there's an entry, set the l. child of the var ref to the replacement expression.
458
// (We also pass through here when scanning assignments, but no harm is done, other
459
// than a slight wasted effort that seems hard to avoid. Lookup will be null)
460
n.fLeftChild = fSymbolTable.lookupNode(n.fText);
461         break;
462
463     case RBBIRuleParseTable.doCheckVarDef:
464         n = fNodeStack[fNodeStackPtr];
465         if (n.fLeftChild == null) {
466             error(RBBIRuleBuilder.U_BRK_UNDEFINED_VARIABLE);
467             returnVal = false;
468         }
469         break;
470
471     case RBBIRuleParseTable.doExprFinished:
472         break;
473
474     case RBBIRuleParseTable.doRuleErrorAssignExpr:
475         error(RBBIRuleBuilder.U_BRK_ASSIGN_ERROR);
476         returnVal = false;
477         break;
478
479     case RBBIRuleParseTable.doExit:
480         returnVal = false;
481         break;
482
483     case RBBIRuleParseTable.doScanUnicodeSet:
484         scanSet();
485         break;
486
487     default:
488         error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
489         returnVal = false;
490         break;
491     }
492     return returnVal;
493 }
494
495
496
497
498 //----------------------------------------------------------------------------------------
499
//
500
// Error Throw and IllegalArgumentException in response to a rule parse error.
501
//
502
//----------------------------------------------------------------------------------------
503
void error(int e) {
504     String JavaDoc s = "Error " + e + " at line " + fLineNum + " column " + fCharNum;
505     IllegalArgumentException JavaDoc ex = new IllegalArgumentException JavaDoc(s);
506     throw ex;
507     
508  }
509  
510
511
512
513
514 //----------------------------------------------------------------------------------------
515
//
516
// fixOpStack The parse stack holds partially assembled chunks of the parse tree.
517
// An entry on the stack may be as small as a single setRef node,
518
// or as large as the parse tree
519
// for an entire expression (this will be the one item left on the stack
520
// when the parsing of an RBBI rule completes.
521
//
522
// This function is called when a binary operator is encountered.
523
// It looks back up the stack for operators that are not yet associated
524
// with a right operand, and if the precedence of the stacked operator >=
525
// the precedence of the current operator, binds the operand left,
526
// to the previously encountered operator.
527
//
528
//----------------------------------------------------------------------------------------
529
void fixOpStack(int p) {
530     RBBINode n;
531     // printNodeStack("entering fixOpStack()");
532
for (;;) {
533         n = fNodeStack[fNodeStackPtr-1]; // an operator node
534
if (n.fPrecedence == 0) {
535             System.out.print("RBBIRuleScanner.fixOpStack, bad operator node");
536             error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
537             return;
538         }
539
540         if (n.fPrecedence < p || n.fPrecedence <= RBBINode.precLParen) {
541             // The most recent operand goes with the current operator,
542
// not with the previously stacked one.
543
break;
544         }
545             // Stack operator is a binary op ( '|' or concatenation)
546
// TOS operand becomes right child of this operator.
547
// Resulting subexpression becomes the TOS operand.
548
n.fRightChild = fNodeStack[fNodeStackPtr];
549             fNodeStack[fNodeStackPtr].fParent = n;
550             fNodeStackPtr--;
551         // printNodeStack("looping in fixOpStack() ");
552
}
553
554     if (p <= RBBINode.precLParen) {
555         // Scan is at a right paren or end of expression.
556
// The scanned item must match the stack, or else there was an error.
557
// Discard the left paren (or start expr) node from the stack,
558
// leaving the completed (sub)expression as TOS.
559
if (n.fPrecedence != p) {
560                 // Right paren encountered matched start of expression node, or
561
// end of expression matched with a left paren node.
562
error(RBBIRuleBuilder.U_BRK_MISMATCHED_PAREN);
563             }
564             fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr];
565             fNodeStackPtr--;
566             // Delete the now-discarded LParen or Start node.
567
// delete n;
568
}
569     // printNodeStack("leaving fixOpStack()");
570
}
571
572
573 //----------------------------------------------------------------------------
574
//
575
// RBBISetTableEl is an entry in the hash table of UnicodeSets that have
576
// been encountered. The val Node will be of nodetype uset
577
// and contain pointers to the actual UnicodeSets.
578
// The Key is the source string for initializing the set.
579
//
580
// The hash table is used to avoid creating duplicate
581
// unnamed (not $var references) UnicodeSets.
582
//
583
//----------------------------------------------------------------------------
584
static class RBBISetTableEl {
585     String JavaDoc key;
586     RBBINode val;
587 }
588
589
590 //----------------------------------------------------------------------------------------
591
//
592
// findSetFor given a String,
593
// - find the corresponding Unicode Set (uset node)
594
// (create one if necessary)
595
// - Set fLeftChild of the caller's node (should be a setRef node)
596
// to the uset node
597
// Maintain a hash table of uset nodes, so the same one is always used
598
// for the same string.
599
// If a "to adopt" set is provided and we haven't seen this key before,
600
// add the provided set to the hash table.
601
// If the string is one (32 bit) char in length, the set contains
602
// just one element which is the char in question.
603
// If the string is "any", return a set containing all chars.
604
//
605
//----------------------------------------------------------------------------------------
606
void findSetFor(String JavaDoc s, RBBINode node, UnicodeSet setToAdopt) {
607
608     RBBISetTableEl el;
609
610     // First check whether we've already cached a set for this string.
611
// If so, just use the cached set in the new node.
612
// delete any set provided by the caller, since we own it.
613
el = (RBBISetTableEl)fSetTable.get(s);
614     if (el != null) {
615         node.fLeftChild = el.val;
616         Assert.assrt(node.fLeftChild.fType == RBBINode.uset);
617         return;
618     }
619
620     // Haven't seen this set before.
621
// If the caller didn't provide us with a prebuilt set,
622
// create a new UnicodeSet now.
623
if (setToAdopt == null) {
624         if (s.equals(kAny)) {
625             setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
626         } else {
627             int c;
628             c = UTF16.charAt(s, 0);
629             setToAdopt = new UnicodeSet(c, c);
630         }
631     }
632
633     //
634
// Make a new uset node to refer to this UnicodeSet
635
// This new uset node becomes the child of the caller's setReference node.
636
//
637
RBBINode usetNode = new RBBINode(RBBINode.uset);
638     usetNode.fInputSet = setToAdopt;
639     usetNode.fParent = node;
640     node.fLeftChild = usetNode;
641     usetNode.fText = s;
642
643
644     //
645
// Add the new uset node to the list of all uset nodes.
646
//
647
fRB.fUSetNodes.add(usetNode);
648
649
650     //
651
// Add the new set to the set hash table.
652
//
653
el = new RBBISetTableEl();
654     el.key = s;
655     el.val = usetNode;
656     fSetTable.put(el.key, el);
657
658     return;
659 }
660
661
662
663 //
664
// Assorted Unicode character constants.
665
// Numeric because there is no portable way to enter them as literals.
666
// (Think EBCDIC).
667
//
668
static final int chNEL = 0x85; // NEL newline variant
669
static final int chLS = 0x2028; // Unicode Line Separator
670

671
672 //----------------------------------------------------------------------------------------
673
//
674
// stripRules Return a rules string without unnecessary
675
// characters.
676
//
677
//----------------------------------------------------------------------------------------
678
static String JavaDoc stripRules(String JavaDoc rules) {
679     StringBuffer JavaDoc strippedRules = new StringBuffer JavaDoc();
680     int rulesLength = rules.length();
681     for (int idx = 0; idx < rulesLength; ) {
682         char ch = rules.charAt(idx++);
683         if (ch == '#') {
684             while (idx < rulesLength
685                 && ch != '\r' && ch != '\n' && ch != chNEL)
686             {
687                 ch = rules.charAt(idx++);
688             }
689         }
690         if (!UCharacter.isISOControl(ch)) {
691             strippedRules.append(ch);
692         }
693     }
694     return strippedRules.toString();
695 }
696
697
698 //----------------------------------------------------------------------------------------
699
//
700
// nextCharLL Low Level Next Char from rule input source.
701
// Get a char from the input character iterator,
702
// keep track of input position for error reporting.
703
//
704
//----------------------------------------------------------------------------------------
705
int nextCharLL() {
706     int ch;
707
708     if (fNextIndex >= fRB.fRules.length()) {
709         return -1;
710     }
711     ch = UTF16.charAt(fRB.fRules, fNextIndex);
712     fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex, 1);
713
714     if (ch == '\r' ||
715         ch == chNEL ||
716         ch == chLS ||
717         ch == '\n' && fLastChar != '\r') {
718         // Character is starting a new line. Bump up the line number, and
719
// reset the column to 0.
720
fLineNum++;
721         fCharNum=0;
722         if (fQuoteMode) {
723             error(RBBIRuleBuilder.U_BRK_NEW_LINE_IN_QUOTED_STRING);
724             fQuoteMode = false;
725         }
726     }
727     else {
728         // Character is not starting a new line. Except in the case of a
729
// LF following a CR, increment the column position.
730
if (ch != '\n') {
731             fCharNum++;
732         }
733     }
734     fLastChar = ch;
735     return ch;
736 }
737
738
739 //---------------------------------------------------------------------------------
740
//
741
// nextChar for rules scanning. At this level, we handle stripping
742
// out comments and processing backslash character escapes.
743
// The rest of the rules grammar is handled at the next level up.
744
//
745
//---------------------------------------------------------------------------------
746
void nextChar(RBBIRuleChar c) {
747
748     // Unicode Character constants needed for the processing done by nextChar(),
749
// in hex because literals wont work on EBCDIC machines.
750

751     fScanIndex = fNextIndex;
752     c.fChar = nextCharLL();
753     c.fEscaped = false;
754
755     //
756
// check for '' sequence.
757
// These are recognized in all contexts, whether in quoted text or not.
758
//
759
if (c.fChar == '\'') {
760         if (UTF16.charAt(fRB.fRules, fNextIndex) == '\'') {
761             c.fChar = nextCharLL(); // get nextChar officially so character counts
762
c.fEscaped = true; // stay correct.
763
}
764         else
765         {
766             // Single quote, by itself.
767
// Toggle quoting mode.
768
// Return either '(' or ')', because quotes cause a grouping of the quoted text.
769
fQuoteMode = !fQuoteMode;
770             if (fQuoteMode == true) {
771                 c.fChar = '(';
772             } else {
773                 c.fChar = ')';
774             }
775             c.fEscaped = false; // The paren that we return is not escaped.
776
return;
777         }
778     }
779
780     if (fQuoteMode) {
781         c.fEscaped = true;
782     }
783     else
784     {
785         // We are not in a 'quoted region' of the source.
786
//
787
if (c.fChar == '#') {
788             // Start of a comment. Consume the rest of it.
789
// The new-line char that terminates the comment is always returned.
790
// It will be treated as white-space, and serves to break up anything
791
// that might otherwise incorrectly clump together with a comment in
792
// the middle (a variable name, for example.)
793
for (;;) {
794                 c.fChar = nextCharLL();
795                 if (c.fChar == (int)-1 || // EOF
796
c.fChar == '\r' ||
797                     c.fChar == '\n' ||
798                     c.fChar == chNEL ||
799                     c.fChar == chLS) {break;}
800             }
801         }
802         if (c.fChar == (int)-1) {
803             return;
804         }
805
806         //
807
// check for backslash escaped characters.
808
// Use String.unescapeAt() to handle them.
809
//
810
if (c.fChar == '\\') {
811             c.fEscaped = true;
812             int [] unescapeIndex = new int[1];
813             unescapeIndex[0] = fNextIndex;
814             c.fChar = Utility.unescapeAt(fRB.fRules, unescapeIndex);
815             if (unescapeIndex[0] == fNextIndex) {
816                 error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED);
817             }
818             
819             fCharNum += unescapeIndex[0]-fNextIndex;
820             fNextIndex = unescapeIndex[0];
821         }
822     }
823     // putc(c.fChar, stdout);
824
}
825
826 //---------------------------------------------------------------------------------
827
//
828
// Parse RBBI rules. The state machine for rules parsing is here.
829
// The state tables are hand-written in the file rbbirpt.txt,
830
// and converted to the form used here by a perl
831
// script rbbicst.pl
832
//
833
//---------------------------------------------------------------------------------
834
void parse() {
835     int state;
836     RBBIRuleParseTable.RBBIRuleTableElement tableEl;
837
838
839     state = 1;
840     nextChar(fC);
841     //
842
// Main loop for the rule parsing state machine.
843
// Runs once per state transition.
844
// Each time through optionally performs, depending on the state table,
845
// - an advance to the the next input char
846
// - an action to be performed.
847
// - pushing or popping a state to/from the local state return stack.
848
//
849
for (;;) {
850         // Quit if state == 0. This is the normal way to exit the state machine.
851
//
852
if (state == 0) {
853             break;
854         }
855
856         // Find the state table element that matches the input char from the rule, or the
857
// class of the input character. Start with the first table row for this
858
// state, then linearly scan forward until we find a row that matches the
859
// character. The last row for each state always matches all characters, so
860
// the search will stop there, if not before.
861
//
862
tableEl = RBBIRuleParseTable.gRuleParseStateTable[state];
863             if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("scan")>=0) {
864                 System.out.println("char, line, col = (\'" + (char)fC.fChar + "\', " + fLineNum + ", " + fCharNum +
865                         " state = " + tableEl.fStateName);
866             }
867
868         for (int tableRow=state; ;tableRow++) { // loop over the state table rows associated with this state.
869
tableEl = RBBIRuleParseTable.gRuleParseStateTable[tableRow];
870             if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("scan")>=0) { System.out.print(".");}
871             if (tableEl.fCharClass < 127 && fC.fEscaped == false && tableEl.fCharClass == fC.fChar) {
872                 // Table row specified an individual character, not a set, and
873
// the input character is not escaped, and
874
// the input character matched it.
875
break;
876             }
877             if (tableEl.fCharClass == 255) {
878                 // Table row specified default, match anything character class.
879
break;
880             }
881             if (tableEl.fCharClass == 254 && fC.fEscaped) {
882                 // Table row specified "escaped" and the char was escaped.
883
break;
884             }
885             if (tableEl.fCharClass == 253 && fC.fEscaped &&
886                 (fC.fChar == 0x50 || fC.fChar == 0x70 )) {
887                 // Table row specified "escaped P" and the char is either 'p' or 'P'.
888
break;
889             }
890             if (tableEl.fCharClass == 252 && fC.fChar == (int)-1) {
891                 // Table row specified eof and we hit eof on the input.
892
break;
893             }
894
895             if (tableEl.fCharClass >= 128 && tableEl.fCharClass < 240 && // Table specs a char class &&
896
fC.fEscaped == false && // char is not escaped &&
897
fC.fChar != (int)-1) { // char is not EOF
898
UnicodeSet uniset = fRuleSets[tableEl.fCharClass-128];
899                 if (uniset.contains(fC.fChar)) {
900                     // Table row specified a character class, or set of characters,
901
// and the current char matches it.
902
break;
903                 }
904             }
905        }
906         
907         if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("scan")>=0) { System.out.println("");}
908         //
909
// We've found the row of the state table that matches the current input
910
// character from the rules string.
911
// Perform any action specified by this row in the state table.
912
if (doParseActions(tableEl.fAction) == false) {
913             // Break out of the state machine loop if the
914
// the action signalled some kind of error, or
915
// the action was to exit, occurs on normal end-of-rules-input.
916
break;
917         }
918
919         if (tableEl.fPushState != 0) {
920             fStackPtr++;
921             if (fStackPtr >= kStackSize) {
922                 System.out.println("RBBIRuleScanner.parse() - state stack overflow.");
923                 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
924             }
925             fStack[fStackPtr] = tableEl.fPushState;
926         }
927
928         if (tableEl.fNextChar) {
929             nextChar(fC);
930         }
931
932         // Get the next state from the table entry, or from the
933
// state stack if the next state was specified as "pop".
934
if (tableEl.fNextState != 255) {
935             state = tableEl.fNextState;
936         } else {
937             state = fStack[fStackPtr];
938             fStackPtr--;
939             if (fStackPtr < 0) {
940                 System.out.println("RBBIRuleScanner.parse() - state stack underflow.");
941                 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
942             }
943         }
944
945     }
946
947     //
948
// If there were NO user specified reverse rules, set up the equivalent of ".*;"
949
//
950
if (fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] == null) {
951         fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] = pushNewNode(RBBINode.opStar);
952         RBBINode operand = pushNewNode(RBBINode.setRef);
953         findSetFor(kAny, operand, null);
954         fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].fLeftChild = operand;
955         operand.fParent = fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree];
956         fNodeStackPtr -= 2;
957     }
958
959
960     //
961
// Parsing of the input RBBI rules is complete.
962
// We now have a parse tree for the rule expressions
963
// and a list of all UnicodeSets that are referenced.
964
//
965
if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("symbols")>=0) {
966         fSymbolTable.rbbiSymtablePrint();
967         }
968     if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("ptree")>=0)
969     {
970         System.out.println("Completed Forward Rules Parse Tree...");
971         fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree].printTree(true);
972         System.out.println("\nCompleted Reverse Rules Parse Tree...");
973         fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].printTree(true);
974         System.out.println("\nCompleted Safe Point Forward Rules Parse Tree...");
975         if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree] == null) {
976             System.out.println(" -- null -- ");
977         } else {
978             fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree].printTree(true);
979         }
980         System.out.println("\nCompleted Safe Point Reverse Rules Parse Tree...");
981         if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree] == null) {
982             System.out.println(" -- null -- ");
983         } else {
984             fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree].printTree(true);
985         }
986     }
987 }
988
989
990 //---------------------------------------------------------------------------------
991
//
992
// printNodeStack for debugging...
993
//
994
//---------------------------------------------------------------------------------
995
void printNodeStack(String JavaDoc title) {
996     int i;
997     System.out.println(title + ". Dumping node stack...\n");
998     for (i=fNodeStackPtr; i>0; i--) {fNodeStack[i].printTree(true);}
999 }
1000
1001
1002
1003
1004//---------------------------------------------------------------------------------
1005
//
1006
// pushNewNode create a new RBBINode of the specified type and push it
1007
// onto the stack of nodes.
1008
//
1009
//---------------------------------------------------------------------------------
1010
RBBINode pushNewNode(int nodeType) {
1011    fNodeStackPtr++;
1012    if (fNodeStackPtr >= kStackSize) {
1013        System.out.println("RBBIRuleScanner.pushNewNode - stack overflow.");
1014        error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
1015    }
1016    fNodeStack[fNodeStackPtr] = new RBBINode(nodeType);
1017    return fNodeStack[fNodeStackPtr];
1018}
1019
1020
1021
1022//---------------------------------------------------------------------------------
1023
//
1024
// scanSet Construct a UnicodeSet from the text at the current scan
1025
// position. Advance the scan position to the first character
1026
// after the set.
1027
//
1028
// A new RBBI setref node referring to the set is pushed onto the node
1029
// stack.
1030
//
1031
// The scan position is normally under the control of the state machine
1032
// that controls rule parsing. UnicodeSets, however, are parsed by
1033
// the UnicodeSet constructor, not by the RBBI rule parser.
1034
//
1035
//---------------------------------------------------------------------------------
1036
void scanSet() {
1037    UnicodeSet uset = null;
1038    int startPos;
1039    ParsePosition JavaDoc pos = new ParsePosition JavaDoc(fScanIndex);
1040    int i;
1041
1042    startPos = fScanIndex;
1043    try {
1044        uset = new UnicodeSet(fRB.fRules, pos, fSymbolTable, UnicodeSet.IGNORE_SPACE);
1045    } catch (Exception JavaDoc e) { // TODO: catch fewer exception types.
1046
// Repackage UnicodeSet errors as RBBI rule builder errors, with location info.
1047
error(RBBIRuleBuilder.U_BRK_MALFORMED_SET);
1048    }
1049
1050    // Verify that the set contains at least one code point.
1051
//
1052
if (uset.isEmpty()) {
1053        // This set is empty.
1054
// Make it an error, because it almost certainly is not what the user wanted.
1055
// Also, avoids having to think about corner cases in the tree manipulation code
1056
// that occurs later on.
1057
// TODO: this shouldn't be an error; it does happen.
1058
error(RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET);
1059    }
1060
1061
1062    // Advance the RBBI parse postion over the UnicodeSet pattern.
1063
// Don't just set fScanIndex because the line/char positions maintained
1064
// for error reporting would be thrown off.
1065
i = pos.getIndex();
1066    for (;;) {
1067        if (fNextIndex >= i) {
1068            break;
1069        }
1070        nextCharLL();
1071    }
1072
1073        RBBINode n;
1074
1075        n = pushNewNode(RBBINode.setRef);
1076        n.fFirstPos = startPos;
1077        n.fLastPos = fNextIndex;
1078        n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
1079        // findSetFor() serves several purposes here:
1080
// - Adopts storage for the UnicodeSet, will be responsible for deleting.
1081
// - Mantains collection of all sets in use, needed later for establishing
1082
// character categories for run time engine.
1083
// - Eliminates mulitiple instances of the same set.
1084
// - Creates a new uset node if necessary (if this isn't a duplicate.)
1085
findSetFor(n.fText, n, uset);
1086    }
1087
1088}
1089
1090
Popular Tags