RBBIRuleScanner


1   /*
2    *******************************************************************************
3    * Copyright (C) 2003-2006,
4    * International Business Machines Corporation and others. All Rights Reserved.
5    *******************************************************************************
6    */
7    
8   package com.ibm.icu.text;
9   
10  import java.util.HashMap  ;
11  import java.lang.IllegalArgumentException  ;
12  import java.text.ParsePosition  ;
13  
14  import com.ibm.icu.lang.UCharacter;
15  import com.ibm.icu.impl.Assert;
16  import com.ibm.icu.impl.Utility;
17  
18  /**
19    *  This class is part of the Rule Based Break Iterator rule compiler.
20    *  It scans the rules and builds the parse tree.
21    *  There is no public API here.
22    *  @internal
23    */
24  class RBBIRuleScanner {
25      
26      private  final int    kStackSize = 100;               // The size of the state stack for
27      //   rules parsing.  Corresponds roughly
28      //   to the depth of parentheses nesting
29      //   that is allowed in the rules.
30      
31      static class RBBIRuleChar {
32          int             fChar;
33          boolean         fEscaped;
34      }
35  
36  
37  
38  RBBIRuleBuilder               fRB;              // The rule builder that we are part of.
39  
40  int                       fScanIndex;        // Index of current character being processed
41                                                   //   in the rule input string.
42  int                       fNextIndex;        // Index of the next character, which
43                                                   //   is the first character not yet scanned.
44  boolean                  fQuoteMode;        // Scan is in a 'quoted region'
45  int                       fLineNum;          // Line number in input file.
46  int                       fCharNum;          // Char position within the line.
47  int                       fLastChar;         // Previous char, needed to count CR-LF
48                                                   //   as a single line, not two.
49  
50  RBBIRuleChar              fC = new RBBIRuleChar();    // Current char for parse state machine
51                                                   //   processing.
52  String                      fVarName;          // $variableName, valid when we've just
53                                                   //   scanned one.
54  
55  
56  short  fStack[] = new short[kStackSize];  // State stack, holds state pushes
57  int                       fStackPtr;           //  and pops as specified in the state
58                                                     //  transition rules.
59  
60  RBBINode  fNodeStack[] = new RBBINode[kStackSize]; // Node stack, holds nodes created
61                                                         //  during the parse of a rule
62  int                        fNodeStackPtr;
63  
64  
65  boolean                          fReverseRule;     // True if the rule currently being scanned
66                                                   //  is a reverse direction rule (if it
67                                                   //  starts with a '!')
68  
69  boolean                          fLookAheadRule;   // True if the rule includes a '/'
70                                                   //   somewhere within it.
71  
72  RBBISymbolTable              fSymbolTable;     // symbol table, holds definitions of
73                                                   //   $variable symbols.
74  
75  HashMap                   fSetTable = new HashMap  ();        // UnicocodeSet hash table, holds indexes to
76                                             //   the sets created while parsing rules.
77                                                   //   The key is the string used for creating
78                                                   //   the set.
79  
80  UnicodeSet      fRuleSets[] = new UnicodeSet[10];    // Unicode Sets that are needed during
81                                                   //  the scanning of RBBI rules.  The
82                                                   //  indicies for these are assigned by the
83                                                   //  perl script that builds the state tables.
84                                                   //  See rbbirpt.h.
85  
86  int                        fRuleNum;         // Counts each rule as it is scanned.
87  
88  int                        fOptionStart;     // Input index of start of a !!option
89                                                   //   keyword, while being scanned.
90  
91      
92  
93     static private String   gRuleSet_rule_char_pattern       = "[^[\\p{Z}\\u0020-\\u007f]-[\\p{L}]-[\\p{N}]]";
94     static private String   gRuleSet_name_char_pattern       = "[_\\p{L}\\p{N}]";
95     static private String   gRuleSet_digit_char_pattern      = "[0-9]";
96     static private String   gRuleSet_name_start_char_pattern = "[_\\p{L}]";
97     static private String   gRuleSet_white_space_pattern     = "[\\p{Pattern_White_Space}]";
98     static private String   kAny =  "any";
99  
100     
101  
102 
103 //----------------------------------------------------------------------------------------
104 //
105 //  Constructor.
106 //
107 //----------------------------------------------------------------------------------------
108 RBBIRuleScanner(RBBIRuleBuilder rb)
109 {
110     fRB                 = rb;
111     fLineNum            = 1;
112 
113     //
114     //  Set up the constant Unicode Sets.
115     //     Note:  These could be made static and shared among
116     //            all instances of RBBIRuleScanners. 
117     fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char-128]       = new UnicodeSet(gRuleSet_rule_char_pattern);
118     fRuleSets[RBBIRuleParseTable.kRuleSet_white_space-128]     = new UnicodeSet(gRuleSet_white_space_pattern);  
119     fRuleSets[RBBIRuleParseTable.kRuleSet_name_char-128]       = new UnicodeSet(gRuleSet_name_char_pattern);
120     fRuleSets[RBBIRuleParseTable.kRuleSet_name_start_char-128] = new UnicodeSet(gRuleSet_name_start_char_pattern);
121     fRuleSets[RBBIRuleParseTable.kRuleSet_digit_char-128]      = new UnicodeSet(gRuleSet_digit_char_pattern);
122 
123     fSymbolTable = new RBBISymbolTable(this, rb.fRules);
124  }
125 
126 
127 
128 
129 //----------------------------------------------------------------------------------------
130 //
131 //  doParseAction        Do some action during rule parsing.
132 //                       Called by the parse state machine.
133 //                       Actions build the parse tree and Unicode Sets,
134 //                       and maintain the parse stack for nested expressions.
135 //
136 //----------------------------------------------------------------------------------------
137 boolean doParseActions(int action)  
138 {
139     RBBINode n       = null;
140 
141     boolean   returnVal = true;
142 
143     switch (action) {
144 
145     case RBBIRuleParseTable.doExprStart:
146         pushNewNode(RBBINode.opStart);
147         fRuleNum++;
148         break;
149 
150 
151     case RBBIRuleParseTable.doExprOrOperator:
152         {
153             fixOpStack(RBBINode.precOpCat);
154             RBBINode  operandNode = fNodeStack[fNodeStackPtr--];
155             RBBINode  orNode      = pushNewNode(RBBINode.opOr);
156             orNode.fLeftChild     = operandNode;
157             operandNode.fParent   = orNode;
158         }
159         break;
160 
161     case RBBIRuleParseTable.doExprCatOperator:
162         // concatenation operator.
163         // For the implicit concatenation of adjacent terms in an expression that are
164         //   not separated by any other operator.  Action is invoked between the
165         //   actions for the two terms.
166         {
167             fixOpStack(RBBINode.precOpCat);
168             RBBINode  operandNode = fNodeStack[fNodeStackPtr--];
169             RBBINode  catNode     = pushNewNode(RBBINode.opCat);
170             catNode.fLeftChild    = operandNode;
171             operandNode.fParent   = catNode;
172         }
173         break;
174 
175     case RBBIRuleParseTable.doLParen:
176         // Open Paren.
177         //   The openParen node is a dummy operation type with a low precedence,
178         //     which has the affect of ensuring that any real binary op that
179         //     follows within the parens binds more tightly to the operands than
180         //     stuff outside of the parens.
181         pushNewNode(RBBINode.opLParen);
182         break;
183 
184     case RBBIRuleParseTable.doExprRParen:
185         fixOpStack(RBBINode.precLParen);
186         break;
187 
188     case RBBIRuleParseTable.doNOP:
189         break;
190 
191     case RBBIRuleParseTable.doStartAssign:
192         // We've just scanned "$variable = "
193         // The top of the node stack has the $variable ref node.
194 
195         // Save the start position of the RHS text in the StartExpression node
196         //   that precedes the $variableReference node on the stack.
197         //   This will eventually be used when saving the full $variable replacement
198         //   text as a string.
199         n = fNodeStack[fNodeStackPtr-1];
200         n.fFirstPos = fNextIndex;              // move past the '='
201 
202         // Push a new start-of-expression node; needed to keep parse of the
203         //   RHS expression happy.
204         pushNewNode(RBBINode.opStart);
205         break;
206 
207 
208 
209 
210     case RBBIRuleParseTable.doEndAssign:
211         {
212             // We have reached the end of an assignement statement.
213             //   Current scan char is the ';' that terminates the assignment.
214 
215             // Terminate expression, leaves expression parse tree rooted in TOS node.
216             fixOpStack(RBBINode.precStart);
217 
218             RBBINode startExprNode  = fNodeStack[fNodeStackPtr-2];
219             RBBINode varRefNode     = fNodeStack[fNodeStackPtr-1];
220             RBBINode RHSExprNode    = fNodeStack[fNodeStackPtr];
221 
222             // Save original text of right side of assignment, excluding the terminating ';'
223             //  in the root of the node for the right-hand-side expression.
224             RHSExprNode.fFirstPos = startExprNode.fFirstPos;
225             RHSExprNode.fLastPos  = fScanIndex;
226             // fRB.fRules.extractBetween(RHSExprNode.fFirstPos, RHSExprNode.fLastPos, RHSExprNode.fText);
227             RHSExprNode.fText = fRB.fRules.substring(RHSExprNode.fFirstPos, RHSExprNode.fLastPos); 
228 
229             // Expression parse tree becomes l. child of the $variable reference node.
230             varRefNode.fLeftChild = RHSExprNode;
231             RHSExprNode.fParent   = varRefNode;
232 
233             // Make a symbol table entry for the $variableRef node.
234             fSymbolTable.addEntry(varRefNode.fText, varRefNode);
235  
236             // Clean up the stack.
237             fNodeStackPtr-=3;
238             break;
239         }
240 
241     case RBBIRuleParseTable.doEndOfRule:
242         {
243         fixOpStack(RBBINode.precStart);      // Terminate expression, leaves expression
244 
245         if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("rtree") >= 0) {printNodeStack("end of rule");}
246         Assert.assrt(fNodeStackPtr == 1);
247 
248         // If this rule includes a look-ahead '/', add a endMark node to the
249         //   expression tree.
250         if (fLookAheadRule) {
251             RBBINode  thisRule       = fNodeStack[fNodeStackPtr];
252             RBBINode  endNode        = pushNewNode(RBBINode.endMark);
253             RBBINode  catNode        = pushNewNode(RBBINode.opCat);
254             fNodeStackPtr -= 2;
255             catNode.fLeftChild       = thisRule;
256             catNode.fRightChild      = endNode;
257             fNodeStack[fNodeStackPtr] = catNode;
258             endNode.fVal             = fRuleNum;
259             endNode.fLookAheadEnd    = true;
260         }
261 
262         // All rule expressions are ORed together.
263         // The ';' that terminates an expression really just functions as a '|' with
264         //   a low operator prededence.
265         //
266         // Each of the four sets of rules are collected separately.
267         //  (forward, reverse, safe_forward, safe_reverse)
268         //  OR this rule into the appropriate group of them.
269         //
270         
271         int destRules = (fReverseRule? fRB.fReverseTree : fRB.fDefaultTree);
272 
273         if (fRB.fTreeRoots[destRules] != null) {
274             // This is not the first rule encounted.
275             // OR previous stuff  (from *destRules)
276             // with the current rule expression (on the Node Stack)
277             //  with the resulting OR expression going to *destRules
278             //
279             RBBINode  thisRule        = fNodeStack[fNodeStackPtr];
280             RBBINode  prevRules       = fRB.fTreeRoots[destRules];
281             RBBINode  orNode          = pushNewNode(RBBINode.opOr);
282             orNode.fLeftChild         = prevRules;
283             prevRules.fParent         = orNode;
284             orNode.fRightChild        = thisRule;
285             thisRule.fParent          = orNode;
286             fRB.fTreeRoots[destRules] = orNode;
287         }
288         else
289         {
290             // This is the first rule encountered (for this direction).
291             // Just move its parse tree from the stack to *destRules.
292             fRB.fTreeRoots[destRules] = fNodeStack[fNodeStackPtr];       
293         }
294         fReverseRule   = false;   // in preparation for the next rule.
295         fLookAheadRule = false;
296         fNodeStackPtr  = 0;
297         }
298         break;
299 
300 
301     case RBBIRuleParseTable.doRuleError:
302         error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
303         returnVal = false;
304         break;
305 
306 
307     case RBBIRuleParseTable.doVariableNameExpectedErr:
308         error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
309         break;
310 
311 
312     //
313     //  Unary operands  + ? *
314     //    These all appear after the operand to which they apply.
315     //    When we hit one, the operand (may be a whole sub expression)
316     //    will be on the top of the stack.
317     //    Unary Operator becomes TOS, with the old TOS as its one child.
318     case RBBIRuleParseTable.doUnaryOpPlus:
319         {
320             RBBINode  operandNode = fNodeStack[fNodeStackPtr--];
321             RBBINode  plusNode    = pushNewNode(RBBINode.opPlus);
322             plusNode.fLeftChild   = operandNode;
323             operandNode.fParent   = plusNode;
324         }
325         break;
326 
327     case RBBIRuleParseTable.doUnaryOpQuestion:
328         {
329             RBBINode  operandNode = fNodeStack[fNodeStackPtr--];
330             RBBINode  qNode       = pushNewNode(RBBINode.opQuestion);
331             qNode.fLeftChild      = operandNode;
332             operandNode.fParent   = qNode;
333         }
334         break;
335 
336     case RBBIRuleParseTable.doUnaryOpStar:
337         {
338             RBBINode  operandNode = fNodeStack[fNodeStackPtr--];
339             RBBINode  starNode    = pushNewNode(RBBINode.opStar);
340             starNode.fLeftChild   = operandNode;
341             operandNode.fParent   = starNode;
342         }
343         break;
344 
345     case RBBIRuleParseTable.doRuleChar:
346         // A "Rule Character" is any single character that is a literal part
347         // of the regular expression.  Like a, b and c in the expression "(abc*) | [:L:]"
348         // These are pretty uncommon in break rules; the terms are more commonly
349         //  sets.  To keep things uniform, treat these characters like as
350         // sets that just happen to contain only one character.
351         {
352             n = pushNewNode(RBBINode.setRef);
353             String   s = (new StringBuffer  ().append((char)fC.fChar)).toString();
354             findSetFor(s, n, null);
355             n.fFirstPos = fScanIndex;
356             n.fLastPos  = fNextIndex;
357             n.fText     = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
358             break;
359         }
360 
361     case RBBIRuleParseTable.doDotAny:
362         // scanned a ".", meaning match any single character.
363         {
364             n = pushNewNode(RBBINode.setRef);
365             findSetFor(kAny, n, null);
366             n.fFirstPos = fScanIndex;
367             n.fLastPos  = fNextIndex;
368             n.fText     = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
369             break;
370         }
371 
372     case RBBIRuleParseTable.doSlash:
373         // Scanned a '/', which identifies a look-ahead break position in a rule.
374         n = pushNewNode(RBBINode.lookAhead);
375         n.fVal      = fRuleNum;
376         n.fFirstPos = fScanIndex;
377         n.fLastPos  = fNextIndex;
378         n.fText     = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
379         fLookAheadRule = true;
380         break;
381 
382 
383     case RBBIRuleParseTable.doStartTagValue:
384         // Scanned a '{', the opening delimiter for a tag value within a rule.
385         n = pushNewNode(RBBINode.tag);
386         n.fVal      = 0;
387         n.fFirstPos = fScanIndex;
388         n.fLastPos  = fNextIndex;
389         break;
390 
391     case RBBIRuleParseTable.doTagDigit:
392         // Just scanned a decimal digit that's part of a tag value
393         {
394             n = fNodeStack[fNodeStackPtr];
395             int v = Character.digit((char)fC.fChar, 10);
396             n.fVal = n.fVal*10 + v;
397             break;
398         }
399 
400     case RBBIRuleParseTable.doTagValue:
401         n = fNodeStack[fNodeStackPtr];
402         n.fLastPos = fNextIndex;
403         n.fText    = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
404         break;
405 
406     case RBBIRuleParseTable.doTagExpectedError:
407         error(RBBIRuleBuilder.U_BRK_MALFORMED_RULE_TAG);
408         returnVal = false;
409         break;
410 
411     case RBBIRuleParseTable.doOptionStart:
412         // Scanning a !!option.   At the start of string.
413         fOptionStart = fScanIndex;
414         break;
415 
416     case RBBIRuleParseTable.doOptionEnd:
417         {
418             String   opt = fRB.fRules.substring(fOptionStart, fScanIndex);  
419             if (opt.equals("chain")) {
420                 fRB.fChainRules = true;
421             } else if (opt.equals("LBCMNoChain")) {
422                 fRB.fLBCMNoChain = true;
423             } else if (opt.equals("forward")) {
424                 fRB.fDefaultTree   = fRB.fForwardTree;
425             } else if (opt.equals("reverse")) {
426                 fRB.fDefaultTree   = fRB.fReverseTree;
427             } else if (opt.equals("safe_forward")) {
428                 fRB.fDefaultTree   = fRB.fSafeFwdTree;
429             } else if (opt.equals("safe_reverse")) {
430                 fRB.fDefaultTree   = fRB.fSafeRevTree;
431             } else if (opt.equals("lookAheadHardBreak")) {
432                 fRB.fLookAheadHardBreak = true;
433             } else {
434                 error(RBBIRuleBuilder.U_BRK_UNRECOGNIZED_OPTION);
435             }
436             break;
437         }
438 
439     case RBBIRuleParseTable.doReverseDir:
440         fReverseRule = true;
441         break;
442 
443     case RBBIRuleParseTable.doStartVariableName:
444         n = pushNewNode(RBBINode.varRef);
445         n.fFirstPos = fScanIndex;
446         break;
447 
448     case RBBIRuleParseTable.doEndVariableName:
449         n = fNodeStack[fNodeStackPtr];
450         if (n==null || n.fType != RBBINode.varRef) {
451             error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
452             break;
453         }
454         n.fLastPos = fScanIndex;
455         n.fText    = fRB.fRules.substring(n.fFirstPos+1, n.fLastPos);
456         // Look the newly scanned name up in the symbol table
457         //   If there's an entry, set the l. child of the var ref to the replacement expression.
458         //   (We also pass through here when scanning assignments, but no harm is done, other
459         //    than a slight wasted effort that seems hard to avoid.  Lookup will be null)
460         n.fLeftChild = fSymbolTable.lookupNode(n.fText);
461         break;
462 
463     case RBBIRuleParseTable.doCheckVarDef:
464         n = fNodeStack[fNodeStackPtr];
465         if (n.fLeftChild == null) {
466             error(RBBIRuleBuilder.U_BRK_UNDEFINED_VARIABLE);
467             returnVal = false;
468         }
469         break;
470 
471     case RBBIRuleParseTable.doExprFinished:
472         break;
473 
474     case RBBIRuleParseTable.doRuleErrorAssignExpr:
475         error(RBBIRuleBuilder.U_BRK_ASSIGN_ERROR);
476         returnVal = false;
477         break;
478 
479     case RBBIRuleParseTable.doExit:
480         returnVal = false;
481         break;
482 
483     case RBBIRuleParseTable.doScanUnicodeSet:
484         scanSet();
485         break;
486 
487     default:
488         error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
489         returnVal = false;
490         break;
491     }
492     return returnVal;
493 }
494 
495 
496 
497 
498 //----------------------------------------------------------------------------------------
499 //
500 //  Error         Throw and IllegalArgumentException in response to a rule parse error.
501 //
502 //----------------------------------------------------------------------------------------
503 void error(int e)  {
504     String   s = "Error " + e + " at line " + fLineNum + " column " + fCharNum;
505     IllegalArgumentException   ex = new IllegalArgumentException  (s);
506     throw ex;
507     
508  }
509  
510 
511 
512 
513 
514 //----------------------------------------------------------------------------------------
515 //
516 //  fixOpStack   The parse stack holds partially assembled chunks of the parse tree.
517 //               An entry on the stack may be as small as a single setRef node,
518 //               or as large as the parse tree
519 //               for an entire expression (this will be the one item left on the stack
520 //               when the parsing of an RBBI rule completes.
521 //
522 //               This function is called when a binary operator is encountered.
523 //               It looks back up the stack for operators that are not yet associated
524 //               with a right operand, and if the precedence of the stacked operator >=
525 //               the precedence of the current operator, binds the operand left,
526 //               to the previously encountered operator.
527 //
528 //----------------------------------------------------------------------------------------
529 void fixOpStack(int p)  {
530     RBBINode n;
531     // printNodeStack("entering fixOpStack()");
532     for (;;) {
533         n = fNodeStack[fNodeStackPtr-1];   // an operator node
534         if (n.fPrecedence == 0) {
535             System.out.print("RBBIRuleScanner.fixOpStack, bad operator node");
536             error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
537             return;
538         }
539 
540         if (n.fPrecedence < p || n.fPrecedence <= RBBINode.precLParen) {
541             // The most recent operand goes with the current operator,
542             //   not with the previously stacked one.
543             break;
544         }
545             // Stack operator is a binary op  ( '|' or concatenation)
546             //   TOS operand becomes right child of this operator.
547             //   Resulting subexpression becomes the TOS operand.
548             n.fRightChild = fNodeStack[fNodeStackPtr];
549             fNodeStack[fNodeStackPtr].fParent = n;
550             fNodeStackPtr--;
551         // printNodeStack("looping in fixOpStack()   ");
552     }
553 
554     if (p <= RBBINode.precLParen) {
555         // Scan is at a right paren or end of expression.
556         //  The scanned item must match the stack, or else there was an error.
557         //  Discard the left paren (or start expr) node from the stack,
558             //  leaving the completed (sub)expression as TOS.
559             if (n.fPrecedence != p) {
560                 // Right paren encountered matched start of expression node, or
561                 // end of expression matched with a left paren node.
562                 error(RBBIRuleBuilder.U_BRK_MISMATCHED_PAREN);
563             }
564             fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr];
565             fNodeStackPtr--;
566             // Delete the now-discarded LParen or Start node.
567             // delete n;
568     }
569     // printNodeStack("leaving fixOpStack()");
570 }
571 
572 
573 //----------------------------------------------------------------------------
574 //
575 //       RBBISetTableEl   is an entry in the hash table of UnicodeSets that have
576 //                        been encountered.  The val Node will be of nodetype uset
577 //                        and contain pointers to the actual UnicodeSets.
578 //                        The Key is the source string for initializing the set.
579 //
580 //                        The hash table is used to avoid creating duplicate
581 //                        unnamed (not $var references) UnicodeSets.
582 //
583 //----------------------------------------------------------------------------
584 static class RBBISetTableEl {
585     String           key;
586     RBBINode       val;
587 }
588 
589 
590 //----------------------------------------------------------------------------------------
591 //
592 //   findSetFor    given a String,
593 //                  - find the corresponding Unicode Set  (uset node)
594 //                         (create one if necessary)
595 //                  - Set fLeftChild of the caller's node (should be a setRef node)
596 //                         to the uset node
597 //                 Maintain a hash table of uset nodes, so the same one is always used
598 //                    for the same string.
599 //                 If a "to adopt" set is provided and we haven't seen this key before,
600 //                    add the provided set to the hash table.
601 //                 If the string is one (32 bit) char in length, the set contains
602 //                    just one element which is the char in question.
603 //                 If the string is "any", return a set containing all chars.
604 //
605 //----------------------------------------------------------------------------------------
606 void findSetFor(String   s, RBBINode node, UnicodeSet setToAdopt) {
607 
608     RBBISetTableEl   el;
609 
610     // First check whether we've already cached a set for this string.
611     // If so, just use the cached set in the new node.
612     //   delete any set provided by the caller, since we own it.
613     el = (RBBISetTableEl)fSetTable.get(s);
614     if (el != null) {
615         node.fLeftChild = el.val;
616         Assert.assrt(node.fLeftChild.fType == RBBINode.uset);
617         return;
618     }
619 
620     // Haven't seen this set before.
621     // If the caller didn't provide us with a prebuilt set,
622     //   create a new UnicodeSet now.
623     if (setToAdopt == null) {
624         if (s.equals(kAny)) {
625             setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
626         } else {
627             int c;
628             c = UTF16.charAt(s, 0);
629             setToAdopt = new UnicodeSet(c, c);
630         }
631     }
632 
633     //
634     // Make a new uset node to refer to this UnicodeSet
635     // This new uset node becomes the child of the caller's setReference node.
636     //
637     RBBINode usetNode    = new RBBINode(RBBINode.uset);
638     usetNode.fInputSet   = setToAdopt;
639     usetNode.fParent     = node;
640     node.fLeftChild      = usetNode;
641     usetNode.fText = s;
642 
643 
644     //
645     // Add the new uset node to the list of all uset nodes.
646     //
647     fRB.fUSetNodes.add(usetNode);
648 
649 
650     //
651     // Add the new set to the set hash table.
652     //
653     el     = new RBBISetTableEl();
654     el.key = s;
655     el.val = usetNode;
656     fSetTable.put(el.key, el);
657 
658     return;
659 }
660 
661 
662 
663 //
664 //  Assorted Unicode character constants.
665 //     Numeric because there is no portable way to enter them as literals.
666 //     (Think EBCDIC).
667 //
668 static final int       chNEL       = 0x85;      //    NEL newline variant
669 static final int       chLS        = 0x2028;    //    Unicode Line Separator
670 
671 
672 //----------------------------------------------------------------------------------------
673 //
674 //  stripRules    Return a rules string without unnecessary
675 //                characters.
676 //
677 //----------------------------------------------------------------------------------------
678 static String   stripRules(String   rules) {
679     StringBuffer   strippedRules = new StringBuffer  ();
680     int rulesLength = rules.length();
681     for (int idx = 0; idx < rulesLength; ) {
682         char ch = rules.charAt(idx++);
683         if (ch == '#') {
684             while (idx < rulesLength
685                 && ch != '\r' && ch != '\n' && ch != chNEL)
686             {
687                 ch = rules.charAt(idx++);
688             }
689         }
690         if (!UCharacter.isISOControl(ch)) {            
691             strippedRules.append(ch);
692         }
693     }
694     return strippedRules.toString();
695 }
696 
697 
698 //----------------------------------------------------------------------------------------
699 //
700 //  nextCharLL    Low Level Next Char from rule input source.
701 //                Get a char from the input character iterator,
702 //                keep track of input position for error reporting.
703 //
704 //----------------------------------------------------------------------------------------
705 int  nextCharLL()  {
706     int  ch;
707 
708     if (fNextIndex >= fRB.fRules.length()) {
709         return -1;
710     }
711     ch         = UTF16.charAt(fRB.fRules, fNextIndex);
712     fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex, 1);
713 
714     if (ch == '\r' ||
715         ch == chNEL ||
716         ch == chLS   ||
717         ch == '\n' && fLastChar != '\r') {
718         // Character is starting a new line.  Bump up the line number, and
719         //  reset the column to 0.
720         fLineNum++;
721         fCharNum=0;
722         if (fQuoteMode) {
723             error(RBBIRuleBuilder.U_BRK_NEW_LINE_IN_QUOTED_STRING);
724             fQuoteMode = false;
725         }
726     }
727     else {
728         // Character is not starting a new line.  Except in the case of a
729         //   LF following a CR, increment the column position.
730         if (ch != '\n') {
731             fCharNum++;
732         }
733     }
734     fLastChar = ch;
735     return ch;
736 }
737 
738 
739 //---------------------------------------------------------------------------------
740 //
741 //   nextChar     for rules scanning.  At this level, we handle stripping
742 //                out comments and processing backslash character escapes.
743 //                The rest of the rules grammar is handled at the next level up.
744 //
745 //---------------------------------------------------------------------------------
746 void nextChar(RBBIRuleChar c) {
747 
748     // Unicode Character constants needed for the processing done by nextChar(),
749     //   in hex because literals wont work on EBCDIC machines.
750 
751     fScanIndex = fNextIndex;
752     c.fChar    = nextCharLL();
753     c.fEscaped = false;
754 
755     //
756     //  check for '' sequence.
757     //  These are recognized in all contexts, whether in quoted text or not.
758     //
759     if (c.fChar == '\'') {
760         if (UTF16.charAt(fRB.fRules, fNextIndex) == '\'') {
761             c.fChar    = nextCharLL();        // get nextChar officially so character counts
762             c.fEscaped = true;                //   stay correct.
763         }
764         else
765         {
766             // Single quote, by itself.
767             //   Toggle quoting mode.
768             //   Return either '('  or ')', because quotes cause a grouping of the quoted text.
769             fQuoteMode = !fQuoteMode;
770             if (fQuoteMode == true) {
771                 c.fChar = '(';
772             } else {
773                 c.fChar = ')';
774             }
775             c.fEscaped = false;      // The paren that we return is not escaped.
776             return;
777         }
778     }
779 
780     if (fQuoteMode) {
781         c.fEscaped = true;
782     }
783     else
784     {
785         // We are not in a 'quoted region' of the source.
786         //
787         if (c.fChar == '#') {
788             // Start of a comment.  Consume the rest of it.
789             //  The new-line char that terminates the comment is always returned.
790             //  It will be treated as white-space, and serves to break up anything
791             //    that might otherwise incorrectly clump together with a comment in
792             //    the middle (a variable name, for example.)
793             for (;;) {
794                 c.fChar = nextCharLL();
795                 if (c.fChar == (int)-1 ||  // EOF
796                     c.fChar == '\r'     ||
797                     c.fChar == '\n'     ||
798                     c.fChar == chNEL    ||
799                     c.fChar == chLS)       {break;}
800             }
801         }
802         if (c.fChar == (int)-1) {
803             return;
804         }
805 
806         //
807         //  check for backslash escaped characters.
808         //  Use String.unescapeAt() to handle them.
809         //
810         if (c.fChar == '\\') {
811             c.fEscaped = true;
812             int [] unescapeIndex = new int[1];
813             unescapeIndex[0] = fNextIndex;
814             c.fChar = Utility.unescapeAt(fRB.fRules, unescapeIndex);
815             if (unescapeIndex[0] == fNextIndex) {
816                 error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED);
817             }
818             
819             fCharNum += unescapeIndex[0]-fNextIndex;
820             fNextIndex = unescapeIndex[0];
821         }
822     }
823     // putc(c.fChar, stdout);
824 }
825 
826 //---------------------------------------------------------------------------------
827 //
828 //  Parse RBBI rules.   The state machine for rules parsing is here.
829 //                      The state tables are hand-written in the file rbbirpt.txt,
830 //                      and converted to the form used here by a perl
831 //                      script rbbicst.pl
832 //
833 //---------------------------------------------------------------------------------
834 void parse()  {
835     int      state;
836     RBBIRuleParseTable.RBBIRuleTableElement    tableEl;
837 
838 
839     state = 1;
840     nextChar(fC);
841     //
842     // Main loop for the rule parsing state machine.
843     //   Runs once per state transition.
844     //   Each time through optionally performs, depending on the state table,
845     //      - an advance to the the next input char
846     //      - an action to be performed.
847     //      - pushing or popping a state to/from the local state return stack.
848     //
849     for (;;) {
850         // Quit if state == 0.  This is the normal way to exit the state machine.
851         //
852         if (state == 0) {
853             break;
854         }
855 
856         // Find the state table element that matches the input char from the rule, or the
857         //    class of the input character.  Start with the first table row for this
858         //    state, then linearly scan forward until we find a row that matches the
859         //    character.  The last row for each state always matches all characters, so
860         //    the search will stop there, if not before.
861         //
862         tableEl = RBBIRuleParseTable.gRuleParseStateTable[state];
863             if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("scan")>=0) {
864                 System.out.println("char, line, col = (\'" + (char)fC.fChar + "\', " + fLineNum + ", " + fCharNum +
865                         "    state = " + tableEl.fStateName);
866             }
867 
868         for (int tableRow=state; ;tableRow++) {   // loop over the state table rows associated with this state.
869             tableEl = RBBIRuleParseTable.gRuleParseStateTable[tableRow];
870             if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("scan")>=0) { System.out.print(".");}
871             if (tableEl.fCharClass < 127 && fC.fEscaped == false &&   tableEl.fCharClass == fC.fChar) {
872                 // Table row specified an individual character, not a set, and
873                 //   the input character is not escaped, and
874                 //   the input character matched it.
875                 break;
876             }
877             if (tableEl.fCharClass == 255) {
878                 // Table row specified default, match anything character class.
879                 break;
880             }
881             if (tableEl.fCharClass == 254 && fC.fEscaped)  {
882                 // Table row specified "escaped" and the char was escaped.
883                 break;
884             }
885             if (tableEl.fCharClass == 253 && fC.fEscaped &&
886                 (fC.fChar == 0x50 || fC.fChar == 0x70 ))  {
887                 // Table row specified "escaped P" and the char is either 'p' or 'P'.
888                 break;
889             }
890             if (tableEl.fCharClass == 252 && fC.fChar == (int)-1)  {
891                 // Table row specified eof and we hit eof on the input.
892                 break;
893             }
894 
895             if (tableEl.fCharClass >= 128 && tableEl.fCharClass < 240 &&     // Table specs a char class &&
896                 fC.fEscaped == false &&                                      //   char is not escaped &&
897                 fC.fChar != (int)-1) {                                       //   char is not EOF
898                 UnicodeSet uniset = fRuleSets[tableEl.fCharClass-128];
899                 if (uniset.contains(fC.fChar)) {
900                     // Table row specified a character class, or set of characters,
901                     //   and the current char matches it.
902                     break;
903                 }
904             }
905        }
906         
907         if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("scan")>=0) { System.out.println("");}
908         //
909         // We've found the row of the state table that matches the current input
910         //   character from the rules string.
911         // Perform any action specified  by this row in the state table.
912         if (doParseActions(tableEl.fAction) == false) {
913             // Break out of the state machine loop if the
914             //   the action signalled some kind of error, or
915             //   the action was to exit, occurs on normal end-of-rules-input.
916             break;
917         }
918 
919         if (tableEl.fPushState != 0) {
920             fStackPtr++;
921             if (fStackPtr >= kStackSize) {
922                 System.out.println("RBBIRuleScanner.parse() - state stack overflow.");
923                 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
924             }
925             fStack[fStackPtr] = tableEl.fPushState;
926         }
927 
928         if (tableEl.fNextChar) {
929             nextChar(fC);
930         }
931 
932         // Get the next state from the table entry, or from the
933         //   state stack if the next state was specified as "pop".
934         if (tableEl.fNextState != 255) {
935             state = tableEl.fNextState;
936         } else {
937             state = fStack[fStackPtr];
938             fStackPtr--;
939             if (fStackPtr < 0) {
940                 System.out.println("RBBIRuleScanner.parse() - state stack underflow.");
941                 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
942             }
943         }
944 
945     }
946 
947     //
948     // If there were NO user specified reverse rules, set up the equivalent of ".*;"
949     //
950     if (fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] == null) {
951         fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree]  = pushNewNode(RBBINode.opStar);
952         RBBINode  operand = pushNewNode(RBBINode.setRef);
953         findSetFor(kAny, operand, null);
954         fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].fLeftChild = operand;
955         operand.fParent  = fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree];
956         fNodeStackPtr -= 2;
957     }
958 
959 
960     //
961     // Parsing of the input RBBI rules is complete.
962     // We now have a parse tree for the rule expressions
963     // and a list of all UnicodeSets that are referenced.
964     //
965     if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("symbols")>=0) {
966         fSymbolTable.rbbiSymtablePrint();
967         }
968     if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("ptree")>=0)
969     {
970         System.out.println("Completed Forward Rules Parse Tree...");
971         fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree].printTree(true);
972         System.out.println("\nCompleted Reverse Rules Parse Tree...");
973         fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].printTree(true);
974         System.out.println("\nCompleted Safe Point Forward Rules Parse Tree...");
975         if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree] == null) {
976             System.out.println("  -- null -- ");
977         } else {
978             fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree].printTree(true);
979         }
980         System.out.println("\nCompleted Safe Point Reverse Rules Parse Tree...");
981         if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree] == null) {
982             System.out.println("  -- null -- ");
983         } else {
984             fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree].printTree(true);
985         }
986     }
987 }
988 
989 
990 //---------------------------------------------------------------------------------
991 //
992 //  printNodeStack     for debugging...
993 //
994 //---------------------------------------------------------------------------------
995 void printNodeStack(String   title) {
996     int i;
997     System.out.println(title + ".  Dumping node stack...\n");
998     for (i=fNodeStackPtr; i>0; i--) {fNodeStack[i].printTree(true);}
999 }
1000
1001
1002
1003
1004//---------------------------------------------------------------------------------
1005//
1006//  pushNewNode   create a new RBBINode of the specified type and push it
1007//                onto the stack of nodes.
1008//
1009//---------------------------------------------------------------------------------
1010RBBINode  pushNewNode(int  nodeType)  {
1011    fNodeStackPtr++;
1012    if (fNodeStackPtr >= kStackSize) {
1013        System.out.println("RBBIRuleScanner.pushNewNode - stack overflow.");
1014        error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
1015    }
1016    fNodeStack[fNodeStackPtr] = new RBBINode(nodeType);
1017    return fNodeStack[fNodeStackPtr];
1018}
1019
1020
1021
1022//---------------------------------------------------------------------------------
1023//
1024//  scanSet    Construct a UnicodeSet from the text at the current scan
1025//             position.  Advance the scan position to the first character
1026//             after the set.
1027//
1028//             A new RBBI setref node referring to the set is pushed onto the node
1029//             stack.
1030//
1031//             The scan position is normally under the control of the state machine
1032//             that controls rule parsing.  UnicodeSets, however, are parsed by
1033//             the UnicodeSet constructor, not by the RBBI rule parser.
1034//
1035//---------------------------------------------------------------------------------
1036void scanSet()  {
1037    UnicodeSet     uset = null;
1038    int            startPos;
1039    ParsePosition    pos = new ParsePosition  (fScanIndex);
1040    int            i;
1041
1042    startPos = fScanIndex;
1043    try {
1044        uset = new UnicodeSet(fRB.fRules, pos, fSymbolTable, UnicodeSet.IGNORE_SPACE);
1045    } catch (Exception   e) {         // TODO:  catch fewer exception types.
1046        // Repackage UnicodeSet errors as RBBI rule builder errors, with location info.
1047        error(RBBIRuleBuilder.U_BRK_MALFORMED_SET);
1048    }
1049
1050    // Verify that the set contains at least one code point.
1051    //
1052    if (uset.isEmpty()) {
1053        // This set is empty.
1054        //  Make it an error, because it almost certainly is not what the user wanted.
1055        //  Also, avoids having to think about corner cases in the tree manipulation code
1056        //   that occurs later on.
1057        //  TODO:  this shouldn't be an error; it does happen.
1058        error(RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET);
1059    }
1060
1061
1062    // Advance the RBBI parse postion over the UnicodeSet pattern.
1063    //   Don't just set fScanIndex because the line/char positions maintained
1064    //   for error reporting would be thrown off.
1065    i = pos.getIndex();
1066    for (;;) {
1067        if (fNextIndex >= i) {
1068            break;
1069        }
1070        nextCharLL();
1071    }
1072
1073        RBBINode         n;
1074
1075        n = pushNewNode(RBBINode.setRef);
1076        n.fFirstPos = startPos;
1077        n.fLastPos  = fNextIndex;
1078        n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
1079        //  findSetFor() serves several purposes here:
1080        //     - Adopts storage for the UnicodeSet, will be responsible for deleting.
1081        //     - Mantains collection of all sets in use, needed later for establishing
1082        //          character categories for run time engine.
1083        //     - Eliminates mulitiple instances of the same set.
1084        //     - Creates a new uset node if necessary (if this isn't a duplicate.)
1085        findSetFor(n.fText, n, uset);
1086    }
1087
1088}
1089
1090
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags