KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > sun > org > apache > xerces > internal > impl > xpath > regex > Token


1 /*
2  * The Apache Software License, Version 1.1
3  *
4  *
5  * Copyright (c) 1999-2002 The Apache Software Foundation. All rights
6  * reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  * notice, this list of conditions and the following disclaimer.
14  *
15  * 2. Redistributions in binary form must reproduce the above copyright
16  * notice, this list of conditions and the following disclaimer in
17  * the documentation and/or other materials provided with the
18  * distribution.
19  *
20  * 3. The end-user documentation included with the redistribution,
21  * if any, must include the following acknowledgment:
22  * "This product includes software developed by the
23  * Apache Software Foundation (http://www.apache.org/)."
24  * Alternately, this acknowledgment may appear in the software itself,
25  * if and wherever such third-party acknowledgments normally appear.
26  *
27  * 4. The names "Xerces" and "Apache Software Foundation" must
28  * not be used to endorse or promote products derived from this
29  * software without prior written permission. For written
30  * permission, please contact apache@apache.org.
31  *
32  * 5. Products derived from this software may not be called "Apache",
33  * nor may "Apache" appear in their name, without prior written
34  * permission of the Apache Software Foundation.
35  *
36  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
37  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
39  * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
40  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
42  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
43  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
44  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
45  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
46  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
47  * SUCH DAMAGE.
48  * ====================================================================
49  *
50  * This software consists of voluntary contributions made by many
51  * individuals on behalf of the Apache Software Foundation and was
52  * originally based on software copyright (c) 1999, International
53  * Business Machines, Inc., http://www.apache.org. For more
54  * information on the Apache Software Foundation, please see
55  * <http://www.apache.org/>.
56  */

57
58 package com.sun.org.apache.xerces.internal.impl.xpath.regex;
59
60 import java.util.Vector JavaDoc;
61 import java.util.Hashtable JavaDoc;
62
/**
 * This class represents a node in a parse tree.
 *
 * @version $Id: Token.java,v 1.7 2003/02/25 14:43:13 sandygao Exp $
 */

68 class Token implements java.io.Serializable JavaDoc {
69     static final boolean COUNTTOKENS = true;
70     static int tokens = 0;
71
72     static final int CHAR = 0; // Literal char
73
static final int DOT = 11; // .
74
static final int CONCAT = 1; // XY
75
static final int UNION = 2; // X|Y|Z
76
static final int CLOSURE = 3; // X*
77
static final int RANGE = 4; // [a-zA-Z] etc.
78
static final int NRANGE = 5; // [^a-zA-Z] etc.
79
static final int PAREN = 6; // (X) or (?:X)
80
static final int EMPTY = 7; //
81
static final int ANCHOR = 8; // ^ $ \b \B \< \> \A \Z \z
82
static final int NONGREEDYCLOSURE = 9; // *? +?
83
static final int STRING = 10; // strings
84
static final int BACKREFERENCE = 12; // back references
85
static final int LOOKAHEAD = 20; // (?=...)
86
static final int NEGATIVELOOKAHEAD = 21; // (?!...)
87
static final int LOOKBEHIND = 22; // (?<=...)
88
static final int NEGATIVELOOKBEHIND = 23; // (?<!...)
89
static final int INDEPENDENT = 24; // (?>...)
90
static final int MODIFIERGROUP = 25; // (?ims-ims:...)
91
static final int CONDITION = 26; // (?(...)yes|no)
92

93     static final int UTF16_MAX = 0x10ffff;
94
95     int type;
96
97     static Token token_dot;
98     static Token token_0to9;
99     static Token token_wordchars;
100     static Token token_not_0to9;
101     static Token token_not_wordchars;
102     static Token token_spaces;
103     static Token token_not_spaces;
104     static Token token_empty;
105     static Token token_linebeginning;
106     static Token token_linebeginning2;
107     static Token token_lineend;
108     static Token token_stringbeginning;
109     static Token token_stringend;
110     static Token token_stringend2;
111     static Token token_wordedge;
112     static Token token_not_wordedge;
113     static Token token_wordbeginning;
114     static Token token_wordend;
115     static {
116         Token.token_empty = new Token(Token.EMPTY);
117
118         Token.token_linebeginning = Token.createAnchor('^');
119         Token.token_linebeginning2 = Token.createAnchor('@');
120         Token.token_lineend = Token.createAnchor('$');
121         Token.token_stringbeginning = Token.createAnchor('A');
122         Token.token_stringend = Token.createAnchor('z');
123         Token.token_stringend2 = Token.createAnchor('Z');
124         Token.token_wordedge = Token.createAnchor('b');
125         Token.token_not_wordedge = Token.createAnchor('B');
126         Token.token_wordbeginning = Token.createAnchor('<');
127         Token.token_wordend = Token.createAnchor('>');
128
129         Token.token_dot = new Token(Token.DOT);
130
131         Token.token_0to9 = Token.createRange();
132         Token.token_0to9.addRange('0', '9');
133         Token.token_wordchars = Token.createRange();
134         Token.token_wordchars.addRange('0', '9');
135         Token.token_wordchars.addRange('A', 'Z');
136         Token.token_wordchars.addRange('_', '_');
137         Token.token_wordchars.addRange('a', 'z');
138         Token.token_spaces = Token.createRange();
139         Token.token_spaces.addRange('\t', '\t');
140         Token.token_spaces.addRange('\n', '\n');
141         Token.token_spaces.addRange('\f', '\f');
142         Token.token_spaces.addRange('\r', '\r');
143         Token.token_spaces.addRange(' ', ' ');
144
145         Token.token_not_0to9 = Token.complementRanges(Token.token_0to9);
146         Token.token_not_wordchars = Token.complementRanges(Token.token_wordchars);
147         Token.token_not_spaces = Token.complementRanges(Token.token_spaces);
148     }
149
150     static Token.ParenToken createLook(int type, Token child) {
151         if (COUNTTOKENS) Token.tokens ++;
152         return new Token.ParenToken(type, child, 0);
153     }
154     static Token.ParenToken createParen(Token child, int pnumber) {
155         if (COUNTTOKENS) Token.tokens ++;
156         return new Token.ParenToken(Token.PAREN, child, pnumber);
157     }
158     static Token.ClosureToken createClosure(Token tok) {
159         if (COUNTTOKENS) Token.tokens ++;
160         return new Token.ClosureToken(Token.CLOSURE, tok);
161     }
162     static Token.ClosureToken createNGClosure(Token tok) {
163         if (COUNTTOKENS) Token.tokens ++;
164         return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok);
165     }
166     static Token.ConcatToken createConcat(Token tok1, Token tok2) {
167         if (COUNTTOKENS) Token.tokens ++;
168         return new Token.ConcatToken(tok1, tok2);
169     }
170     static Token.UnionToken createConcat() {
171         if (COUNTTOKENS) Token.tokens ++;
172         return new Token.UnionToken(Token.CONCAT); // *** It is not a bug.
173
}
174     static Token.UnionToken createUnion() {
175         if (COUNTTOKENS) Token.tokens ++;
176         return new Token.UnionToken(Token.UNION);
177     }
178     static Token createEmpty() {
179         return Token.token_empty;
180     }
181     static RangeToken createRange() {
182         if (COUNTTOKENS) Token.tokens ++;
183         return new RangeToken(Token.RANGE);
184     }
185     static RangeToken createNRange() {
186         if (COUNTTOKENS) Token.tokens ++;
187         return new RangeToken(Token.NRANGE);
188     }
189     static Token.CharToken createChar(int ch) {
190         if (COUNTTOKENS) Token.tokens ++;
191         return new Token.CharToken(Token.CHAR, ch);
192     }
193     static private Token.CharToken createAnchor(int ch) {
194         if (COUNTTOKENS) Token.tokens ++;
195         return new Token.CharToken(Token.ANCHOR, ch);
196     }
197     static Token.StringToken createBackReference(int refno) {
198         if (COUNTTOKENS) Token.tokens ++;
199         return new Token.StringToken(Token.BACKREFERENCE, null, refno);
200     }
201     static Token.StringToken createString(String JavaDoc str) {
202         if (COUNTTOKENS) Token.tokens ++;
203         return new Token.StringToken(Token.STRING, str, 0);
204     }
205     static Token.ModifierToken createModifierGroup(Token child, int add, int mask) {
206         if (COUNTTOKENS) Token.tokens ++;
207         return new Token.ModifierToken(child, add, mask);
208     }
209     static Token.ConditionToken createCondition(int refno, Token condition,
210                                                 Token yespat, Token nopat) {
211         if (COUNTTOKENS) Token.tokens ++;
212         return new Token.ConditionToken(refno, condition, yespat, nopat);
213     }
214
215     protected Token(int type) {
216         this.type = type;
217     }
218
219     /**
220      * A number of children.
221      */

222     int size() {
223         return 0;
224     }
225     Token getChild(int index) {
226         return null;
227     }
228     void addChild(Token tok) {
229         throw new RuntimeException JavaDoc("Not supported.");
230     }
231
232                                                 // for RANGE or NRANGE
233
protected void addRange(int start, int end) {
234         throw new RuntimeException JavaDoc("Not supported.");
235     }
236     protected void sortRanges() {
237         throw new RuntimeException JavaDoc("Not supported.");
238     }
239     protected void compactRanges() {
240         throw new RuntimeException JavaDoc("Not supported.");
241     }
242     protected void mergeRanges(Token tok) {
243         throw new RuntimeException JavaDoc("Not supported.");
244     }
245     protected void subtractRanges(Token tok) {
246         throw new RuntimeException JavaDoc("Not supported.");
247     }
248     protected void intersectRanges(Token tok) {
249         throw new RuntimeException JavaDoc("Not supported.");
250     }
251     static Token complementRanges(Token tok) {
252         return RangeToken.complementRanges(tok);
253     }
254
255
256     void setMin(int min) { // for CLOSURE
257
}
258     void setMax(int max) { // for CLOSURE
259
}
260     int getMin() { // for CLOSURE
261
return -1;
262     }
263     int getMax() { // for CLOSURE
264
return -1;
265     }
266     int getReferenceNumber() { // for STRING
267
return 0;
268     }
269     String JavaDoc getString() { // for STRING
270
return null;
271     }
272
273     int getParenNumber() {
274         return 0;
275     }
276     int getChar() {
277         return -1;
278     }
279
280     public String JavaDoc toString() {
281         return this.toString(0);
282     }
283     public String JavaDoc toString(int options) {
284         return this.type == Token.DOT ? "." : "";
285     }
286
287     /**
288      * How many characters are needed?
289      */

290     final int getMinLength() {
291         switch (this.type) {
292           case CONCAT:
293             int sum = 0;
294             for (int i = 0; i < this.size(); i ++)
295                 sum += this.getChild(i).getMinLength();
296             return sum;
297
298           case CONDITION:
299           case UNION:
300             if (this.size() == 0)
301                 return 0;
302             int ret = this.getChild(0).getMinLength();
303             for (int i = 1; i < this.size(); i ++) {
304                 int min = this.getChild(i).getMinLength();
305                 if (min < ret) ret = min;
306             }
307             return ret;
308
309           case CLOSURE:
310           case NONGREEDYCLOSURE:
311             if (this.getMin() >= 0)
312                 return this.getMin() * this.getChild(0).getMinLength();
313             return 0;
314
315           case EMPTY:
316           case ANCHOR:
317             return 0;
318
319           case DOT:
320           case CHAR:
321           case RANGE:
322           case NRANGE:
323             return 1;
324
325           case INDEPENDENT:
326           case PAREN:
327           case MODIFIERGROUP:
328             return this.getChild(0).getMinLength();
329
330           case BACKREFERENCE:
331             return 0; // *******
332

333           case STRING:
334             return this.getString().length();
335
336           case LOOKAHEAD:
337           case NEGATIVELOOKAHEAD:
338           case LOOKBEHIND:
339           case NEGATIVELOOKBEHIND:
340             return 0; // ***** Really?
341

342           default:
343             throw new RuntimeException JavaDoc("Token#getMinLength(): Invalid Type: "+this.type);
344         }
345     }
346
347     final int getMaxLength() {
348         switch (this.type) {
349           case CONCAT:
350             int sum = 0;
351             for (int i = 0; i < this.size(); i ++) {
352                 int d = this.getChild(i).getMaxLength();
353                 if (d < 0) return -1;
354                 sum += d;
355             }
356             return sum;
357
358           case CONDITION:
359           case UNION:
360             if (this.size() == 0)
361                 return 0;
362             int ret = this.getChild(0).getMaxLength();
363             for (int i = 1; ret >= 0 && i < this.size(); i ++) {
364                 int max = this.getChild(i).getMaxLength();
365                 if (max < 0) { // infinity
366
ret = -1;
367                     break;
368                 }
369                 if (max > ret) ret = max;
370             }
371             return ret;
372
373           case CLOSURE:
374           case NONGREEDYCLOSURE:
375             if (this.getMax() >= 0)
376                                                 // When this.child.getMaxLength() < 0,
377
// this returns minus value
378
return this.getMax() * this.getChild(0).getMaxLength();
379             return -1;
380
381           case EMPTY:
382           case ANCHOR:
383             return 0;
384
385           case CHAR:
386             return 1;
387           case DOT:
388           case RANGE:
389           case NRANGE:
390             return 2;
391
392           case INDEPENDENT:
393           case PAREN:
394           case MODIFIERGROUP:
395             return this.getChild(0).getMaxLength();
396
397           case BACKREFERENCE:
398             return -1; // ******
399

400           case STRING:
401             return this.getString().length();
402
403           case LOOKAHEAD:
404           case NEGATIVELOOKAHEAD:
405           case LOOKBEHIND:
406           case NEGATIVELOOKBEHIND:
407             return 0; // ***** Really?
408

409           default:
410             throw new RuntimeException JavaDoc("Token#getMaxLength(): Invalid Type: "+this.type);
411         }
412     }
413
414     static final int FC_CONTINUE = 0;
415     static final int FC_TERMINAL = 1;
416     static final int FC_ANY = 2;
417     private static final boolean isSet(int options, int flag) {
418         return (options & flag) == flag;
419     }
420     final int analyzeFirstCharacter(RangeToken result, int options) {
421         switch (this.type) {
422           case CONCAT:
423             int ret = FC_CONTINUE;
424             for (int i = 0; i < this.size(); i ++)
425                 if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE)
426                     break;
427             return ret;
428
429           case UNION:
430             if (this.size() == 0)
431                 return FC_CONTINUE;
432             /*
433              * a|b|c -> FC_TERMINAL
434              * a|.|c -> FC_ANY
435              * a|b| -> FC_CONTINUE
436              */

437             int ret2 = FC_CONTINUE;
438             boolean hasEmpty = false;
439             for (int i = 0; i < this.size(); i ++) {
440                 ret2 = this.getChild(i).analyzeFirstCharacter(result, options);
441                 if (ret2 == FC_ANY)
442                     break;
443                 else if (ret2 == FC_CONTINUE)
444                     hasEmpty = true;
445             }
446             return hasEmpty ? FC_CONTINUE : ret2;
447
448           case CONDITION:
449             int ret3 = this.getChild(0).analyzeFirstCharacter(result, options);
450             if (this.size() == 1) return FC_CONTINUE;
451             if (ret3 == FC_ANY) return ret3;
452             int ret4 = this.getChild(1).analyzeFirstCharacter(result, options);
453             if (ret4 == FC_ANY) return ret4;
454             return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL;
455
456           case CLOSURE:
457           case NONGREEDYCLOSURE:
458             this.getChild(0).analyzeFirstCharacter(result, options);
459             return FC_CONTINUE;
460
461           case EMPTY:
462           case ANCHOR:
463             return FC_CONTINUE;
464
465           case CHAR:
466             int ch = this.getChar();
467             result.addRange(ch, ch);
468             if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
469                 ch = Character.toUpperCase((char)ch);
470                 result.addRange(ch, ch);
471                 ch = Character.toLowerCase((char)ch);
472                 result.addRange(ch, ch);
473             }
474             return FC_TERMINAL;
475
476           case DOT: // ****
477
if (isSet(options, RegularExpression.SINGLE_LINE)) {
478                 return FC_CONTINUE; // **** We can not optimize.
479
} else {
480                 return FC_CONTINUE;
481                 /*
482                 result.addRange(0, RegularExpression.LINE_FEED-1);
483                 result.addRange(RegularExpression.LINE_FEED+1, RegularExpression.CARRIAGE_RETURN-1);
484                 result.addRange(RegularExpression.CARRIAGE_RETURN+1,
485                                 RegularExpression.LINE_SEPARATOR-1);
486                 result.addRange(RegularExpression.PARAGRAPH_SEPARATOR+1, UTF16_MAX);
487                 return 1;
488                 */

489             }
490
491           case RANGE:
492             if (isSet(options, RegularExpression.IGNORE_CASE)) {
493                 result.mergeRanges(((RangeToken)this).getCaseInsensitiveToken());
494             } else {
495                 result.mergeRanges(this);
496             }
497             return FC_TERMINAL;
498
499           case NRANGE: // ****
500
if (isSet(options, RegularExpression.IGNORE_CASE)) {
501                 result.mergeRanges(Token.complementRanges(((RangeToken)this).getCaseInsensitiveToken()));
502             } else {
503                 result.mergeRanges(Token.complementRanges(this));
504             }
505             return FC_TERMINAL;
506
507           case INDEPENDENT:
508           case PAREN:
509             return this.getChild(0).analyzeFirstCharacter(result, options);
510
511           case MODIFIERGROUP:
512             options |= ((ModifierToken)this).getOptions();
513             options &= ~((ModifierToken)this).getOptionsMask();
514             return this.getChild(0).analyzeFirstCharacter(result, options);
515
516           case BACKREFERENCE:
517             result.addRange(0, UTF16_MAX); // **** We can not optimize.
518
return FC_ANY;
519
520           case STRING:
521             int cha = this.getString().charAt(0);
522             int ch2;
523             if (REUtil.isHighSurrogate(cha)
524                 && this.getString().length() >= 2
525                 && REUtil.isLowSurrogate((ch2 = this.getString().charAt(1))))
526                 cha = REUtil.composeFromSurrogates(cha, ch2);
527             result.addRange(cha, cha);
528             if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
529                 cha = Character.toUpperCase((char)cha);
530                 result.addRange(cha, cha);
531                 cha = Character.toLowerCase((char)cha);
532                 result.addRange(cha, cha);
533             }
534             return FC_TERMINAL;
535
536           case LOOKAHEAD:
537           case NEGATIVELOOKAHEAD:
538           case LOOKBEHIND:
539           case NEGATIVELOOKBEHIND:
540             return FC_CONTINUE;
541
542           default:
543             throw new RuntimeException JavaDoc("Token#analyzeHeadCharacter(): Invalid Type: "+this.type);
544         }
545     }
546
547     private final boolean isShorterThan(Token tok) {
548         if (tok == null) return false;
549         /*
550         int mylength;
551         if (this.type == STRING) mylength = this.getString().length();
552         else if (this.type == CHAR) mylength = this.getChar() >= 0x10000 ? 2 : 1;
553         else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
554         int otherlength;
555         if (tok.type == STRING) otherlength = tok.getString().length();
556         else if (tok.type == CHAR) otherlength = tok.getChar() >= 0x10000 ? 2 : 1;
557         else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
558         */

559         int mylength;
560         if (this.type == STRING) mylength = this.getString().length();
561         else throw new RuntimeException JavaDoc("Internal Error: Illegal type: "+this.type);
562         int otherlength;
563         if (tok.type == STRING) otherlength = tok.getString().length();
564         else throw new RuntimeException JavaDoc("Internal Error: Illegal type: "+tok.type);
565         return mylength < otherlength;
566     }
567
568     static class FixedStringContainer {
569         Token token = null;
570         int options = 0;
571         FixedStringContainer() {
572         }
573     }
574
575     final void findFixedString(FixedStringContainer container, int options) {
576         switch (this.type) {
577           case CONCAT:
578             Token prevToken = null;
579             int prevOptions = 0;
580             for (int i = 0; i < this.size(); i ++) {
581                 this.getChild(i).findFixedString(container, options);
582                 if (prevToken == null || prevToken.isShorterThan(container.token)) {
583                     prevToken = container.token;
584                     prevOptions = container.options;
585                 }
586             }
587             container.token = prevToken;
588             container.options = prevOptions;
589             return;
590
591           case UNION:
592           case CLOSURE:
593           case NONGREEDYCLOSURE:
594           case EMPTY:
595           case ANCHOR:
596           case RANGE:
597           case DOT:
598           case NRANGE:
599           case BACKREFERENCE:
600           case LOOKAHEAD:
601           case NEGATIVELOOKAHEAD:
602           case LOOKBEHIND:
603           case NEGATIVELOOKBEHIND:
604           case CONDITION:
605             container.token = null;
606             return;
607
608           case CHAR: // Ignore CHAR tokens.
609
container.token = null; // **
610
return; // **
611

612           case STRING:
613             container.token = this;
614             container.options = options;
615             return;
616
617           case INDEPENDENT:
618           case PAREN:
619             this.getChild(0).findFixedString(container, options);
620             return;
621
622           case MODIFIERGROUP:
623             options |= ((ModifierToken)this).getOptions();
624             options &= ~((ModifierToken)this).getOptionsMask();
625             this.getChild(0).findFixedString(container, options);
626             return;
627
628           default:
629             throw new RuntimeException JavaDoc("Token#findFixedString(): Invalid Type: "+this.type);
630         }
631     }
632
633     boolean match(int ch) {
634         throw new RuntimeException JavaDoc("NFAArrow#match(): Internal error: "+this.type);
635     }
636
637     // ------------------------------------------------------
638
private final static Hashtable JavaDoc categories = new Hashtable JavaDoc();
639     private final static Hashtable JavaDoc categories2 = new Hashtable JavaDoc();
640     private static final String JavaDoc[] categoryNames = {
641         "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd",
642         "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs",
643         "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28
644
"Pi", "Pf", // 29, 30
645
"L", "M", "N", "Z", "C", "P", "S", // 31-37
646
};
647
648     // Schema Rec. {Datatypes} - Punctuation
649
static final int CHAR_INIT_QUOTE = 29; // Pi - initial quote
650
static final int CHAR_FINAL_QUOTE = 30; // Pf - final quote
651
static final int CHAR_LETTER = 31;
652     static final int CHAR_MARK = 32;
653     static final int CHAR_NUMBER = 33;
654     static final int CHAR_SEPARATOR = 34;
655     static final int CHAR_OTHER = 35;
656     static final int CHAR_PUNCTUATION = 36;
657     static final int CHAR_SYMBOL = 37;
658     
659     //blockNames in UNICODE 3.1 that supported by XML Schema REC
660
private static final String JavaDoc[] blockNames = {
661         /*0000..007F;*/ "Basic Latin",
662         /*0080..00FF;*/ "Latin-1 Supplement",
663         /*0100..017F;*/ "Latin Extended-A",
664         /*0180..024F;*/ "Latin Extended-B",
665         /*0250..02AF;*/ "IPA Extensions",
666         /*02B0..02FF;*/ "Spacing Modifier Letters",
667         /*0300..036F;*/ "Combining Diacritical Marks",
668         /*0370..03FF;*/ "Greek",
669         /*0400..04FF;*/ "Cyrillic",
670         /*0530..058F;*/ "Armenian",
671         /*0590..05FF;*/ "Hebrew",
672         /*0600..06FF;*/ "Arabic",
673         /*0700..074F;*/ "Syriac",
674         /*0780..07BF;*/ "Thaana",
675         /*0900..097F;*/ "Devanagari",
676         /*0980..09FF;*/ "Bengali",
677         /*0A00..0A7F;*/ "Gurmukhi",
678         /*0A80..0AFF;*/ "Gujarati",
679         /*0B00..0B7F;*/ "Oriya",
680         /*0B80..0BFF;*/ "Tamil",
681         /*0C00..0C7F;*/ "Telugu",
682         /*0C80..0CFF;*/ "Kannada",
683         /*0D00..0D7F;*/ "Malayalam",
684         /*0D80..0DFF;*/ "Sinhala",
685         /*0E00..0E7F;*/ "Thai",
686         /*0E80..0EFF;*/ "Lao",
687         /*0F00..0FFF;*/ "Tibetan",
688         /*1000..109F;*/ "Myanmar",
689         /*10A0..10FF;*/ "Georgian",
690         /*1100..11FF;*/ "Hangul Jamo",
691         /*1200..137F;*/ "Ethiopic",
692         /*13A0..13FF;*/ "Cherokee",
693         /*1400..167F;*/ "Unified Canadian Aboriginal Syllabics",
694         /*1680..169F;*/ "Ogham",
695         /*16A0..16FF;*/ "Runic",
696         /*1780..17FF;*/ "Khmer",
697         /*1800..18AF;*/ "Mongolian",
698         /*1E00..1EFF;*/ "Latin Extended Additional",
699         /*1F00..1FFF;*/ "Greek Extended",
700         /*2000..206F;*/ "General Punctuation",
701         /*2070..209F;*/ "Superscripts and Subscripts",
702         /*20A0..20CF;*/ "Currency Symbols",
703         /*20D0..20FF;*/ "Combining Marks for Symbols",
704         /*2100..214F;*/ "Letterlike Symbols",
705         /*2150..218F;*/ "Number Forms",
706         /*2190..21FF;*/ "Arrows",
707         /*2200..22FF;*/ "Mathematical Operators",
708         /*2300..23FF;*/ "Miscellaneous Technical",
709         /*2400..243F;*/ "Control Pictures",
710         /*2440..245F;*/ "Optical Character Recognition",
711         /*2460..24FF;*/ "Enclosed Alphanumerics",
712         /*2500..257F;*/ "Box Drawing",
713         /*2580..259F;*/ "Block Elements",
714         /*25A0..25FF;*/ "Geometric Shapes",
715         /*2600..26FF;*/ "Miscellaneous Symbols",
716         /*2700..27BF;*/ "Dingbats",
717         /*2800..28FF;*/ "Braille Patterns",
718         /*2E80..2EFF;*/ "CJK Radicals Supplement",
719         /*2F00..2FDF;*/ "Kangxi Radicals",
720         /*2FF0..2FFF;*/ "Ideographic Description Characters",
721         /*3000..303F;*/ "CJK Symbols and Punctuation",
722         /*3040..309F;*/ "Hiragana",
723         /*30A0..30FF;*/ "Katakana",
724         /*3100..312F;*/ "Bopomofo",
725         /*3130..318F;*/ "Hangul Compatibility Jamo",
726         /*3190..319F;*/ "Kanbun",
727         /*31A0..31BF;*/ "Bopomofo Extended",
728         /*3200..32FF;*/ "Enclosed CJK Letters and Months",
729         /*3300..33FF;*/ "CJK Compatibility",
730         /*3400..4DB5;*/ "CJK Unified Ideographs Extension A",
731         /*4E00..9FFF;*/ "CJK Unified Ideographs",
732         /*A000..A48F;*/ "Yi Syllables",
733         /*A490..A4CF;*/ "Yi Radicals",
734         /*AC00..D7A3;*/ "Hangul Syllables",
735         /*E000..F8FF;*/ "Private Use",
736         /*F900..FAFF;*/ "CJK Compatibility Ideographs",
737         /*FB00..FB4F;*/ "Alphabetic Presentation Forms",
738         /*FB50..FDFF;*/ "Arabic Presentation Forms-A",
739         /*FE20..FE2F;*/ "Combining Half Marks",
740         /*FE30..FE4F;*/ "CJK Compatibility Forms",
741         /*FE50..FE6F;*/ "Small Form Variants",
742         /*FE70..FEFE;*/ "Arabic Presentation Forms-B",
743         /*FEFF..FEFF;*/ "Specials",
744         /*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms",
745          //missing Specials add manually
746
/*10300..1032F;*/ "Old Italic", // 84
747
/*10330..1034F;*/ "Gothic",
748         /*10400..1044F;*/ "Deseret",
749         /*1D000..1D0FF;*/ "Byzantine Musical Symbols",
750         /*1D100..1D1FF;*/ "Musical Symbols",
751         /*1D400..1D7FF;*/ "Mathematical Alphanumeric Symbols",
752         /*20000..2A6D6;*/ "CJK Unified Ideographs Extension B",
753         /*2F800..2FA1F;*/ "CJK Compatibility Ideographs Supplement",
754         /*E0000..E007F;*/ "Tags",
755         //missing 2 private use add manually
756

757     };
758     //ADD THOSE MANUALLY
759
//F0000..FFFFD; "Private Use",
760
//100000..10FFFD; "Private Use"
761
//FFF0..FFFD; "Specials",
762
// Range table for the first NONBMP_BLOCK_START entries of blockNames.
// Each block contributes two consecutive chars: its first and its last code
// point. getRange() reads the pair for block index i at offsets 2*i and 2*i+1.
static final String JavaDoc blockRanges =
763        "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F"
764         +"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF"
765         +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF"
766         +"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF"
767         +"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF"
768         +"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF"
769         +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF"
770         +"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F"
771         +"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF"
772         +"\uAC00\uD7A3\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF"
773         +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF";
// Range table for the blocks whose index in blockNames is NONBMP_BLOCK_START
// or greater. Entries come in first/last pairs; getRange() reads the pair for
// block index i at (i - NONBMP_BLOCK_START)*2 and the slot after it.
774     static final int[] nonBMPBlockRanges = {
775         0x10300, 0x1032F, // 84
776
0x10330, 0x1034F,
777         0x10400, 0x1044F,
778         0x1D000, 0x1D0FF,
779         0x1D100, 0x1D1FF,
780         0x1D400, 0x1D7FF,
781         0x20000, 0x2A6D6,
782         0x2F800, 0x2FA1F,
783         0xE0000, 0xE007F
784     };
// Index in blockNames of the first block whose range is taken from
// nonBMPBlockRanges rather than from the blockRanges string.
785     private static final int NONBMP_BLOCK_START = 84;
786
787     static protected RangeToken getRange(String JavaDoc name, boolean positive) {
788         if (Token.categories.size() == 0) {
789             synchronized (Token.categories) {
790                 Token[] ranges = new Token[Token.categoryNames.length];
791                 for (int i = 0; i < ranges.length; i ++) {
792                     ranges[i] = Token.createRange();
793                 }
794                 int type;
795                 for (int i = 0; i < 0x10000; i ++) {
796                     type = Character.getType((char)i);
797                     if (type == Character.START_PUNCTUATION ||
798                         type == Character.END_PUNCTUATION) {
799                         //build table of Pi values
800
if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C ||
801                             i == 0x201F || i == 0x2039) {
802                             type = CHAR_INIT_QUOTE;
803                         }
804                         //build table of Pf values
805
if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) {
806                             type = CHAR_FINAL_QUOTE;
807                         }
808                     }
809                     ranges[type].addRange(i, i);
810                     switch (type) {
811                       case Character.UPPERCASE_LETTER:
812                       case Character.LOWERCASE_LETTER:
813                       case Character.TITLECASE_LETTER:
814                       case Character.MODIFIER_LETTER:
815                       case Character.OTHER_LETTER:
816                         type = CHAR_LETTER;
817                         break;
818                       case Character.NON_SPACING_MARK:
819                       case Character.COMBINING_SPACING_MARK:
820                       case Character.ENCLOSING_MARK:
821                         type = CHAR_MARK;
822                         break;
823                       case Character.DECIMAL_DIGIT_NUMBER:
824                       case Character.LETTER_NUMBER:
825                       case Character.OTHER_NUMBER:
826                         type = CHAR_NUMBER;
827                         break;
828                       case Character.SPACE_SEPARATOR:
829                       case Character.LINE_SEPARATOR:
830                       case Character.PARAGRAPH_SEPARATOR:
831                         type = CHAR_SEPARATOR;
832                         break;
833                       case Character.CONTROL:
834                       case Character.FORMAT:
835                       case Character.SURROGATE:
836                       case Character.PRIVATE_USE:
837                       case Character.UNASSIGNED:
838                         type = CHAR_OTHER;
839                         break;
840                       case Character.CONNECTOR_PUNCTUATION:
841                       case Character.DASH_PUNCTUATION:
842                       case Character.START_PUNCTUATION:
843                       case Character.END_PUNCTUATION:
844                       case CHAR_INIT_QUOTE:
845                       case CHAR_FINAL_QUOTE:
846                       case Character.OTHER_PUNCTUATION:
847                         type = CHAR_PUNCTUATION;
848                         break;
849                       case Character.MATH_SYMBOL:
850                       case Character.CURRENCY_SYMBOL:
851                       case Character.MODIFIER_SYMBOL:
852                       case Character.OTHER_SYMBOL:
853                         type = CHAR_SYMBOL;
854                         break;
855                       default:
856                         throw new RuntimeException JavaDoc("com.sun.org.apache.xerces.internal.utils.regex.Token#getRange(): Unknown Unicode category: "+type);
857                     }
858                     ranges[type].addRange(i, i);
859                 } // for all characters
860
ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX);
861
862                 for (int i = 0; i < ranges.length; i ++) {
863                     if (Token.categoryNames[i] != null) {
864                         if (i == Character.UNASSIGNED) { // Unassigned
865
ranges[i].addRange(0x10000, Token.UTF16_MAX);
866                         }
867                         Token.categories.put(Token.categoryNames[i], ranges[i]);
868                         Token.categories2.put(Token.categoryNames[i],
869                                               Token.complementRanges(ranges[i]));
870                     }
871                 }
872                 //REVISIT: do we really need to support block names as in Unicode 3.1
873
// or we can just create all the names in IsBLOCKNAME format (XML Schema REC)?
874
//
875
StringBuffer JavaDoc buffer = new StringBuffer JavaDoc(50);
876                 for (int i = 0; i < Token.blockNames.length; i ++) {
877                     Token r1 = Token.createRange();
878                     int location;
879                     if (i < NONBMP_BLOCK_START) {
880                         location = i*2;
881                         int rstart = Token.blockRanges.charAt(location);
882                         int rend = Token.blockRanges.charAt(location+1);
883                         //DEBUGING
884
//System.out.println(n+" " +Integer.toHexString(rstart)
885
// +"-"+ Integer.toHexString(rend));
886
r1.addRange(rstart, rend);
887                     } else {
888                         location = (i - NONBMP_BLOCK_START) * 2;
889                         r1.addRange(Token.nonBMPBlockRanges[location],
890                                     Token.nonBMPBlockRanges[location + 1]);
891                     }
892                     String JavaDoc n = Token.blockNames[i];
893                     if (n.equals("Specials"))
894                         r1.addRange(0xfff0, 0xfffd);
895                     if (n.equals("Private Use")) {
896                         r1.addRange(0xF0000,0xFFFFD);
897                         r1.addRange(0x100000,0x10FFFD);
898                     }
899                     Token.categories.put(n, r1);
900                     Token.categories2.put(n, Token.complementRanges(r1));
901                     buffer.setLength(0);
902                     buffer.append("Is");
903                     if (n.indexOf(' ') >= 0) {
904                         for (int ci = 0; ci < n.length(); ci ++)
905                             if (n.charAt(ci) != ' ') buffer.append((char)n.charAt(ci));
906                     }
907                     else {
908                         buffer.append(n);
909                     }
910                     Token.setAlias(buffer.toString(), n, true);
911                 }
912
913                 // TR#18 1.2
914
Token.setAlias("ASSIGNED", "Cn", false);
915                 Token.setAlias("UNASSIGNED", "Cn", true);
916                 Token all = Token.createRange();
917                 all.addRange(0, Token.UTF16_MAX);
918                 Token.categories.put("ALL", all);
919                 Token.categories2.put("ALL", Token.complementRanges(all));
920                 Token.registerNonXS("ASSIGNED");
921                 Token.registerNonXS("UNASSIGNED");
922                 Token.registerNonXS("ALL");
923
924                 Token isalpha = Token.createRange();
925                 isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu
926
isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll
927
isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo
928
Token.categories.put("IsAlpha", isalpha);
929                 Token.categories2.put("IsAlpha", Token.complementRanges(isalpha));
930                 Token.registerNonXS("IsAlpha");
931
932                 Token isalnum = Token.createRange();
933                 isalnum.mergeRanges(isalpha); // Lu Ll Lo
934
isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd
935
Token.categories.put("IsAlnum", isalnum);
936                 Token.categories2.put("IsAlnum", Token.complementRanges(isalnum));
937                 Token.registerNonXS("IsAlnum");
938
939                 Token isspace = Token.createRange();
940                 isspace.mergeRanges(Token.token_spaces);
941                 isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z
942
Token.categories.put("IsSpace", isspace);
943                 Token.categories2.put("IsSpace", Token.complementRanges(isspace));
944                 Token.registerNonXS("IsSpace");
945
946                 Token isword = Token.createRange();
947                 isword.mergeRanges(isalnum); // Lu Ll Lo Nd
948
isword.addRange('_', '_');
949                 Token.categories.put("IsWord", isword);
950                 Token.categories2.put("IsWord", Token.complementRanges(isword));
951                 Token.registerNonXS("IsWord");
952
953                 Token isascii = Token.createRange();
954                 isascii.addRange(0, 127);
955                 Token.categories.put("IsASCII", isascii);
956                 Token.categories2.put("IsASCII", Token.complementRanges(isascii));
957                 Token.registerNonXS("IsASCII");
958
959                 Token isnotgraph = Token.createRange();
960                 isnotgraph.mergeRanges(ranges[CHAR_OTHER]);
961                 isnotgraph.addRange(' ', ' ');
962                 Token.categories.put("IsGraph", Token.complementRanges(isnotgraph));
963                 Token.categories2.put("IsGraph", isnotgraph);
964                 Token.registerNonXS("IsGraph");
965
966                 Token isxdigit = Token.createRange();
967                 isxdigit.addRange('0', '9');
968                 isxdigit.addRange('A', 'F');
969                 isxdigit.addRange('a', 'f');
970                 Token.categories.put("IsXDigit", Token.complementRanges(isxdigit));
971                 Token.categories2.put("IsXDigit", isxdigit);
972                 Token.registerNonXS("IsXDigit");
973
974                 Token.setAlias("IsDigit", "Nd", true);
975                 Token.setAlias("IsUpper", "Lu", true);
976                 Token.setAlias("IsLower", "Ll", true);
977                 Token.setAlias("IsCntrl", "C", true);
978                 Token.setAlias("IsPrint", "C", false);
979                 Token.setAlias("IsPunct", "P", true);
980                 Token.registerNonXS("IsDigit");
981                 Token.registerNonXS("IsUpper");
982                 Token.registerNonXS("IsLower");
983                 Token.registerNonXS("IsCntrl");
984                 Token.registerNonXS("IsPrint");
985                 Token.registerNonXS("IsPunct");
986
987                 Token.setAlias("alpha", "IsAlpha", true);
988                 Token.setAlias("alnum", "IsAlnum", true);
989                 Token.setAlias("ascii", "IsASCII", true);
990                 Token.setAlias("cntrl", "IsCntrl", true);
991                 Token.setAlias("digit", "IsDigit", true);
992                 Token.setAlias("graph", "IsGraph", true);
993                 Token.setAlias("lower", "IsLower", true);
994                 Token.setAlias("print", "IsPrint", true);
995                 Token.setAlias("punct", "IsPunct", true);
996                 Token.setAlias("space", "IsSpace", true);
997                 Token.setAlias("upper", "IsUpper", true);
998                 Token.setAlias("word", "IsWord", true); // Perl extension
999
Token.setAlias("xdigit", "IsXDigit", true);
1000                Token.registerNonXS("alpha");
1001                Token.registerNonXS("alnum");
1002                Token.registerNonXS("ascii");
1003                Token.registerNonXS("cntrl");
1004                Token.registerNonXS("digit");
1005                Token.registerNonXS("graph");
1006                Token.registerNonXS("lower");
1007                Token.registerNonXS("print");
1008                Token.registerNonXS("punct");
1009                Token.registerNonXS("space");
1010                Token.registerNonXS("upper");
1011                Token.registerNonXS("word");
1012                Token.registerNonXS("xdigit");
1013            } // synchronized
1014
} // if null
1015
RangeToken tok = positive ? (RangeToken)Token.categories.get(name)
1016            : (RangeToken)Token.categories2.get(name);
1017        //if (tok == null) System.out.println(name);
1018
return tok;
1019    }
1020    static protected RangeToken getRange(String JavaDoc name, boolean positive, boolean xs) {
1021        RangeToken range = Token.getRange(name, positive);
1022        if (xs && range != null && Token.isRegisterNonXS(name))
1023            range = null;
1024        return range;
1025    }
1026
1027    static Hashtable JavaDoc nonxs = null;
1028    /**
1029     * This method is called by only getRange().
1030     * So this method need not MT-safe.
1031     */

1032    static protected void registerNonXS(String JavaDoc name) {
1033        if (Token.nonxs == null)
1034            Token.nonxs = new Hashtable JavaDoc();
1035        Token.nonxs.put(name, name);
1036    }
1037    static protected boolean isRegisterNonXS(String JavaDoc name) {
1038        if (Token.nonxs == null)
1039            return false;
1040        //DEBUG
1041
//System.err.println("isRegisterNonXS: "+name);
1042
return Token.nonxs.containsKey(name);
1043    }
1044
1045    private static void setAlias(String JavaDoc newName, String JavaDoc name, boolean positive) {
1046        Token t1 = (Token)Token.categories.get(name);
1047        Token t2 = (Token)Token.categories2.get(name);
1048        if (positive) {
1049            Token.categories.put(newName, t1);
1050            Token.categories2.put(newName, t2);
1051        } else {
1052            Token.categories2.put(newName, t1);
1053            Token.categories.put(newName, t2);
1054        }
1055    }
1056
1057    // ------------------------------------------------------
1058

// Characters that getGraphemePattern() iterates over to build its "virama"
// range. The trailing comment on each literal appears to be the matching
// UnicodeData entry for that character.
1059    static final String JavaDoc viramaString =
1060    "\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1061
+"\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1062
+"\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1063
+"\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1064
+"\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1065
+"\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1066
+"\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1067
+"\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1068
+"\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1069
+"\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI VOWEL SIGN PHINTHU;;;;
1070
+"\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;;
1071

1072    static private Token token_grapheme = null;
1073    static synchronized Token getGraphemePattern() {
1074        if (Token.token_grapheme != null)
1075            return Token.token_grapheme;
1076
1077        Token base_char = Token.createRange(); // [{ASSIGNED}]-[{M},{C}]
1078
base_char.mergeRanges(Token.getRange("ASSIGNED", true));
1079        base_char.subtractRanges(Token.getRange("M", true));
1080        base_char.subtractRanges(Token.getRange("C", true));
1081
1082        Token virama = Token.createRange();
1083        for (int i = 0; i < Token.viramaString.length(); i ++) {
1084            int ch = viramaString.charAt(i);
1085            virama.addRange(i, i);
1086        }
1087
1088        Token combiner_wo_virama = Token.createRange();
1089        combiner_wo_virama.mergeRanges(Token.getRange("M", true));
1090        combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final
1091
combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras
1092

1093        Token left = Token.createUnion(); // base_char?
1094
left.addChild(base_char);
1095        left.addChild(Token.token_empty);
1096
1097        Token foo = Token.createUnion();
1098        foo.addChild(Token.createConcat(virama, Token.getRange("L", true)));
1099        foo.addChild(combiner_wo_virama);
1100
1101        foo = Token.createClosure(foo);
1102
1103        foo = Token.createConcat(left, foo);
1104
1105        Token.token_grapheme = foo;
1106        return Token.token_grapheme;
1107    }
1108
1109    /**
1110     * Combing Character Sequence in Perl 5.6.
1111     */

1112    static private Token token_ccs = null;
1113    static synchronized Token getCombiningCharacterSequence() {
1114        if (Token.token_ccs != null)
1115            return Token.token_ccs;
1116
1117        Token foo = Token.createClosure(Token.getRange("M", true)); // \pM*
1118
foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM*
1119
Token.token_ccs = foo;
1120        return Token.token_ccs;
1121    }
1122
1123    // ------------------------------------------------------
1124

1125    // ------------------------------------------------------
1126
/**
1127     * This class represents a node in parse tree.
1128     */

1129    static class StringToken extends Token implements java.io.Serializable JavaDoc {
1130        String JavaDoc string;
1131        int refNumber;
1132
1133        StringToken(int type, String JavaDoc str, int n) {
1134            super(type);
1135            this.string = str;
1136            this.refNumber = n;
1137        }
1138
1139        int getReferenceNumber() { // for STRING
1140
return this.refNumber;
1141        }
1142        String JavaDoc getString() { // for STRING
1143
return this.string;
1144        }
1145        
1146        public String JavaDoc toString(int options) {
1147            if (this.type == BACKREFERENCE)
1148                return "\\"+this.refNumber;
1149            else
1150                return REUtil.quoteMeta(this.string);
1151        }
1152    }
1153
1154    /**
1155     * This class represents a node in parse tree.
1156     */

1157    static class ConcatToken extends Token implements java.io.Serializable JavaDoc {
1158        Token child;
1159        Token child2;
1160        
1161        ConcatToken(Token t1, Token t2) {
1162            super(Token.CONCAT);
1163            this.child = t1;
1164            this.child2 = t2;
1165        }
1166
1167        int size() {
1168            return 2;
1169        }
1170        Token getChild(int index) {
1171            return index == 0 ? this.child : this.child2;
1172        }
1173
1174        public String JavaDoc toString(int options) {
1175            String JavaDoc ret;
1176            if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) {
1177                ret = this.child.toString(options)+"+";
1178            } else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) {
1179                ret = this.child.toString(options)+"+?";
1180            } else
1181                ret = this.child.toString(options)+this.child2.toString(options);
1182            return ret;
1183        }
1184    }
1185
1186    /**
1187     * This class represents a node in parse tree.
1188     */

1189    static class CharToken extends Token implements java.io.Serializable JavaDoc {
1190        int chardata;
1191
1192        CharToken(int type, int ch) {
1193            super(type);
1194            this.chardata = ch;
1195        }
1196
1197        int getChar() {
1198            return this.chardata;
1199        }
1200
1201        public String JavaDoc toString(int options) {
1202            String JavaDoc ret;
1203            switch (this.type) {
1204              case CHAR:
1205                switch (this.chardata) {
1206                  case '|': case '*': case '+': case '?':
1207                  case '(': case ')': case '.': case '[':
1208                  case '{': case '\\':
1209                    ret = "\\"+(char)this.chardata;
1210                    break;
1211                  case '\f': ret = "\\f"; break;
1212                  case '\n': ret = "\\n"; break;
1213                  case '\r': ret = "\\r"; break;
1214                  case '\t': ret = "\\t"; break;
1215                  case 0x1b: ret = "\\e"; break;
1216                    //case 0x0b: ret = "\\v"; break;
1217
default:
1218                    if (this.chardata >= 0x10000) {
1219                        String JavaDoc pre = "0"+Integer.toHexString(this.chardata);
1220                        ret = "\\v"+pre.substring(pre.length()-6, pre.length());
1221                    } else
1222                        ret = ""+(char)this.chardata;
1223                }
1224                break;
1225
1226              case ANCHOR:
1227                if (this == Token.token_linebeginning || this == Token.token_lineend)
1228                    ret = ""+(char)this.chardata;
1229                else
1230                    ret = "\\"+(char)this.chardata;
1231                break;
1232
1233              default:
1234                ret = null;
1235            }
1236            return ret;
1237        }
1238
1239        boolean match(int ch) {
1240            if (this.type == CHAR) {
1241                return ch == this.chardata;
1242            } else
1243                throw new RuntimeException JavaDoc("NFAArrow#match(): Internal error: "+this.type);
1244        }
1245    }
1246
1247    /**
1248     * This class represents a node in parse tree.
1249     */

1250    static class ClosureToken extends Token implements java.io.Serializable JavaDoc {
1251        int min;
1252        int max;
1253        Token child;
1254
1255        ClosureToken(int type, Token tok) {
1256            super(type);
1257            this.child = tok;
1258            this.setMin(-1);
1259            this.setMax(-1);
1260        }
1261
1262        int size() {
1263            return 1;
1264        }
1265        Token getChild(int index) {
1266            return this.child;
1267        }
1268
1269        final void setMin(int min) {
1270            this.min = min;
1271        }
1272        final void setMax(int max) {
1273            this.max = max;
1274        }
1275        final int getMin() {
1276            return this.min;
1277        }
1278        final int getMax() {
1279            return this.max;
1280        }
1281
1282        public String JavaDoc toString(int options) {
1283            String JavaDoc ret;
1284            if (this.type == CLOSURE) {
1285                if (this.getMin() < 0 && this.getMax() < 0) {
1286                    ret = this.child.toString(options)+"*";
1287                } else if (this.getMin() == this.getMax()) {
1288                    ret = this.child.toString(options)+"{"+this.getMin()+"}";
1289                } else if (this.getMin() >= 0 && this.getMax() >= 0) {
1290                    ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}";
1291                } else if (this.getMin() >= 0 && this.getMax() < 0) {
1292                    ret = this.child.toString(options)+"{"+this.getMin()+",}";
1293                } else
1294                    throw new RuntimeException JavaDoc("Token#toString(): CLOSURE "
1295                                               +this.getMin()+", "+this.getMax());
1296            } else {
1297                if (this.getMin() < 0 && this.getMax() < 0) {
1298                    ret = this.child.toString(options)+"*?";
1299                } else if (this.getMin() == this.getMax()) {
1300                    ret = this.child.toString(options)+"{"+this.getMin()+"}?";
1301                } else if (this.getMin() >= 0 && this.getMax() >= 0) {
1302                    ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?";
1303                } else if (this.getMin() >= 0 && this.getMax() < 0) {
1304                    ret = this.child.toString(options)+"{"+this.getMin()+",}?";
1305                } else
1306                    throw new RuntimeException JavaDoc("Token#toString(): NONGREEDYCLOSURE "
1307                                               +this.getMin()+", "+this.getMax());
1308            }
1309            return ret;
1310        }
1311    }
1312
1313    /**
1314     * This class represents a node in parse tree.
1315     */

1316    static class ParenToken extends Token implements java.io.Serializable JavaDoc {
1317        Token child;
1318        int parennumber;
1319
1320        ParenToken(int type, Token tok, int paren) {
1321            super(type);
1322            this.child = tok;
1323            this.parennumber = paren;
1324        }
1325
1326        int size() {
1327            return 1;
1328        }
1329        Token getChild(int index) {
1330            return this.child;
1331        }
1332
1333        int getParenNumber() {
1334            return this.parennumber;
1335        }
1336
1337        public String JavaDoc toString(int options) {
1338            String JavaDoc ret = null;
1339            switch (this.type) {
1340              case PAREN:
1341                if (this.parennumber == 0) {
1342                    ret = "(?:"+this.child.toString(options)+")";
1343                } else {
1344                    ret = "("+this.child.toString(options)+")";
1345                }
1346                break;
1347
1348              case LOOKAHEAD:
1349                ret = "(?="+this.child.toString(options)+")";
1350                break;
1351              case NEGATIVELOOKAHEAD:
1352                ret = "(?!"+this.child.toString(options)+")";
1353                break;
1354              case LOOKBEHIND:
1355                ret = "(?<="+this.child.toString(options)+")";
1356                break;
1357              case NEGATIVELOOKBEHIND:
1358                ret = "(?<!"+this.child.toString(options)+")";
1359                break;
1360              case INDEPENDENT:
1361                ret = "(?>"+this.child.toString(options)+")";
1362                break;
1363            }
1364            return ret;
1365        }
1366    }
1367
1368    /**
1369     * (?(condition)yes-pattern|no-pattern)
1370     */

1371    static class ConditionToken extends Token implements java.io.Serializable JavaDoc {
1372        int refNumber;
1373        Token condition;
1374        Token yes;
1375        Token no;
1376        ConditionToken(int refno, Token cond, Token yespat, Token nopat) {
1377            super(Token.CONDITION);
1378            this.refNumber = refno;
1379            this.condition = cond;
1380            this.yes = yespat;
1381            this.no = nopat;
1382        }
1383        int size() {
1384            return this.no == null ? 1 : 2;
1385        }
1386        Token getChild(int index) {
1387            if (index == 0) return this.yes;
1388            if (index == 1) return this.no;
1389            throw new RuntimeException JavaDoc("Internal Error: "+index);
1390        }
1391
1392        public String JavaDoc toString(int options) {
1393            String JavaDoc ret;
1394            if (refNumber > 0) {
1395                ret = "(?("+refNumber+")";
1396            } else if (this.condition.type == Token.ANCHOR) {
1397                ret = "(?("+this.condition+")";
1398            } else {
1399                ret = "(?"+this.condition;
1400            }
1401
1402            if (this.no == null) {
1403                ret += this.yes+")";
1404            } else {
1405                ret += this.yes+"|"+this.no+")";
1406            }
1407            return ret;
1408        }
1409    }
1410
1411    /**
1412     * (ims-ims: .... )
1413     */

1414    static class ModifierToken extends Token implements java.io.Serializable JavaDoc {
1415        Token child;
1416        int add;
1417        int mask;
1418
1419        ModifierToken(Token tok, int add, int mask) {
1420            super(Token.MODIFIERGROUP);
1421            this.child = tok;
1422            this.add = add;
1423            this.mask = mask;
1424        }
1425
1426        int size() {
1427            return 1;
1428        }
1429        Token getChild(int index) {
1430            return this.child;
1431        }
1432
1433        int getOptions() {
1434            return this.add;
1435        }
1436        int getOptionsMask() {
1437            return this.mask;
1438        }
1439
1440        public String JavaDoc toString(int options) {
1441            return "(?"
1442                +(this.add == 0 ? "" : REUtil.createOptionString(this.add))
1443                +(this.mask == 0 ? "" : REUtil.createOptionString(this.mask))
1444                +":"
1445                +this.child.toString(options)
1446                +")";
1447        }
1448    }
1449
1450    /**
1451     * This class represents a node in parse tree.
1452     * for UNION or CONCAT.
1453     */

1454    static class UnionToken extends Token implements java.io.Serializable JavaDoc {
1455        Vector JavaDoc children;
1456
1457        UnionToken(int type) {
1458            super(type);
1459        }
1460
1461        void addChild(Token tok) {
1462            if (tok == null) return;
1463            if (this.children == null) this.children = new Vector JavaDoc();
1464            if (this.type == UNION) {
1465                this.children.addElement(tok);
1466                return;
1467            }
1468                                                // This is CONCAT, and new child is CONCAT.
1469
if (tok.type == CONCAT) {
1470                for (int i = 0; i < tok.size(); i ++)
1471                    this.addChild(tok.getChild(i)); // Recursion
1472
return;
1473            }
1474            int size = this.children.size();
1475            if (size == 0) {
1476                this.children.addElement(tok);
1477                return;
1478            }
1479            Token previous = (Token)this.children.elementAt(size-1);
1480            if (!((previous.type == CHAR || previous.type == STRING)
1481                  && (tok.type == CHAR || tok.type == STRING))) {
1482                this.children.addElement(tok);
1483                return;
1484            }
1485            
1486            //System.err.println("Merge '"+previous+"' and '"+tok+"'.");
1487

1488            StringBuffer JavaDoc buffer;
1489            int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString().length());
1490            if (previous.type == CHAR) { // Replace previous token by STRING
1491
buffer = new StringBuffer JavaDoc(2 + nextMaxLength);
1492                int ch = previous.getChar();
1493                if (ch >= 0x10000)
1494                    buffer.append(REUtil.decomposeToSurrogates(ch));
1495                else
1496                    buffer.append((char)ch);
1497                previous = Token.createString(null);
1498                this.children.setElementAt(previous, size-1);
1499            } else { // STRING
1500
buffer = new StringBuffer JavaDoc(previous.getString().length() + nextMaxLength);
1501                buffer.append(previous.getString());
1502            }
1503
1504            if (tok.type == CHAR) {
1505                int ch = tok.getChar();
1506                if (ch >= 0x10000)
1507                    buffer.append(REUtil.decomposeToSurrogates(ch));
1508                else
1509                    buffer.append((char)ch);
1510            } else {
1511                buffer.append(tok.getString());
1512            }
1513
1514            ((StringToken)previous).string = new String JavaDoc(buffer);
1515        }
1516
1517        int size() {
1518            return this.children == null ? 0 : this.children.size();
1519        }
1520        Token getChild(int index) {
1521            return (Token)this.children.elementAt(index);
1522        }
1523
1524        public String JavaDoc toString(int options) {
1525            String JavaDoc ret;
1526            if (this.type == CONCAT) {
1527                if (this.children.size() == 2) {
1528                    Token ch = this.getChild(0);
1529                    Token ch2 = this.getChild(1);
1530                    if (ch2.type == CLOSURE && ch2.getChild(0) == ch) {
1531                        ret = ch.toString(options)+"+";
1532                    } else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) {
1533                        ret = ch.toString(options)+"+?";
1534                    } else
1535                        ret = ch.toString(options)+ch2.toString(options);
1536                } else {
1537                    StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
1538                    for (int i = 0; i < this.children.size(); i ++) {
1539                        sb.append(((Token)this.children.elementAt(i)).toString(options));
1540                    }
1541                    ret = new String JavaDoc(sb);
1542                }
1543                return ret;
1544            }
1545            if (this.children.size() == 2 && this.getChild(1).type == EMPTY) {
1546                ret = this.getChild(0).toString(options)+"?";
1547            } else if (this.children.size() == 2
1548                       && this.getChild(0).type == EMPTY) {
1549                ret = this.getChild(1).toString(options)+"??";
1550            } else {
1551                StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
1552                sb.append(((Token)this.children.elementAt(0)).toString(options));
1553                for (int i = 1; i < this.children.size(); i ++) {
1554                    sb.append((char)'|');
1555                    sb.append(((Token)this.children.elementAt(i)).toString(options));
1556                }
1557                ret = new String JavaDoc(sb);
1558            }
1559            return ret;
1560        }
1561    }
1562}
1563
Popular Tags