KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > xerces > impl > xpath > regex > Token


1 /*
2  * Copyright 1999-2002,2004,2005 The Apache Software Foundation.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */

16
17 package org.apache.xerces.impl.xpath.regex;
18
19 import java.util.Vector JavaDoc;
20 import java.util.Hashtable JavaDoc;
21
22 /**
23  * This class represents a node in parse tree.
24  *
25  * @xerces.internal
26  *
27  * @version $Id: Token.java,v 1.10 2005/03/22 03:26:24 mrglavas Exp $
28  */

29 class Token implements java.io.Serializable JavaDoc {
30
31     private static final long serialVersionUID = 4049923761862293040L;
32
33     static final boolean COUNTTOKENS = true;
34     static int tokens = 0;
35
36     static final int CHAR = 0; // Literal char
37
static final int DOT = 11; // .
38
static final int CONCAT = 1; // XY
39
static final int UNION = 2; // X|Y|Z
40
static final int CLOSURE = 3; // X*
41
static final int RANGE = 4; // [a-zA-Z] etc.
42
static final int NRANGE = 5; // [^a-zA-Z] etc.
43
static final int PAREN = 6; // (X) or (?:X)
44
static final int EMPTY = 7; //
45
static final int ANCHOR = 8; // ^ $ \b \B \< \> \A \Z \z
46
static final int NONGREEDYCLOSURE = 9; // *? +?
47
static final int STRING = 10; // strings
48
static final int BACKREFERENCE = 12; // back references
49
static final int LOOKAHEAD = 20; // (?=...)
50
static final int NEGATIVELOOKAHEAD = 21; // (?!...)
51
static final int LOOKBEHIND = 22; // (?<=...)
52
static final int NEGATIVELOOKBEHIND = 23; // (?<!...)
53
static final int INDEPENDENT = 24; // (?>...)
54
static final int MODIFIERGROUP = 25; // (?ims-ims:...)
55
static final int CONDITION = 26; // (?(...)yes|no)
56

57     static final int UTF16_MAX = 0x10ffff;
58
59     int type;
60
61     static Token token_dot;
62     static Token token_0to9;
63     static Token token_wordchars;
64     static Token token_not_0to9;
65     static Token token_not_wordchars;
66     static Token token_spaces;
67     static Token token_not_spaces;
68     static Token token_empty;
69     static Token token_linebeginning;
70     static Token token_linebeginning2;
71     static Token token_lineend;
72     static Token token_stringbeginning;
73     static Token token_stringend;
74     static Token token_stringend2;
75     static Token token_wordedge;
76     static Token token_not_wordedge;
77     static Token token_wordbeginning;
78     static Token token_wordend;
79     static {
80         Token.token_empty = new Token(Token.EMPTY);
81
82         Token.token_linebeginning = Token.createAnchor('^');
83         Token.token_linebeginning2 = Token.createAnchor('@');
84         Token.token_lineend = Token.createAnchor('$');
85         Token.token_stringbeginning = Token.createAnchor('A');
86         Token.token_stringend = Token.createAnchor('z');
87         Token.token_stringend2 = Token.createAnchor('Z');
88         Token.token_wordedge = Token.createAnchor('b');
89         Token.token_not_wordedge = Token.createAnchor('B');
90         Token.token_wordbeginning = Token.createAnchor('<');
91         Token.token_wordend = Token.createAnchor('>');
92
93         Token.token_dot = new Token(Token.DOT);
94
95         Token.token_0to9 = Token.createRange();
96         Token.token_0to9.addRange('0', '9');
97         Token.token_wordchars = Token.createRange();
98         Token.token_wordchars.addRange('0', '9');
99         Token.token_wordchars.addRange('A', 'Z');
100         Token.token_wordchars.addRange('_', '_');
101         Token.token_wordchars.addRange('a', 'z');
102         Token.token_spaces = Token.createRange();
103         Token.token_spaces.addRange('\t', '\t');
104         Token.token_spaces.addRange('\n', '\n');
105         Token.token_spaces.addRange('\f', '\f');
106         Token.token_spaces.addRange('\r', '\r');
107         Token.token_spaces.addRange(' ', ' ');
108
109         Token.token_not_0to9 = Token.complementRanges(Token.token_0to9);
110         Token.token_not_wordchars = Token.complementRanges(Token.token_wordchars);
111         Token.token_not_spaces = Token.complementRanges(Token.token_spaces);
112     }
113
114     static Token.ParenToken createLook(int type, Token child) {
115         if (COUNTTOKENS) Token.tokens ++;
116         return new Token.ParenToken(type, child, 0);
117     }
118     static Token.ParenToken createParen(Token child, int pnumber) {
119         if (COUNTTOKENS) Token.tokens ++;
120         return new Token.ParenToken(Token.PAREN, child, pnumber);
121     }
122     static Token.ClosureToken createClosure(Token tok) {
123         if (COUNTTOKENS) Token.tokens ++;
124         return new Token.ClosureToken(Token.CLOSURE, tok);
125     }
126     static Token.ClosureToken createNGClosure(Token tok) {
127         if (COUNTTOKENS) Token.tokens ++;
128         return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok);
129     }
130     static Token.ConcatToken createConcat(Token tok1, Token tok2) {
131         if (COUNTTOKENS) Token.tokens ++;
132         return new Token.ConcatToken(tok1, tok2);
133     }
134     static Token.UnionToken createConcat() {
135         if (COUNTTOKENS) Token.tokens ++;
136         return new Token.UnionToken(Token.CONCAT); // *** It is not a bug.
137
}
138     static Token.UnionToken createUnion() {
139         if (COUNTTOKENS) Token.tokens ++;
140         return new Token.UnionToken(Token.UNION);
141     }
142     static Token createEmpty() {
143         return Token.token_empty;
144     }
145     static RangeToken createRange() {
146         if (COUNTTOKENS) Token.tokens ++;
147         return new RangeToken(Token.RANGE);
148     }
149     static RangeToken createNRange() {
150         if (COUNTTOKENS) Token.tokens ++;
151         return new RangeToken(Token.NRANGE);
152     }
153     static Token.CharToken createChar(int ch) {
154         if (COUNTTOKENS) Token.tokens ++;
155         return new Token.CharToken(Token.CHAR, ch);
156     }
157     static private Token.CharToken createAnchor(int ch) {
158         if (COUNTTOKENS) Token.tokens ++;
159         return new Token.CharToken(Token.ANCHOR, ch);
160     }
161     static Token.StringToken createBackReference(int refno) {
162         if (COUNTTOKENS) Token.tokens ++;
163         return new Token.StringToken(Token.BACKREFERENCE, null, refno);
164     }
165     static Token.StringToken createString(String JavaDoc str) {
166         if (COUNTTOKENS) Token.tokens ++;
167         return new Token.StringToken(Token.STRING, str, 0);
168     }
169     static Token.ModifierToken createModifierGroup(Token child, int add, int mask) {
170         if (COUNTTOKENS) Token.tokens ++;
171         return new Token.ModifierToken(child, add, mask);
172     }
173     static Token.ConditionToken createCondition(int refno, Token condition,
174                                                 Token yespat, Token nopat) {
175         if (COUNTTOKENS) Token.tokens ++;
176         return new Token.ConditionToken(refno, condition, yespat, nopat);
177     }
178
/**
 * Creates a token of the given node type. Does not touch the token counter.
 *
 * @param type the node type to store in {@link #type}
 */
protected Token(int type) {
    this.type = type;
}
182
183     /**
184      * A number of children.
185      */

186     int size() {
187         return 0;
188     }
189     Token getChild(int index) {
190         return null;
191     }
192     void addChild(Token tok) {
193         throw new RuntimeException JavaDoc("Not supported.");
194     }
195
196                                                 // for RANGE or NRANGE
197
protected void addRange(int start, int end) {
198         throw new RuntimeException JavaDoc("Not supported.");
199     }
200     protected void sortRanges() {
201         throw new RuntimeException JavaDoc("Not supported.");
202     }
203     protected void compactRanges() {
204         throw new RuntimeException JavaDoc("Not supported.");
205     }
206     protected void mergeRanges(Token tok) {
207         throw new RuntimeException JavaDoc("Not supported.");
208     }
209     protected void subtractRanges(Token tok) {
210         throw new RuntimeException JavaDoc("Not supported.");
211     }
212     protected void intersectRanges(Token tok) {
213         throw new RuntimeException JavaDoc("Not supported.");
214     }
215     static Token complementRanges(Token tok) {
216         return RangeToken.complementRanges(tok);
217     }
218
219
220     void setMin(int min) { // for CLOSURE
221
}
222     void setMax(int max) { // for CLOSURE
223
}
224     int getMin() { // for CLOSURE
225
return -1;
226     }
227     int getMax() { // for CLOSURE
228
return -1;
229     }
230     int getReferenceNumber() { // for STRING
231
return 0;
232     }
233     String JavaDoc getString() { // for STRING
234
return null;
235     }
236
237     int getParenNumber() {
238         return 0;
239     }
240     int getChar() {
241         return -1;
242     }
243
244     public String JavaDoc toString() {
245         return this.toString(0);
246     }
247     public String JavaDoc toString(int options) {
248         return this.type == Token.DOT ? "." : "";
249     }
250
251     /**
252      * How many characters are needed?
253      */

254     final int getMinLength() {
255         switch (this.type) {
256           case CONCAT:
257             int sum = 0;
258             for (int i = 0; i < this.size(); i ++)
259                 sum += this.getChild(i).getMinLength();
260             return sum;
261
262           case CONDITION:
263           case UNION:
264             if (this.size() == 0)
265                 return 0;
266             int ret = this.getChild(0).getMinLength();
267             for (int i = 1; i < this.size(); i ++) {
268                 int min = this.getChild(i).getMinLength();
269                 if (min < ret) ret = min;
270             }
271             return ret;
272
273           case CLOSURE:
274           case NONGREEDYCLOSURE:
275             if (this.getMin() >= 0)
276                 return this.getMin() * this.getChild(0).getMinLength();
277             return 0;
278
279           case EMPTY:
280           case ANCHOR:
281             return 0;
282
283           case DOT:
284           case CHAR:
285           case RANGE:
286           case NRANGE:
287             return 1;
288
289           case INDEPENDENT:
290           case PAREN:
291           case MODIFIERGROUP:
292             return this.getChild(0).getMinLength();
293
294           case BACKREFERENCE:
295             return 0; // *******
296

297           case STRING:
298             return this.getString().length();
299
300           case LOOKAHEAD:
301           case NEGATIVELOOKAHEAD:
302           case LOOKBEHIND:
303           case NEGATIVELOOKBEHIND:
304             return 0; // ***** Really?
305

306           default:
307             throw new RuntimeException JavaDoc("Token#getMinLength(): Invalid Type: "+this.type);
308         }
309     }
310
311     final int getMaxLength() {
312         switch (this.type) {
313           case CONCAT:
314             int sum = 0;
315             for (int i = 0; i < this.size(); i ++) {
316                 int d = this.getChild(i).getMaxLength();
317                 if (d < 0) return -1;
318                 sum += d;
319             }
320             return sum;
321
322           case CONDITION:
323           case UNION:
324             if (this.size() == 0)
325                 return 0;
326             int ret = this.getChild(0).getMaxLength();
327             for (int i = 1; ret >= 0 && i < this.size(); i ++) {
328                 int max = this.getChild(i).getMaxLength();
329                 if (max < 0) { // infinity
330
ret = -1;
331                     break;
332                 }
333                 if (max > ret) ret = max;
334             }
335             return ret;
336
337           case CLOSURE:
338           case NONGREEDYCLOSURE:
339             if (this.getMax() >= 0)
340                                                 // When this.child.getMaxLength() < 0,
341
// this returns minus value
342
return this.getMax() * this.getChild(0).getMaxLength();
343             return -1;
344
345           case EMPTY:
346           case ANCHOR:
347             return 0;
348
349           case CHAR:
350             return 1;
351           case DOT:
352           case RANGE:
353           case NRANGE:
354             return 2;
355
356           case INDEPENDENT:
357           case PAREN:
358           case MODIFIERGROUP:
359             return this.getChild(0).getMaxLength();
360
361           case BACKREFERENCE:
362             return -1; // ******
363

364           case STRING:
365             return this.getString().length();
366
367           case LOOKAHEAD:
368           case NEGATIVELOOKAHEAD:
369           case LOOKBEHIND:
370           case NEGATIVELOOKBEHIND:
371             return 0; // ***** Really?
372

373           default:
374             throw new RuntimeException JavaDoc("Token#getMaxLength(): Invalid Type: "+this.type);
375         }
376     }
377
378     static final int FC_CONTINUE = 0;
379     static final int FC_TERMINAL = 1;
380     static final int FC_ANY = 2;
381     private static final boolean isSet(int options, int flag) {
382         return (options & flag) == flag;
383     }
384     final int analyzeFirstCharacter(RangeToken result, int options) {
385         switch (this.type) {
386           case CONCAT:
387             int ret = FC_CONTINUE;
388             for (int i = 0; i < this.size(); i ++)
389                 if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE)
390                     break;
391             return ret;
392
393           case UNION:
394             if (this.size() == 0)
395                 return FC_CONTINUE;
396             /*
397              * a|b|c -> FC_TERMINAL
398              * a|.|c -> FC_ANY
399              * a|b| -> FC_CONTINUE
400              */

401             int ret2 = FC_CONTINUE;
402             boolean hasEmpty = false;
403             for (int i = 0; i < this.size(); i ++) {
404                 ret2 = this.getChild(i).analyzeFirstCharacter(result, options);
405                 if (ret2 == FC_ANY)
406                     break;
407                 else if (ret2 == FC_CONTINUE)
408                     hasEmpty = true;
409             }
410             return hasEmpty ? FC_CONTINUE : ret2;
411
412           case CONDITION:
413             int ret3 = this.getChild(0).analyzeFirstCharacter(result, options);
414             if (this.size() == 1) return FC_CONTINUE;
415             if (ret3 == FC_ANY) return ret3;
416             int ret4 = this.getChild(1).analyzeFirstCharacter(result, options);
417             if (ret4 == FC_ANY) return ret4;
418             return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL;
419
420           case CLOSURE:
421           case NONGREEDYCLOSURE:
422             this.getChild(0).analyzeFirstCharacter(result, options);
423             return FC_CONTINUE;
424
425           case EMPTY:
426           case ANCHOR:
427             return FC_CONTINUE;
428
429           case CHAR:
430             int ch = this.getChar();
431             result.addRange(ch, ch);
432             if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
433                 ch = Character.toUpperCase((char)ch);
434                 result.addRange(ch, ch);
435                 ch = Character.toLowerCase((char)ch);
436                 result.addRange(ch, ch);
437             }
438             return FC_TERMINAL;
439
440           case DOT: // ****
441
if (isSet(options, RegularExpression.SINGLE_LINE)) {
442                 return FC_CONTINUE; // **** We can not optimize.
443
} else {
444                 return FC_CONTINUE;
445                 /*
446                 result.addRange(0, RegularExpression.LINE_FEED-1);
447                 result.addRange(RegularExpression.LINE_FEED+1, RegularExpression.CARRIAGE_RETURN-1);
448                 result.addRange(RegularExpression.CARRIAGE_RETURN+1,
449                                 RegularExpression.LINE_SEPARATOR-1);
450                 result.addRange(RegularExpression.PARAGRAPH_SEPARATOR+1, UTF16_MAX);
451                 return 1;
452                 */

453             }
454
455           case RANGE:
456             if (isSet(options, RegularExpression.IGNORE_CASE)) {
457                 result.mergeRanges(((RangeToken)this).getCaseInsensitiveToken());
458             } else {
459                 result.mergeRanges(this);
460             }
461             return FC_TERMINAL;
462
463           case NRANGE: // ****
464
if (isSet(options, RegularExpression.IGNORE_CASE)) {
465                 result.mergeRanges(Token.complementRanges(((RangeToken)this).getCaseInsensitiveToken()));
466             } else {
467                 result.mergeRanges(Token.complementRanges(this));
468             }
469             return FC_TERMINAL;
470
471           case INDEPENDENT:
472           case PAREN:
473             return this.getChild(0).analyzeFirstCharacter(result, options);
474
475           case MODIFIERGROUP:
476             options |= ((ModifierToken)this).getOptions();
477             options &= ~((ModifierToken)this).getOptionsMask();
478             return this.getChild(0).analyzeFirstCharacter(result, options);
479
480           case BACKREFERENCE:
481             result.addRange(0, UTF16_MAX); // **** We can not optimize.
482
return FC_ANY;
483
484           case STRING:
485             int cha = this.getString().charAt(0);
486             int ch2;
487             if (REUtil.isHighSurrogate(cha)
488                 && this.getString().length() >= 2
489                 && REUtil.isLowSurrogate((ch2 = this.getString().charAt(1))))
490                 cha = REUtil.composeFromSurrogates(cha, ch2);
491             result.addRange(cha, cha);
492             if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
493                 cha = Character.toUpperCase((char)cha);
494                 result.addRange(cha, cha);
495                 cha = Character.toLowerCase((char)cha);
496                 result.addRange(cha, cha);
497             }
498             return FC_TERMINAL;
499
500           case LOOKAHEAD:
501           case NEGATIVELOOKAHEAD:
502           case LOOKBEHIND:
503           case NEGATIVELOOKBEHIND:
504             return FC_CONTINUE;
505
506           default:
507             throw new RuntimeException JavaDoc("Token#analyzeHeadCharacter(): Invalid Type: "+this.type);
508         }
509     }
510
511     private final boolean isShorterThan(Token tok) {
512         if (tok == null) return false;
513         /*
514         int mylength;
515         if (this.type == STRING) mylength = this.getString().length();
516         else if (this.type == CHAR) mylength = this.getChar() >= 0x10000 ? 2 : 1;
517         else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
518         int otherlength;
519         if (tok.type == STRING) otherlength = tok.getString().length();
520         else if (tok.type == CHAR) otherlength = tok.getChar() >= 0x10000 ? 2 : 1;
521         else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
522         */

523         int mylength;
524         if (this.type == STRING) mylength = this.getString().length();
525         else throw new RuntimeException JavaDoc("Internal Error: Illegal type: "+this.type);
526         int otherlength;
527         if (tok.type == STRING) otherlength = tok.getString().length();
528         else throw new RuntimeException JavaDoc("Internal Error: Illegal type: "+tok.type);
529         return mylength < otherlength;
530     }
531
532     static class FixedStringContainer {
533         Token token = null;
534         int options = 0;
535         FixedStringContainer() {
536         }
537     }
538
539     final void findFixedString(FixedStringContainer container, int options) {
540         switch (this.type) {
541           case CONCAT:
542             Token prevToken = null;
543             int prevOptions = 0;
544             for (int i = 0; i < this.size(); i ++) {
545                 this.getChild(i).findFixedString(container, options);
546                 if (prevToken == null || prevToken.isShorterThan(container.token)) {
547                     prevToken = container.token;
548                     prevOptions = container.options;
549                 }
550             }
551             container.token = prevToken;
552             container.options = prevOptions;
553             return;
554
555           case UNION:
556           case CLOSURE:
557           case NONGREEDYCLOSURE:
558           case EMPTY:
559           case ANCHOR:
560           case RANGE:
561           case DOT:
562           case NRANGE:
563           case BACKREFERENCE:
564           case LOOKAHEAD:
565           case NEGATIVELOOKAHEAD:
566           case LOOKBEHIND:
567           case NEGATIVELOOKBEHIND:
568           case CONDITION:
569             container.token = null;
570             return;
571
572           case CHAR: // Ignore CHAR tokens.
573
container.token = null; // **
574
return; // **
575

576           case STRING:
577             container.token = this;
578             container.options = options;
579             return;
580
581           case INDEPENDENT:
582           case PAREN:
583             this.getChild(0).findFixedString(container, options);
584             return;
585
586           case MODIFIERGROUP:
587             options |= ((ModifierToken)this).getOptions();
588             options &= ~((ModifierToken)this).getOptionsMask();
589             this.getChild(0).findFixedString(container, options);
590             return;
591
592           default:
593             throw new RuntimeException JavaDoc("Token#findFixedString(): Invalid Type: "+this.type);
594         }
595     }
596
597     boolean match(int ch) {
598         throw new RuntimeException JavaDoc("NFAArrow#match(): Internal error: "+this.type);
599     }
600
601     // ------------------------------------------------------
602
private final static Hashtable JavaDoc categories = new Hashtable JavaDoc();
603     private final static Hashtable JavaDoc categories2 = new Hashtable JavaDoc();
604     private static final String JavaDoc[] categoryNames = {
605         "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd",
606         "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs",
607         "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28
608
"Pi", "Pf", // 29, 30
609
"L", "M", "N", "Z", "C", "P", "S", // 31-37
610
};
611
612     // Schema Rec. {Datatypes} - Punctuation
613
static final int CHAR_INIT_QUOTE = 29; // Pi - initial quote
614
static final int CHAR_FINAL_QUOTE = 30; // Pf - final quote
615
static final int CHAR_LETTER = 31;
616     static final int CHAR_MARK = 32;
617     static final int CHAR_NUMBER = 33;
618     static final int CHAR_SEPARATOR = 34;
619     static final int CHAR_OTHER = 35;
620     static final int CHAR_PUNCTUATION = 36;
621     static final int CHAR_SYMBOL = 37;
622     
    // Block names in Unicode 3.1 that are supported by the XML Schema REC.
624
private static final String JavaDoc[] blockNames = {
625         /*0000..007F;*/ "Basic Latin",
626         /*0080..00FF;*/ "Latin-1 Supplement",
627         /*0100..017F;*/ "Latin Extended-A",
628         /*0180..024F;*/ "Latin Extended-B",
629         /*0250..02AF;*/ "IPA Extensions",
630         /*02B0..02FF;*/ "Spacing Modifier Letters",
631         /*0300..036F;*/ "Combining Diacritical Marks",
632         /*0370..03FF;*/ "Greek",
633         /*0400..04FF;*/ "Cyrillic",
634         /*0530..058F;*/ "Armenian",
635         /*0590..05FF;*/ "Hebrew",
636         /*0600..06FF;*/ "Arabic",
637         /*0700..074F;*/ "Syriac",
638         /*0780..07BF;*/ "Thaana",
639         /*0900..097F;*/ "Devanagari",
640         /*0980..09FF;*/ "Bengali",
641         /*0A00..0A7F;*/ "Gurmukhi",
642         /*0A80..0AFF;*/ "Gujarati",
643         /*0B00..0B7F;*/ "Oriya",
644         /*0B80..0BFF;*/ "Tamil",
645         /*0C00..0C7F;*/ "Telugu",
646         /*0C80..0CFF;*/ "Kannada",
647         /*0D00..0D7F;*/ "Malayalam",
648         /*0D80..0DFF;*/ "Sinhala",
649         /*0E00..0E7F;*/ "Thai",
650         /*0E80..0EFF;*/ "Lao",
651         /*0F00..0FFF;*/ "Tibetan",
652         /*1000..109F;*/ "Myanmar",
653         /*10A0..10FF;*/ "Georgian",
654         /*1100..11FF;*/ "Hangul Jamo",
655         /*1200..137F;*/ "Ethiopic",
656         /*13A0..13FF;*/ "Cherokee",
657         /*1400..167F;*/ "Unified Canadian Aboriginal Syllabics",
658         /*1680..169F;*/ "Ogham",
659         /*16A0..16FF;*/ "Runic",
660         /*1780..17FF;*/ "Khmer",
661         /*1800..18AF;*/ "Mongolian",
662         /*1E00..1EFF;*/ "Latin Extended Additional",
663         /*1F00..1FFF;*/ "Greek Extended",
664         /*2000..206F;*/ "General Punctuation",
665         /*2070..209F;*/ "Superscripts and Subscripts",
666         /*20A0..20CF;*/ "Currency Symbols",
667         /*20D0..20FF;*/ "Combining Marks for Symbols",
668         /*2100..214F;*/ "Letterlike Symbols",
669         /*2150..218F;*/ "Number Forms",
670         /*2190..21FF;*/ "Arrows",
671         /*2200..22FF;*/ "Mathematical Operators",
672         /*2300..23FF;*/ "Miscellaneous Technical",
673         /*2400..243F;*/ "Control Pictures",
674         /*2440..245F;*/ "Optical Character Recognition",
675         /*2460..24FF;*/ "Enclosed Alphanumerics",
676         /*2500..257F;*/ "Box Drawing",
677         /*2580..259F;*/ "Block Elements",
678         /*25A0..25FF;*/ "Geometric Shapes",
679         /*2600..26FF;*/ "Miscellaneous Symbols",
680         /*2700..27BF;*/ "Dingbats",
681         /*2800..28FF;*/ "Braille Patterns",
682         /*2E80..2EFF;*/ "CJK Radicals Supplement",
683         /*2F00..2FDF;*/ "Kangxi Radicals",
684         /*2FF0..2FFF;*/ "Ideographic Description Characters",
685         /*3000..303F;*/ "CJK Symbols and Punctuation",
686         /*3040..309F;*/ "Hiragana",
687         /*30A0..30FF;*/ "Katakana",
688         /*3100..312F;*/ "Bopomofo",
689         /*3130..318F;*/ "Hangul Compatibility Jamo",
690         /*3190..319F;*/ "Kanbun",
691         /*31A0..31BF;*/ "Bopomofo Extended",
692         /*3200..32FF;*/ "Enclosed CJK Letters and Months",
693         /*3300..33FF;*/ "CJK Compatibility",
694         /*3400..4DB5;*/ "CJK Unified Ideographs Extension A",
695         /*4E00..9FFF;*/ "CJK Unified Ideographs",
696         /*A000..A48F;*/ "Yi Syllables",
697         /*A490..A4CF;*/ "Yi Radicals",
698         /*AC00..D7A3;*/ "Hangul Syllables",
699         /*E000..F8FF;*/ "Private Use",
700         /*F900..FAFF;*/ "CJK Compatibility Ideographs",
701         /*FB00..FB4F;*/ "Alphabetic Presentation Forms",
702         /*FB50..FDFF;*/ "Arabic Presentation Forms-A",
703         /*FE20..FE2F;*/ "Combining Half Marks",
704         /*FE30..FE4F;*/ "CJK Compatibility Forms",
705         /*FE50..FE6F;*/ "Small Form Variants",
706         /*FE70..FEFE;*/ "Arabic Presentation Forms-B",
707         /*FEFF..FEFF;*/ "Specials",
708         /*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms",
709          //missing Specials add manually
710
/*10300..1032F;*/ "Old Italic", // 84
711
/*10330..1034F;*/ "Gothic",
712         /*10400..1044F;*/ "Deseret",
713         /*1D000..1D0FF;*/ "Byzantine Musical Symbols",
714         /*1D100..1D1FF;*/ "Musical Symbols",
715         /*1D400..1D7FF;*/ "Mathematical Alphanumeric Symbols",
716         /*20000..2A6D6;*/ "CJK Unified Ideographs Extension B",
717         /*2F800..2FA1F;*/ "CJK Compatibility Ideographs Supplement",
718         /*E0000..E007F;*/ "Tags",
719         //missing 2 private use add manually
720

721     };
    // The following ranges have no entry of their own in the block range
    // tables and are added manually in getRange():
    //   FFF0..FFFD      "Specials"
    //   F0000..FFFFD    "Private Use"
    //   100000..10FFFD  "Private Use"
726
static final String JavaDoc blockRanges =
727        "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F"
728         +"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF"
729         +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF"
730         +"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF"
731         +"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF"
732         +"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF"
733         +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF"
734         +"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F"
735         +"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF"
736         +"\uAC00\uD7A3\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF"
737         +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF";
738     static final int[] nonBMPBlockRanges = {
739         0x10300, 0x1032F, // 84
740
0x10330, 0x1034F,
741         0x10400, 0x1044F,
742         0x1D000, 0x1D0FF,
743         0x1D100, 0x1D1FF,
744         0x1D400, 0x1D7FF,
745         0x20000, 0x2A6D6,
746         0x2F800, 0x2FA1F,
747         0xE0000, 0xE007F
748     };
749     private static final int NONBMP_BLOCK_START = 84;
750
751     static protected RangeToken getRange(String JavaDoc name, boolean positive) {
752         if (Token.categories.size() == 0) {
753             synchronized (Token.categories) {
754                 Token[] ranges = new Token[Token.categoryNames.length];
755                 for (int i = 0; i < ranges.length; i ++) {
756                     ranges[i] = Token.createRange();
757                 }
758                 int type;
759                 for (int i = 0; i < 0x10000; i ++) {
760                     type = Character.getType((char)i);
761                     if (type == Character.START_PUNCTUATION ||
762                         type == Character.END_PUNCTUATION) {
763                         //build table of Pi values
764
if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C ||
765                             i == 0x201F || i == 0x2039) {
766                             type = CHAR_INIT_QUOTE;
767                         }
768                         //build table of Pf values
769
if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) {
770                             type = CHAR_FINAL_QUOTE;
771                         }
772                     }
773                     ranges[type].addRange(i, i);
774                     switch (type) {
775                       case Character.UPPERCASE_LETTER:
776                       case Character.LOWERCASE_LETTER:
777                       case Character.TITLECASE_LETTER:
778                       case Character.MODIFIER_LETTER:
779                       case Character.OTHER_LETTER:
780                         type = CHAR_LETTER;
781                         break;
782                       case Character.NON_SPACING_MARK:
783                       case Character.COMBINING_SPACING_MARK:
784                       case Character.ENCLOSING_MARK:
785                         type = CHAR_MARK;
786                         break;
787                       case Character.DECIMAL_DIGIT_NUMBER:
788                       case Character.LETTER_NUMBER:
789                       case Character.OTHER_NUMBER:
790                         type = CHAR_NUMBER;
791                         break;
792                       case Character.SPACE_SEPARATOR:
793                       case Character.LINE_SEPARATOR:
794                       case Character.PARAGRAPH_SEPARATOR:
795                         type = CHAR_SEPARATOR;
796                         break;
797                       case Character.CONTROL:
798                       case Character.FORMAT:
799                       case Character.SURROGATE:
800                       case Character.PRIVATE_USE:
801                       case Character.UNASSIGNED:
802                         type = CHAR_OTHER;
803                         break;
804                       case Character.CONNECTOR_PUNCTUATION:
805                       case Character.DASH_PUNCTUATION:
806                       case Character.START_PUNCTUATION:
807                       case Character.END_PUNCTUATION:
808                       case CHAR_INIT_QUOTE:
809                       case CHAR_FINAL_QUOTE:
810                       case Character.OTHER_PUNCTUATION:
811                         type = CHAR_PUNCTUATION;
812                         break;
813                       case Character.MATH_SYMBOL:
814                       case Character.CURRENCY_SYMBOL:
815                       case Character.MODIFIER_SYMBOL:
816                       case Character.OTHER_SYMBOL:
817                         type = CHAR_SYMBOL;
818                         break;
819                       default:
820                         throw new RuntimeException JavaDoc("org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type);
821                     }
822                     ranges[type].addRange(i, i);
823                 } // for all characters
824
ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX);
825
826                 for (int i = 0; i < ranges.length; i ++) {
827                     if (Token.categoryNames[i] != null) {
828                         if (i == Character.UNASSIGNED) { // Unassigned
829
ranges[i].addRange(0x10000, Token.UTF16_MAX);
830                         }
831                         Token.categories.put(Token.categoryNames[i], ranges[i]);
832                         Token.categories2.put(Token.categoryNames[i],
833                                               Token.complementRanges(ranges[i]));
834                     }
835                 }
836                 //REVISIT: do we really need to support block names as in Unicode 3.1
837
// or we can just create all the names in IsBLOCKNAME format (XML Schema REC)?
838
//
839
StringBuffer JavaDoc buffer = new StringBuffer JavaDoc(50);
840                 for (int i = 0; i < Token.blockNames.length; i ++) {
841                     Token r1 = Token.createRange();
842                     int location;
843                     if (i < NONBMP_BLOCK_START) {
844                         location = i*2;
845                         int rstart = Token.blockRanges.charAt(location);
846                         int rend = Token.blockRanges.charAt(location+1);
847                         //DEBUGING
848
//System.out.println(n+" " +Integer.toHexString(rstart)
849
// +"-"+ Integer.toHexString(rend));
850
r1.addRange(rstart, rend);
851                     } else {
852                         location = (i - NONBMP_BLOCK_START) * 2;
853                         r1.addRange(Token.nonBMPBlockRanges[location],
854                                     Token.nonBMPBlockRanges[location + 1]);
855                     }
856                     String JavaDoc n = Token.blockNames[i];
857                     if (n.equals("Specials"))
858                         r1.addRange(0xfff0, 0xfffd);
859                     if (n.equals("Private Use")) {
860                         r1.addRange(0xF0000,0xFFFFD);
861                         r1.addRange(0x100000,0x10FFFD);
862                     }
863                     Token.categories.put(n, r1);
864                     Token.categories2.put(n, Token.complementRanges(r1));
865                     buffer.setLength(0);
866                     buffer.append("Is");
867                     if (n.indexOf(' ') >= 0) {
868                         for (int ci = 0; ci < n.length(); ci ++)
869                             if (n.charAt(ci) != ' ') buffer.append((char)n.charAt(ci));
870                     }
871                     else {
872                         buffer.append(n);
873                     }
874                     Token.setAlias(buffer.toString(), n, true);
875                 }
876
877                 // TR#18 1.2
878
Token.setAlias("ASSIGNED", "Cn", false);
879                 Token.setAlias("UNASSIGNED", "Cn", true);
880                 Token all = Token.createRange();
881                 all.addRange(0, Token.UTF16_MAX);
882                 Token.categories.put("ALL", all);
883                 Token.categories2.put("ALL", Token.complementRanges(all));
884                 Token.registerNonXS("ASSIGNED");
885                 Token.registerNonXS("UNASSIGNED");
886                 Token.registerNonXS("ALL");
887
888                 Token isalpha = Token.createRange();
889                 isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu
890
isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll
891
isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo
892
Token.categories.put("IsAlpha", isalpha);
893                 Token.categories2.put("IsAlpha", Token.complementRanges(isalpha));
894                 Token.registerNonXS("IsAlpha");
895
896                 Token isalnum = Token.createRange();
897                 isalnum.mergeRanges(isalpha); // Lu Ll Lo
898
isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd
899
Token.categories.put("IsAlnum", isalnum);
900                 Token.categories2.put("IsAlnum", Token.complementRanges(isalnum));
901                 Token.registerNonXS("IsAlnum");
902
903                 Token isspace = Token.createRange();
904                 isspace.mergeRanges(Token.token_spaces);
905                 isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z
906
Token.categories.put("IsSpace", isspace);
907                 Token.categories2.put("IsSpace", Token.complementRanges(isspace));
908                 Token.registerNonXS("IsSpace");
909
910                 Token isword = Token.createRange();
911                 isword.mergeRanges(isalnum); // Lu Ll Lo Nd
912
isword.addRange('_', '_');
913                 Token.categories.put("IsWord", isword);
914                 Token.categories2.put("IsWord", Token.complementRanges(isword));
915                 Token.registerNonXS("IsWord");
916
917                 Token isascii = Token.createRange();
918                 isascii.addRange(0, 127);
919                 Token.categories.put("IsASCII", isascii);
920                 Token.categories2.put("IsASCII", Token.complementRanges(isascii));
921                 Token.registerNonXS("IsASCII");
922
923                 Token isnotgraph = Token.createRange();
924                 isnotgraph.mergeRanges(ranges[CHAR_OTHER]);
925                 isnotgraph.addRange(' ', ' ');
926                 Token.categories.put("IsGraph", Token.complementRanges(isnotgraph));
927                 Token.categories2.put("IsGraph", isnotgraph);
928                 Token.registerNonXS("IsGraph");
929
930                 Token isxdigit = Token.createRange();
931                 isxdigit.addRange('0', '9');
932                 isxdigit.addRange('A', 'F');
933                 isxdigit.addRange('a', 'f');
934                 Token.categories.put("IsXDigit", Token.complementRanges(isxdigit));
935                 Token.categories2.put("IsXDigit", isxdigit);
936                 Token.registerNonXS("IsXDigit");
937
938                 Token.setAlias("IsDigit", "Nd", true);
939                 Token.setAlias("IsUpper", "Lu", true);
940                 Token.setAlias("IsLower", "Ll", true);
941                 Token.setAlias("IsCntrl", "C", true);
942                 Token.setAlias("IsPrint", "C", false);
943                 Token.setAlias("IsPunct", "P", true);
944                 Token.registerNonXS("IsDigit");
945                 Token.registerNonXS("IsUpper");
946                 Token.registerNonXS("IsLower");
947                 Token.registerNonXS("IsCntrl");
948                 Token.registerNonXS("IsPrint");
949                 Token.registerNonXS("IsPunct");
950
951                 Token.setAlias("alpha", "IsAlpha", true);
952                 Token.setAlias("alnum", "IsAlnum", true);
953                 Token.setAlias("ascii", "IsASCII", true);
954                 Token.setAlias("cntrl", "IsCntrl", true);
955                 Token.setAlias("digit", "IsDigit", true);
956                 Token.setAlias("graph", "IsGraph", true);
957                 Token.setAlias("lower", "IsLower", true);
958                 Token.setAlias("print", "IsPrint", true);
959                 Token.setAlias("punct", "IsPunct", true);
960                 Token.setAlias("space", "IsSpace", true);
961                 Token.setAlias("upper", "IsUpper", true);
962                 Token.setAlias("word", "IsWord", true); // Perl extension
963
Token.setAlias("xdigit", "IsXDigit", true);
964                 Token.registerNonXS("alpha");
965                 Token.registerNonXS("alnum");
966                 Token.registerNonXS("ascii");
967                 Token.registerNonXS("cntrl");
968                 Token.registerNonXS("digit");
969                 Token.registerNonXS("graph");
970                 Token.registerNonXS("lower");
971                 Token.registerNonXS("print");
972                 Token.registerNonXS("punct");
973                 Token.registerNonXS("space");
974                 Token.registerNonXS("upper");
975                 Token.registerNonXS("word");
976                 Token.registerNonXS("xdigit");
977             } // synchronized
978
} // if null
979
RangeToken tok = positive ? (RangeToken)Token.categories.get(name)
980             : (RangeToken)Token.categories2.get(name);
981         //if (tok == null) System.out.println(name);
982
return tok;
983     }
984     static protected RangeToken getRange(String JavaDoc name, boolean positive, boolean xs) {
985         RangeToken range = Token.getRange(name, positive);
986         if (xs && range != null && Token.isRegisterNonXS(name))
987             range = null;
988         return range;
989     }
990
991     static Hashtable JavaDoc nonxs = null;
992     /**
993      * This method is called by only getRange().
994      * So this method need not MT-safe.
995      */

996     static protected void registerNonXS(String JavaDoc name) {
997         if (Token.nonxs == null)
998             Token.nonxs = new Hashtable JavaDoc();
999         Token.nonxs.put(name, name);
1000    }
1001    static protected boolean isRegisterNonXS(String JavaDoc name) {
1002        if (Token.nonxs == null)
1003            return false;
1004        //DEBUG
1005
//System.err.println("isRegisterNonXS: "+name);
1006
return Token.nonxs.containsKey(name);
1007    }
1008
1009    private static void setAlias(String JavaDoc newName, String JavaDoc name, boolean positive) {
1010        Token t1 = (Token)Token.categories.get(name);
1011        Token t2 = (Token)Token.categories2.get(name);
1012        if (positive) {
1013            Token.categories.put(newName, t1);
1014            Token.categories2.put(newName, t2);
1015        } else {
1016            Token.categories2.put(newName, t1);
1017            Token.categories.put(newName, t2);
1018        }
1019    }
1020
1021    // ------------------------------------------------------
1022

1023    static final String JavaDoc viramaString =
1024    "\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1025
+"\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1026
+"\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1027
+"\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1028
+"\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1029
+"\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1030
+"\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1031
+"\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1032
+"\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1033
+"\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI VOWEL SIGN PHINTHU;;;;
1034
+"\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;;
1035

1036    static private Token token_grapheme = null;
1037    static synchronized Token getGraphemePattern() {
1038        if (Token.token_grapheme != null)
1039            return Token.token_grapheme;
1040
1041        Token base_char = Token.createRange(); // [{ASSIGNED}]-[{M},{C}]
1042
base_char.mergeRanges(Token.getRange("ASSIGNED", true));
1043        base_char.subtractRanges(Token.getRange("M", true));
1044        base_char.subtractRanges(Token.getRange("C", true));
1045
1046        Token virama = Token.createRange();
1047        for (int i = 0; i < Token.viramaString.length(); i ++) {
1048            int ch = viramaString.charAt(i);
1049            virama.addRange(i, i);
1050        }
1051
1052        Token combiner_wo_virama = Token.createRange();
1053        combiner_wo_virama.mergeRanges(Token.getRange("M", true));
1054        combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final
1055
combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras
1056

1057        Token left = Token.createUnion(); // base_char?
1058
left.addChild(base_char);
1059        left.addChild(Token.token_empty);
1060
1061        Token foo = Token.createUnion();
1062        foo.addChild(Token.createConcat(virama, Token.getRange("L", true)));
1063        foo.addChild(combiner_wo_virama);
1064
1065        foo = Token.createClosure(foo);
1066
1067        foo = Token.createConcat(left, foo);
1068
1069        Token.token_grapheme = foo;
1070        return Token.token_grapheme;
1071    }
1072
1073    /**
1074     * Combing Character Sequence in Perl 5.6.
1075     */

1076    static private Token token_ccs = null;
1077    static synchronized Token getCombiningCharacterSequence() {
1078        if (Token.token_ccs != null)
1079            return Token.token_ccs;
1080
1081        Token foo = Token.createClosure(Token.getRange("M", true)); // \pM*
1082
foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM*
1083
Token.token_ccs = foo;
1084        return Token.token_ccs;
1085    }
1086
1087    // ------------------------------------------------------
1088

1089    // ------------------------------------------------------
1090
/**
1091     * This class represents a node in parse tree.
1092     */

1093    static class StringToken extends Token implements java.io.Serializable JavaDoc {
1094
1095        private static final long serialVersionUID = 3257288015452780086L;
1096        
1097        String JavaDoc string;
1098        int refNumber;
1099
1100        StringToken(int type, String JavaDoc str, int n) {
1101            super(type);
1102            this.string = str;
1103            this.refNumber = n;
1104        }
1105
1106        int getReferenceNumber() { // for STRING
1107
return this.refNumber;
1108        }
1109        String JavaDoc getString() { // for STRING
1110
return this.string;
1111        }
1112        
1113        public String JavaDoc toString(int options) {
1114            if (this.type == BACKREFERENCE)
1115                return "\\"+this.refNumber;
1116            else
1117                return REUtil.quoteMeta(this.string);
1118        }
1119    }
1120
1121    /**
1122     * This class represents a node in parse tree.
1123     */

1124    static class ConcatToken extends Token implements java.io.Serializable JavaDoc {
1125
1126        private static final long serialVersionUID = 4050760502994940212L;
1127        
1128        Token child;
1129        Token child2;
1130        
1131        ConcatToken(Token t1, Token t2) {
1132            super(Token.CONCAT);
1133            this.child = t1;
1134            this.child2 = t2;
1135        }
1136
1137        int size() {
1138            return 2;
1139        }
1140        Token getChild(int index) {
1141            return index == 0 ? this.child : this.child2;
1142        }
1143
1144        public String JavaDoc toString(int options) {
1145            String JavaDoc ret;
1146            if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) {
1147                ret = this.child.toString(options)+"+";
1148            } else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) {
1149                ret = this.child.toString(options)+"+?";
1150            } else
1151                ret = this.child.toString(options)+this.child2.toString(options);
1152            return ret;
1153        }
1154    }
1155
1156    /**
1157     * This class represents a node in parse tree.
1158     */

1159    static class CharToken extends Token implements java.io.Serializable JavaDoc {
1160
1161        private static final long serialVersionUID = 3257284751277569842L;
1162        
1163        int chardata;
1164
1165        CharToken(int type, int ch) {
1166            super(type);
1167            this.chardata = ch;
1168        }
1169
1170        int getChar() {
1171            return this.chardata;
1172        }
1173
1174        public String JavaDoc toString(int options) {
1175            String JavaDoc ret;
1176            switch (this.type) {
1177              case CHAR:
1178                switch (this.chardata) {
1179                  case '|': case '*': case '+': case '?':
1180                  case '(': case ')': case '.': case '[':
1181                  case '{': case '\\':
1182                    ret = "\\"+(char)this.chardata;
1183                    break;
1184                  case '\f': ret = "\\f"; break;
1185                  case '\n': ret = "\\n"; break;
1186                  case '\r': ret = "\\r"; break;
1187                  case '\t': ret = "\\t"; break;
1188                  case 0x1b: ret = "\\e"; break;
1189                    //case 0x0b: ret = "\\v"; break;
1190
default:
1191                    if (this.chardata >= 0x10000) {
1192                        String JavaDoc pre = "0"+Integer.toHexString(this.chardata);
1193                        ret = "\\v"+pre.substring(pre.length()-6, pre.length());
1194                    } else
1195                        ret = ""+(char)this.chardata;
1196                }
1197                break;
1198
1199              case ANCHOR:
1200                if (this == Token.token_linebeginning || this == Token.token_lineend)
1201                    ret = ""+(char)this.chardata;
1202                else
1203                    ret = "\\"+(char)this.chardata;
1204                break;
1205
1206              default:
1207                ret = null;
1208            }
1209            return ret;
1210        }
1211
1212        boolean match(int ch) {
1213            if (this.type == CHAR) {
1214                return ch == this.chardata;
1215            } else
1216                throw new RuntimeException JavaDoc("NFAArrow#match(): Internal error: "+this.type);
1217        }
1218    }
1219
1220    /**
1221     * This class represents a node in parse tree.
1222     */

1223    static class ClosureToken extends Token implements java.io.Serializable JavaDoc {
1224
1225        private static final long serialVersionUID = 3545230349706932537L;
1226        
1227        int min;
1228        int max;
1229        Token child;
1230
1231        ClosureToken(int type, Token tok) {
1232            super(type);
1233            this.child = tok;
1234            this.setMin(-1);
1235            this.setMax(-1);
1236        }
1237
1238        int size() {
1239            return 1;
1240        }
1241        Token getChild(int index) {
1242            return this.child;
1243        }
1244
1245        final void setMin(int min) {
1246            this.min = min;
1247        }
1248        final void setMax(int max) {
1249            this.max = max;
1250        }
1251        final int getMin() {
1252            return this.min;
1253        }
1254        final int getMax() {
1255            return this.max;
1256        }
1257
1258        public String JavaDoc toString(int options) {
1259            String JavaDoc ret;
1260            if (this.type == CLOSURE) {
1261                if (this.getMin() < 0 && this.getMax() < 0) {
1262                    ret = this.child.toString(options)+"*";
1263                } else if (this.getMin() == this.getMax()) {
1264                    ret = this.child.toString(options)+"{"+this.getMin()+"}";
1265                } else if (this.getMin() >= 0 && this.getMax() >= 0) {
1266                    ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}";
1267                } else if (this.getMin() >= 0 && this.getMax() < 0) {
1268                    ret = this.child.toString(options)+"{"+this.getMin()+",}";
1269                } else
1270                    throw new RuntimeException JavaDoc("Token#toString(): CLOSURE "
1271                                               +this.getMin()+", "+this.getMax());
1272            } else {
1273                if (this.getMin() < 0 && this.getMax() < 0) {
1274                    ret = this.child.toString(options)+"*?";
1275                } else if (this.getMin() == this.getMax()) {
1276                    ret = this.child.toString(options)+"{"+this.getMin()+"}?";
1277                } else if (this.getMin() >= 0 && this.getMax() >= 0) {
1278                    ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?";
1279                } else if (this.getMin() >= 0 && this.getMax() < 0) {
1280                    ret = this.child.toString(options)+"{"+this.getMin()+",}?";
1281                } else
1282                    throw new RuntimeException JavaDoc("Token#toString(): NONGREEDYCLOSURE "
1283                                               +this.getMin()+", "+this.getMax());
1284            }
1285            return ret;
1286        }
1287    }
1288
1289    /**
1290     * This class represents a node in parse tree.
1291     */

1292    static class ParenToken extends Token implements java.io.Serializable JavaDoc {
1293
1294        private static final long serialVersionUID = 3257572797621219636L;
1295        
1296        Token child;
1297        int parennumber;
1298
1299        ParenToken(int type, Token tok, int paren) {
1300            super(type);
1301            this.child = tok;
1302            this.parennumber = paren;
1303        }
1304
1305        int size() {
1306            return 1;
1307        }
1308        Token getChild(int index) {
1309            return this.child;
1310        }
1311
1312        int getParenNumber() {
1313            return this.parennumber;
1314        }
1315
1316        public String JavaDoc toString(int options) {
1317            String JavaDoc ret = null;
1318            switch (this.type) {
1319              case PAREN:
1320                if (this.parennumber == 0) {
1321                    ret = "(?:"+this.child.toString(options)+")";
1322                } else {
1323                    ret = "("+this.child.toString(options)+")";
1324                }
1325                break;
1326
1327              case LOOKAHEAD:
1328                ret = "(?="+this.child.toString(options)+")";
1329                break;
1330              case NEGATIVELOOKAHEAD:
1331                ret = "(?!"+this.child.toString(options)+")";
1332                break;
1333              case LOOKBEHIND:
1334                ret = "(?<="+this.child.toString(options)+")";
1335                break;
1336              case NEGATIVELOOKBEHIND:
1337                ret = "(?<!"+this.child.toString(options)+")";
1338                break;
1339              case INDEPENDENT:
1340                ret = "(?>"+this.child.toString(options)+")";
1341                break;
1342            }
1343            return ret;
1344        }
1345    }
1346
1347    /**
1348     * (?(condition)yes-pattern|no-pattern)
1349     */

1350    static class ConditionToken extends Token implements java.io.Serializable JavaDoc {
1351
1352        private static final long serialVersionUID = 3761408607870399794L;
1353        
1354        int refNumber;
1355        Token condition;
1356        Token yes;
1357        Token no;
1358        ConditionToken(int refno, Token cond, Token yespat, Token nopat) {
1359            super(Token.CONDITION);
1360            this.refNumber = refno;
1361            this.condition = cond;
1362            this.yes = yespat;
1363            this.no = nopat;
1364        }
1365        int size() {
1366            return this.no == null ? 1 : 2;
1367        }
1368        Token getChild(int index) {
1369            if (index == 0) return this.yes;
1370            if (index == 1) return this.no;
1371            throw new RuntimeException JavaDoc("Internal Error: "+index);
1372        }
1373
1374        public String JavaDoc toString(int options) {
1375            String JavaDoc ret;
1376            if (refNumber > 0) {
1377                ret = "(?("+refNumber+")";
1378            } else if (this.condition.type == Token.ANCHOR) {
1379                ret = "(?("+this.condition+")";
1380            } else {
1381                ret = "(?"+this.condition;
1382            }
1383
1384            if (this.no == null) {
1385                ret += this.yes+")";
1386            } else {
1387                ret += this.yes+"|"+this.no+")";
1388            }
1389            return ret;
1390        }
1391    }
1392
1393    /**
1394     * (ims-ims: .... )
1395     */

1396    static class ModifierToken extends Token implements java.io.Serializable JavaDoc {
1397
1398        private static final long serialVersionUID = 3258689892778324790L;
1399        
1400        Token child;
1401        int add;
1402        int mask;
1403
1404        ModifierToken(Token tok, int add, int mask) {
1405            super(Token.MODIFIERGROUP);
1406            this.child = tok;
1407            this.add = add;
1408            this.mask = mask;
1409        }
1410
1411        int size() {
1412            return 1;
1413        }
1414        Token getChild(int index) {
1415            return this.child;
1416        }
1417
1418        int getOptions() {
1419            return this.add;
1420        }
1421        int getOptionsMask() {
1422            return this.mask;
1423        }
1424
1425        public String JavaDoc toString(int options) {
1426            return "(?"
1427                +(this.add == 0 ? "" : REUtil.createOptionString(this.add))
1428                +(this.mask == 0 ? "" : REUtil.createOptionString(this.mask))
1429                +":"
1430                +this.child.toString(options)
1431                +")";
1432        }
1433    }
1434
1435    /**
1436     * This class represents a node in parse tree.
1437     * for UNION or CONCAT.
1438     */

1439    static class UnionToken extends Token implements java.io.Serializable JavaDoc {
1440
1441        private static final long serialVersionUID = 3256723987530003507L;
1442        
1443        Vector JavaDoc children;
1444
1445        UnionToken(int type) {
1446            super(type);
1447        }
1448
1449        void addChild(Token tok) {
1450            if (tok == null) return;
1451            if (this.children == null) this.children = new Vector JavaDoc();
1452            if (this.type == UNION) {
1453                this.children.addElement(tok);
1454                return;
1455            }
1456                                                // This is CONCAT, and new child is CONCAT.
1457
if (tok.type == CONCAT) {
1458                for (int i = 0; i < tok.size(); i ++)
1459                    this.addChild(tok.getChild(i)); // Recursion
1460
return;
1461            }
1462            int size = this.children.size();
1463            if (size == 0) {
1464                this.children.addElement(tok);
1465                return;
1466            }
1467            Token previous = (Token)this.children.elementAt(size-1);
1468            if (!((previous.type == CHAR || previous.type == STRING)
1469                  && (tok.type == CHAR || tok.type == STRING))) {
1470                this.children.addElement(tok);
1471                return;
1472            }
1473            
1474            //System.err.println("Merge '"+previous+"' and '"+tok+"'.");
1475

1476            StringBuffer JavaDoc buffer;
1477            int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString().length());
1478            if (previous.type == CHAR) { // Replace previous token by STRING
1479
buffer = new StringBuffer JavaDoc(2 + nextMaxLength);
1480                int ch = previous.getChar();
1481                if (ch >= 0x10000)
1482                    buffer.append(REUtil.decomposeToSurrogates(ch));
1483                else
1484                    buffer.append((char)ch);
1485                previous = Token.createString(null);
1486                this.children.setElementAt(previous, size-1);
1487            } else { // STRING
1488
buffer = new StringBuffer JavaDoc(previous.getString().length() + nextMaxLength);
1489                buffer.append(previous.getString());
1490            }
1491
1492            if (tok.type == CHAR) {
1493                int ch = tok.getChar();
1494                if (ch >= 0x10000)
1495                    buffer.append(REUtil.decomposeToSurrogates(ch));
1496                else
1497                    buffer.append((char)ch);
1498            } else {
1499                buffer.append(tok.getString());
1500            }
1501
1502            ((StringToken)previous).string = new String JavaDoc(buffer);
1503        }
1504
1505        int size() {
1506            return this.children == null ? 0 : this.children.size();
1507        }
1508        Token getChild(int index) {
1509            return (Token)this.children.elementAt(index);
1510        }
1511
1512        public String JavaDoc toString(int options) {
1513            String JavaDoc ret;
1514            if (this.type == CONCAT) {
1515                if (this.children.size() == 2) {
1516                    Token ch = this.getChild(0);
1517                    Token ch2 = this.getChild(1);
1518                    if (ch2.type == CLOSURE && ch2.getChild(0) == ch) {
1519                        ret = ch.toString(options)+"+";
1520                    } else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) {
1521                        ret = ch.toString(options)+"+?";
1522                    } else
1523                        ret = ch.toString(options)+ch2.toString(options);
1524                } else {
1525                    StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
1526                    for (int i = 0; i < this.children.size(); i ++) {
1527                        sb.append(((Token)this.children.elementAt(i)).toString(options));
1528                    }
1529                    ret = new String JavaDoc(sb);
1530                }
1531                return ret;
1532            }
1533            if (this.children.size() == 2 && this.getChild(1).type == EMPTY) {
1534                ret = this.getChild(0).toString(options)+"?";
1535            } else if (this.children.size() == 2
1536                       && this.getChild(0).type == EMPTY) {
1537                ret = this.getChild(1).toString(options)+"??";
1538            } else {
1539                StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
1540                sb.append(((Token)this.children.elementAt(0)).toString(options));
1541                for (int i = 1; i < this.children.size(); i ++) {
1542                    sb.append((char)'|');
1543                    sb.append(((Token)this.children.elementAt(i)).toString(options));
1544                }
1545                ret = new String JavaDoc(sb);
1546            }
1547            return ret;
1548        }
1549    }
1550}
1551
Popular Tags