KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > enhydra > apache > xerces > utils > regex > Token


1 /*
2  * The Apache Software License, Version 1.1
3  *
4  *
5  * Copyright (c) 1999,2000 The Apache Software Foundation. All rights
6  * reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  * notice, this list of conditions and the following disclaimer.
14  *
15  * 2. Redistributions in binary form must reproduce the above copyright
16  * notice, this list of conditions and the following disclaimer in
17  * the documentation and/or other materials provided with the
18  * distribution.
19  *
20  * 3. The end-user documentation included with the redistribution,
21  * if any, must include the following acknowledgment:
22  * "This product includes software developed by the
23  * Apache Software Foundation (http://www.apache.org/)."
24  * Alternately, this acknowledgment may appear in the software itself,
25  * if and wherever such third-party acknowledgments normally appear.
26  *
27  * 4. The names "Xerces" and "Apache Software Foundation" must
28  * not be used to endorse or promote products derived from this
29  * software without prior written permission. For written
30  * permission, please contact apache@apache.org.
31  *
32  * 5. Products derived from this software may not be called "Apache",
33  * nor may "Apache" appear in their name, without prior written
34  * permission of the Apache Software Foundation.
35  *
36  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
37  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
39  * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
40  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
42  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
43  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
44  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
45  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
46  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
47  * SUCH DAMAGE.
48  * ====================================================================
49  *
50  * This software consists of voluntary contributions made by many
51  * individuals on behalf of the Apache Software Foundation and was
52  * originally based on software copyright (c) 1999, International
53  * Business Machines, Inc., http://www.apache.org. For more
54  * information on the Apache Software Foundation, please see
55  * <http://www.apache.org/>.
56  */

57
58 package org.enhydra.apache.xerces.utils.regex;
59
60 import java.util.Hashtable JavaDoc;
61 import java.util.Vector JavaDoc;
62
/**
 * This class represents a node in the parse tree of a regular expression.
 */

66 class Token implements java.io.Serializable JavaDoc {
67     static final boolean COUNTTOKENS = true;
68     static int tokens = 0;
69
70     static final int CHAR = 0; // Literal char
71
static final int DOT = 11; // .
72
static final int CONCAT = 1; // XY
73
static final int UNION = 2; // X|Y|Z
74
static final int CLOSURE = 3; // X*
75
static final int RANGE = 4; // [a-zA-Z] etc.
76
static final int NRANGE = 5; // [^a-zA-Z] etc.
77
static final int PAREN = 6; // (X) or (?:X)
78
static final int EMPTY = 7; //
79
static final int ANCHOR = 8; // ^ $ \b \B \< \> \A \Z \z
80
static final int NONGREEDYCLOSURE = 9; // *? +?
81
static final int STRING = 10; // strings
82
static final int BACKREFERENCE = 12; // back references
83
static final int LOOKAHEAD = 20; // (?=...)
84
static final int NEGATIVELOOKAHEAD = 21; // (?!...)
85
static final int LOOKBEHIND = 22; // (?<=...)
86
static final int NEGATIVELOOKBEHIND = 23; // (?<!...)
87
static final int INDEPENDENT = 24; // (?>...)
88
static final int MODIFIERGROUP = 25; // (?ims-ims:...)
89
static final int CONDITION = 26; // (?(...)yes|no)
90

91     static final int UTF16_MAX = 0x10ffff;
92
93     int type;
94
95     static protected Token token_dot;
96     static protected Token token_0to9;
97     static protected Token token_wordchars;
98     static protected Token token_not_0to9;
99     static protected Token token_not_wordchars;
100     static protected Token token_spaces;
101     static protected Token token_not_spaces;
102     static protected Token token_empty;
103     static protected Token token_linebeginning;
104     static protected Token token_linebeginning2;
105     static protected Token token_lineend;
106     static protected Token token_stringbeginning;
107     static protected Token token_stringend;
108     static protected Token token_stringend2;
109     static protected Token token_wordedge;
110     static protected Token token_not_wordedge;
111     static protected Token token_wordbeginning;
112     static protected Token token_wordend;
113     static {
114         Token.token_empty = new Token(Token.EMPTY);
115
116         Token.token_linebeginning = Token.createAnchor('^');
117         Token.token_linebeginning2 = Token.createAnchor('@');
118         Token.token_lineend = Token.createAnchor('$');
119         Token.token_stringbeginning = Token.createAnchor('A');
120         Token.token_stringend = Token.createAnchor('z');
121         Token.token_stringend2 = Token.createAnchor('Z');
122         Token.token_wordedge = Token.createAnchor('b');
123         Token.token_not_wordedge = Token.createAnchor('B');
124         Token.token_wordbeginning = Token.createAnchor('<');
125         Token.token_wordend = Token.createAnchor('>');
126
127         Token.token_dot = new Token(Token.DOT);
128
129         Token.token_0to9 = Token.createRange();
130         Token.token_0to9.addRange('0', '9');
131         Token.token_wordchars = Token.createRange();
132         Token.token_wordchars.addRange('0', '9');
133         Token.token_wordchars.addRange('A', 'Z');
134         Token.token_wordchars.addRange('_', '_');
135         Token.token_wordchars.addRange('a', 'z');
136         Token.token_spaces = Token.createRange();
137         Token.token_spaces.addRange('\t', '\t');
138         Token.token_spaces.addRange('\n', '\n');
139         Token.token_spaces.addRange('\f', '\f');
140         Token.token_spaces.addRange('\r', '\r');
141         Token.token_spaces.addRange(' ', ' ');
142
143         Token.token_not_0to9 = Token.complementRanges(Token.token_0to9);
144         Token.token_not_wordchars = Token.complementRanges(Token.token_wordchars);
145         Token.token_not_spaces = Token.complementRanges(Token.token_spaces);
146     }
147
148     static Token.ParenToken createLook(int type, Token child) {
149         if (COUNTTOKENS) Token.tokens ++;
150         return new Token.ParenToken(type, child, 0);
151     }
152     static Token.ParenToken createParen(Token child, int pnumber) {
153         if (COUNTTOKENS) Token.tokens ++;
154         return new Token.ParenToken(Token.PAREN, child, pnumber);
155     }
156     static Token.ClosureToken createClosure(Token tok) {
157         if (COUNTTOKENS) Token.tokens ++;
158         return new Token.ClosureToken(Token.CLOSURE, tok);
159     }
160     static Token.ClosureToken createNGClosure(Token tok) {
161         if (COUNTTOKENS) Token.tokens ++;
162         return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok);
163     }
164     static Token.ConcatToken createConcat(Token tok1, Token tok2) {
165         if (COUNTTOKENS) Token.tokens ++;
166         return new Token.ConcatToken(tok1, tok2);
167     }
168     static Token.UnionToken createConcat() {
169         if (COUNTTOKENS) Token.tokens ++;
170         return new Token.UnionToken(Token.CONCAT); // *** It is not a bug.
171
}
172     static Token.UnionToken createUnion() {
173         if (COUNTTOKENS) Token.tokens ++;
174         return new Token.UnionToken(Token.UNION);
175     }
176     static Token createEmpty() {
177         return Token.token_empty;
178     }
179     static RangeToken createRange() {
180         if (COUNTTOKENS) Token.tokens ++;
181         return new RangeToken(Token.RANGE);
182     }
183     static RangeToken createNRange() {
184         if (COUNTTOKENS) Token.tokens ++;
185         return new RangeToken(Token.NRANGE);
186     }
187     static Token.CharToken createChar(int ch) {
188         if (COUNTTOKENS) Token.tokens ++;
189         return new Token.CharToken(Token.CHAR, ch);
190     }
191     static private Token.CharToken createAnchor(int ch) {
192         if (COUNTTOKENS) Token.tokens ++;
193         return new Token.CharToken(Token.ANCHOR, ch);
194     }
195     static Token.StringToken createBackReference(int refno) {
196         if (COUNTTOKENS) Token.tokens ++;
197         return new Token.StringToken(Token.BACKREFERENCE, null, refno);
198     }
199     static Token.StringToken createString(String JavaDoc str) {
200         if (COUNTTOKENS) Token.tokens ++;
201         return new Token.StringToken(Token.STRING, str, 0);
202     }
203     static Token.ModifierToken createModifierGroup(Token child, int add, int mask) {
204         if (COUNTTOKENS) Token.tokens ++;
205         return new Token.ModifierToken(child, add, mask);
206     }
207     static Token.ConditionToken createCondition(int refno, Token condition,
208                                                 Token yespat, Token nopat) {
209         if (COUNTTOKENS) Token.tokens ++;
210         return new Token.ConditionToken(refno, condition, yespat, nopat);
211     }
212
/**
 * Creates a token of the given node type.
 *
 * @param type value stored in the {@code type} field
 */
protected Token(int type) {
    super();
    this.type = type;
}
216
217     /**
218      * A number of children.
219      */

220     int size() {
221         return 0;
222     }
223     Token getChild(int index) {
224         return null;
225     }
226     void addChild(Token tok) {
227         throw new RuntimeException JavaDoc("Not supported.");
228     }
229
230                                                 // for RANGE or NRANGE
231
protected void addRange(int start, int end) {
232         throw new RuntimeException JavaDoc("Not supported.");
233     }
234     protected void sortRanges() {
235         throw new RuntimeException JavaDoc("Not supported.");
236     }
237     protected void compactRanges() {
238         throw new RuntimeException JavaDoc("Not supported.");
239     }
240     protected void mergeRanges(Token tok) {
241         throw new RuntimeException JavaDoc("Not supported.");
242     }
243     protected void subtractRanges(Token tok) {
244         throw new RuntimeException JavaDoc("Not supported.");
245     }
246     protected void intersectRanges(Token tok) {
247         throw new RuntimeException JavaDoc("Not supported.");
248     }
249     static Token complementRanges(Token tok) {
250         return RangeToken.complementRanges(tok);
251     }
252
253
254     void setMin(int min) { // for CLOSURE
255
}
256     void setMax(int max) { // for CLOSURE
257
}
258     int getMin() { // for CLOSURE
259
return -1;
260     }
261     int getMax() { // for CLOSURE
262
return -1;
263     }
264     int getReferenceNumber() { // for STRING
265
return 0;
266     }
267     String JavaDoc getString() { // for STRING
268
return null;
269     }
270
271     int getParenNumber() {
272         return 0;
273     }
274     int getChar() {
275         return -1;
276     }
277
278     public String JavaDoc toString() {
279         return this.toString(0);
280     }
281     public String JavaDoc toString(int options) {
282         return this.type == Token.DOT ? "." : "";
283     }
284
285     /**
286      * How many characters are needed?
287      */

288     final int getMinLength() {
289         switch (this.type) {
290           case CONCAT:
291             int sum = 0;
292             for (int i = 0; i < this.size(); i ++)
293                 sum += this.getChild(i).getMinLength();
294             return sum;
295
296           case CONDITION:
297           case UNION:
298             if (this.size() == 0)
299                 return 0;
300             int ret = this.getChild(0).getMinLength();
301             for (int i = 1; i < this.size(); i ++) {
302                 int min = this.getChild(i).getMinLength();
303                 if (min < ret) ret = min;
304             }
305             return ret;
306
307           case CLOSURE:
308           case NONGREEDYCLOSURE:
309             if (this.getMin() >= 0)
310                 return this.getMin() * this.getChild(0).getMinLength();
311             return 0;
312
313           case EMPTY:
314           case ANCHOR:
315             return 0;
316
317           case DOT:
318           case CHAR:
319           case RANGE:
320           case NRANGE:
321             return 1;
322
323           case INDEPENDENT:
324           case PAREN:
325           case MODIFIERGROUP:
326             return this.getChild(0).getMinLength();
327
328           case BACKREFERENCE:
329             return 0; // *******
330

331           case STRING:
332             return this.getString().length();
333
334           case LOOKAHEAD:
335           case NEGATIVELOOKAHEAD:
336           case LOOKBEHIND:
337           case NEGATIVELOOKBEHIND:
338             return 0; // ***** Really?
339

340           default:
341             throw new RuntimeException JavaDoc("Token#getMinLength(): Invalid Type: "+this.type);
342         }
343     }
344
345     final int getMaxLength() {
346         switch (this.type) {
347           case CONCAT:
348             int sum = 0;
349             for (int i = 0; i < this.size(); i ++) {
350                 int d = this.getChild(i).getMaxLength();
351                 if (d < 0) return -1;
352                 sum += d;
353             }
354             return sum;
355
356           case CONDITION:
357           case UNION:
358             if (this.size() == 0)
359                 return 0;
360             int ret = this.getChild(0).getMaxLength();
361             for (int i = 1; ret >= 0 && i < this.size(); i ++) {
362                 int max = this.getChild(i).getMaxLength();
363                 if (max < 0) { // infinity
364
ret = -1;
365                     break;
366                 }
367                 if (max > ret) ret = max;
368             }
369             return ret;
370
371           case CLOSURE:
372           case NONGREEDYCLOSURE:
373             if (this.getMax() >= 0)
374                                                 // When this.child.getMaxLength() < 0,
375
// this returns minus value
376
return this.getMax() * this.getChild(0).getMaxLength();
377             return -1;
378
379           case EMPTY:
380           case ANCHOR:
381             return 0;
382
383           case CHAR:
384             return 1;
385           case DOT:
386           case RANGE:
387           case NRANGE:
388             return 2;
389
390           case INDEPENDENT:
391           case PAREN:
392           case MODIFIERGROUP:
393             return this.getChild(0).getMaxLength();
394
395           case BACKREFERENCE:
396             return -1; // ******
397

398           case STRING:
399             return this.getString().length();
400
401           case LOOKAHEAD:
402           case NEGATIVELOOKAHEAD:
403           case LOOKBEHIND:
404           case NEGATIVELOOKBEHIND:
405             return 0; // ***** Really?
406

407           default:
408             throw new RuntimeException JavaDoc("Token#getMaxLength(): Invalid Type: "+this.type);
409         }
410     }
411
412     static final int FC_CONTINUE = 0;
413     static final int FC_TERMINAL = 1;
414     static final int FC_ANY = 2;
415     private static final boolean isSet(int options, int flag) {
416         return (options & flag) == flag;
417     }
418     final int analyzeFirstCharacter(RangeToken result, int options) {
419         switch (this.type) {
420           case CONCAT:
421             int ret = FC_CONTINUE;
422             for (int i = 0; i < this.size(); i ++)
423                 if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE)
424                     break;
425             return ret;
426
427           case UNION:
428             if (this.size() == 0)
429                 return FC_CONTINUE;
430             /*
431              * a|b|c -> FC_TERMINAL
432              * a|.|c -> FC_ANY
433              * a|b| -> FC_CONTINUE
434              */

435             int ret2 = FC_CONTINUE;
436             boolean hasEmpty = false;
437             for (int i = 0; i < this.size(); i ++) {
438                 ret2 = this.getChild(i).analyzeFirstCharacter(result, options);
439                 if (ret2 == FC_ANY)
440                     break;
441                 else if (ret2 == FC_CONTINUE)
442                     hasEmpty = true;
443             }
444             return hasEmpty ? FC_CONTINUE : ret2;
445
446           case CONDITION:
447             int ret3 = this.getChild(0).analyzeFirstCharacter(result, options);
448             if (this.size() == 1) return FC_CONTINUE;
449             if (ret3 == FC_ANY) return ret3;
450             int ret4 = this.getChild(1).analyzeFirstCharacter(result, options);
451             if (ret4 == FC_ANY) return ret4;
452             return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL;
453
454           case CLOSURE:
455           case NONGREEDYCLOSURE:
456             this.getChild(0).analyzeFirstCharacter(result, options);
457             return FC_CONTINUE;
458
459           case EMPTY:
460           case ANCHOR:
461             return FC_CONTINUE;
462
463           case CHAR:
464             int ch = this.getChar();
465             result.addRange(ch, ch);
466             if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
467                 ch = Character.toUpperCase((char)ch);
468                 result.addRange(ch, ch);
469                 ch = Character.toLowerCase((char)ch);
470                 result.addRange(ch, ch);
471             }
472             return FC_TERMINAL;
473
474           case DOT: // ****
475
if (isSet(options, RegularExpression.SINGLE_LINE)) {
476                 return FC_CONTINUE; // **** We can not optimize.
477
} else {
478                 return FC_CONTINUE;
479                 /*
480                 result.addRange(0, RegularExpression.LINE_FEED-1);
481                 result.addRange(RegularExpression.LINE_FEED+1, RegularExpression.CARRIAGE_RETURN-1);
482                 result.addRange(RegularExpression.CARRIAGE_RETURN+1,
483                                 RegularExpression.LINE_SEPARATOR-1);
484                 result.addRange(RegularExpression.PARAGRAPH_SEPARATOR+1, UTF16_MAX);
485                 return 1;
486                 */

487             }
488
489           case RANGE:
490             if (isSet(options, RegularExpression.IGNORE_CASE)) {
491                 result.mergeRanges(((RangeToken)this).getCaseInsensitiveToken());
492             } else {
493                 result.mergeRanges(this);
494             }
495             return FC_TERMINAL;
496
497           case NRANGE: // ****
498
if (isSet(options, RegularExpression.IGNORE_CASE)) {
499                 result.mergeRanges(Token.complementRanges(((RangeToken)this).getCaseInsensitiveToken()));
500             } else {
501                 result.mergeRanges(Token.complementRanges(this));
502             }
503             return FC_TERMINAL;
504
505           case INDEPENDENT:
506           case PAREN:
507             return this.getChild(0).analyzeFirstCharacter(result, options);
508
509           case MODIFIERGROUP:
510             options |= ((ModifierToken)this).getOptions();
511             options &= ~((ModifierToken)this).getOptionsMask();
512             return this.getChild(0).analyzeFirstCharacter(result, options);
513
514           case BACKREFERENCE:
515             result.addRange(0, UTF16_MAX); // **** We can not optimize.
516
return FC_ANY;
517
518           case STRING:
519             int cha = this.getString().charAt(0);
520             int ch2;
521             if (REUtil.isHighSurrogate(cha)
522                 && this.getString().length() >= 2
523                 && REUtil.isLowSurrogate((ch2 = this.getString().charAt(1))))
524                 cha = REUtil.composeFromSurrogates(cha, ch2);
525             result.addRange(cha, cha);
526             if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
527                 cha = Character.toUpperCase((char)cha);
528                 result.addRange(cha, cha);
529                 cha = Character.toLowerCase((char)cha);
530                 result.addRange(cha, cha);
531             }
532             return FC_TERMINAL;
533
534           case LOOKAHEAD:
535           case NEGATIVELOOKAHEAD:
536           case LOOKBEHIND:
537           case NEGATIVELOOKBEHIND:
538             return FC_CONTINUE;
539
540           default:
541             throw new RuntimeException JavaDoc("Token#analyzeHeadCharacter(): Invalid Type: "+this.type);
542         }
543     }
544
545     private final boolean isShorterThan(Token tok) {
546         if (tok == null) return false;
547         /*
548         int mylength;
549         if (this.type == STRING) mylength = this.getString().length();
550         else if (this.type == CHAR) mylength = this.getChar() >= 0x10000 ? 2 : 1;
551         else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
552         int otherlength;
553         if (tok.type == STRING) otherlength = tok.getString().length();
554         else if (tok.type == CHAR) otherlength = tok.getChar() >= 0x10000 ? 2 : 1;
555         else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
556         */

557         int mylength;
558         if (this.type == STRING) mylength = this.getString().length();
559         else throw new RuntimeException JavaDoc("Internal Error: Illegal type: "+this.type);
560         int otherlength;
561         if (tok.type == STRING) otherlength = tok.getString().length();
562         else throw new RuntimeException JavaDoc("Internal Error: Illegal type: "+tok.type);
563         return mylength < otherlength;
564     }
565
566     static class FixedStringContainer {
567         Token token = null;
568         int options = 0;
569         FixedStringContainer() {
570         }
571     }
572
573     final void findFixedString(FixedStringContainer container, int options) {
574         switch (this.type) {
575           case CONCAT:
576             Token prevToken = null;
577             int prevOptions = 0;
578             for (int i = 0; i < this.size(); i ++) {
579                 this.getChild(i).findFixedString(container, options);
580                 if (prevToken == null || prevToken.isShorterThan(container.token)) {
581                     prevToken = container.token;
582                     prevOptions = container.options;
583                 }
584             }
585             container.token = prevToken;
586             container.options = prevOptions;
587             return;
588
589           case UNION:
590           case CLOSURE:
591           case NONGREEDYCLOSURE:
592           case EMPTY:
593           case ANCHOR:
594           case RANGE:
595           case DOT:
596           case NRANGE:
597           case BACKREFERENCE:
598           case LOOKAHEAD:
599           case NEGATIVELOOKAHEAD:
600           case LOOKBEHIND:
601           case NEGATIVELOOKBEHIND:
602           case CONDITION:
603             container.token = null;
604             return;
605
606           case CHAR: // Ignore CHAR tokens.
607
container.token = null; // **
608
return; // **
609

610           case STRING:
611             container.token = this;
612             container.options = options;
613             return;
614
615           case INDEPENDENT:
616           case PAREN:
617             this.getChild(0).findFixedString(container, options);
618             return;
619
620           case MODIFIERGROUP:
621             options |= ((ModifierToken)this).getOptions();
622             options &= ~((ModifierToken)this).getOptionsMask();
623             this.getChild(0).findFixedString(container, options);
624             return;
625
626           default:
627             throw new RuntimeException JavaDoc("Token#findFixedString(): Invalid Type: "+this.type);
628         }
629     }
630
631     boolean match(int ch) {
632         throw new RuntimeException JavaDoc("NFAArrow#match(): Internal error: "+this.type);
633     }
634
635     // ------------------------------------------------------
636
static protected Hashtable JavaDoc categories = new Hashtable JavaDoc();
637     static protected Hashtable JavaDoc categories2 = null;
638     static final String JavaDoc[] categoryNames = {
639         "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd",
640         "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs",
641         "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28
642
"Pi", "Pf", // 29, 30
643
"L", "M", "N", "Z", "C", "P", "S", // 31-37
644
};
645
646     // Schema Rec. {Datatypes} - Punctuation
647
static final int CHAR_INIT_QUOTE = 29; // Pi - initial quote
648
static final int CHAR_FINAL_QUOTE = 30; // Pf - final quote
649
static final int CHAR_LETTER = 31;
650     static final int CHAR_MARK = 32;
651     static final int CHAR_NUMBER = 33;
652     static final int CHAR_SEPARATOR = 34;
653     static final int CHAR_OTHER = 35;
654     static final int CHAR_PUNCTUATION = 36;
655     static final int CHAR_SYMBOL = 37;
656     
657     //blockNames in UNICODE 3.1 that supported by XML Schema REC
658
static final String JavaDoc[] blockNames = {
659         /*0000..007F;*/ "Basic Latin",
660         /*0080..00FF;*/ "Latin-1 Supplement",
661         /*0100..017F;*/ "Latin Extended-A",
662         /*0180..024F;*/ "Latin Extended-B",
663         /*0250..02AF;*/ "IPA Extensions",
664         /*02B0..02FF;*/ "Spacing Modifier Letters",
665         /*0300..036F;*/ "Combining Diacritical Marks",
666         /*0370..03FF;*/ "Greek",
667         /*0400..04FF;*/ "Cyrillic",
668         /*0530..058F;*/ "Armenian",
669         /*0590..05FF;*/ "Hebrew",
670         /*0600..06FF;*/ "Arabic",
671         /*0700..074F;*/ "Syriac",
672         /*0780..07BF;*/ "Thaana",
673         /*0900..097F;*/ "Devanagari",
674         /*0980..09FF;*/ "Bengali",
675         /*0A00..0A7F;*/ "Gurmukhi",
676         /*0A80..0AFF;*/ "Gujarati",
677         /*0B00..0B7F;*/ "Oriya",
678         /*0B80..0BFF;*/ "Tamil",
679         /*0C00..0C7F;*/ "Telugu",
680         /*0C80..0CFF;*/ "Kannada",
681         /*0D00..0D7F;*/ "Malayalam",
682         /*0D80..0DFF;*/ "Sinhala",
683         /*0E00..0E7F;*/ "Thai",
684         /*0E80..0EFF;*/ "Lao",
685         /*0F00..0FFF;*/ "Tibetan",
686         /*1000..109F;*/ "Myanmar",
687         /*10A0..10FF;*/ "Georgian",
688         /*1100..11FF;*/ "Hangul Jamo",
689         /*1200..137F;*/ "Ethiopic",
690         /*13A0..13FF;*/ "Cherokee",
691         /*1400..167F;*/ "Unified Canadian Aboriginal Syllabics",
692         /*1680..169F;*/ "Ogham",
693         /*16A0..16FF;*/ "Runic",
694         /*1780..17FF;*/ "Khmer",
695         /*1800..18AF;*/ "Mongolian",
696         /*1E00..1EFF;*/ "Latin Extended Additional",
697         /*1F00..1FFF;*/ "Greek Extended",
698         /*2000..206F;*/ "General Punctuation",
699         /*2070..209F;*/ "Superscripts and Subscripts",
700         /*20A0..20CF;*/ "Currency Symbols",
701         /*20D0..20FF;*/ "Combining Marks for Symbols",
702         /*2100..214F;*/ "Letterlike Symbols",
703         /*2150..218F;*/ "Number Forms",
704         /*2190..21FF;*/ "Arrows",
705         /*2200..22FF;*/ "Mathematical Operators",
706         /*2300..23FF;*/ "Miscellaneous Technical",
707         /*2400..243F;*/ "Control Pictures",
708         /*2440..245F;*/ "Optical Character Recognition",
709         /*2460..24FF;*/ "Enclosed Alphanumerics",
710         /*2500..257F;*/ "Box Drawing",
711         /*2580..259F;*/ "Block Elements",
712         /*25A0..25FF;*/ "Geometric Shapes",
713         /*2600..26FF;*/ "Miscellaneous Symbols",
714         /*2700..27BF;*/ "Dingbats",
715         /*2800..28FF;*/ "Braille Patterns",
716         /*2E80..2EFF;*/ "CJK Radicals Supplement",
717         /*2F00..2FDF;*/ "Kangxi Radicals",
718         /*2FF0..2FFF;*/ "Ideographic Description Characters",
719         /*3000..303F;*/ "CJK Symbols and Punctuation",
720         /*3040..309F;*/ "Hiragana",
721         /*30A0..30FF;*/ "Katakana",
722         /*3100..312F;*/ "Bopomofo",
723         /*3130..318F;*/ "Hangul Compatibility Jamo",
724         /*3190..319F;*/ "Kanbun",
725         /*31A0..31BF;*/ "Bopomofo Extended",
726         /*3200..32FF;*/ "Enclosed CJK Letters and Months",
727         /*3300..33FF;*/ "CJK Compatibility",
728         /*3400..4DB5;*/ "CJK Unified Ideographs Extension A",
729         /*4E00..9FFF;*/ "CJK Unified Ideographs",
730         /*A000..A48F;*/ "Yi Syllables",
731         /*A490..A4CF;*/ "Yi Radicals",
732         /*AC00..D7A3;*/ "Hangul Syllables",
733         /*D800..DB7F;*/ "High Surrogates",
734         /*DB80..DBFF;*/ "High Private Use Surrogates",
735         /*DC00..DFFF;*/ "Low Surrogates",
736         /*E000..F8FF;*/ "Private Use",
737         /*F900..FAFF;*/ "CJK Compatibility Ideographs",
738         /*FB00..FB4F;*/ "Alphabetic Presentation Forms",
739         /*FB50..FDFF;*/ "Arabic Presentation Forms-A",
740         /*FE20..FE2F;*/ "Combining Half Marks",
741         /*FE30..FE4F;*/ "CJK Compatibility Forms",
742         /*FE50..FE6F;*/ "Small Form Variants",
743         /*FE70..FEFE;*/ "Arabic Presentation Forms-B",
744         /*FEFF..FEFF;*/ "Specials",
745         /*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms",
746          //missing Specials add manually
747
/*10300..1032F;*/ "Old Italic",
748         /*10330..1034F;*/ "Gothic",
749         /*10400..1044F;*/ "Deseret",
750         /*1D000..1D0FF;*/ "Byzantine Musical Symbols",
751         /*1D100..1D1FF;*/ "Musical Symbols",
752         /*1D400..1D7FF;*/ "Mathematical Alphanumeric Symbols",
753         /*20000..2A6D6;*/ "CJK Unified Ideographs Extension B",
754         /*2F800..2FA1F;*/ "CJK Compatibility Ideographs Supplement",
755         /*E0000..E007F;*/ "Tags",
756         //missing 2 private use add manually
757

758     };
759     //ADD THOSE MANUALLY
760
//F0000..FFFFD; "Private Use",
761
//100000..10FFFD; "Private Use"
762
//FFF0..FFFD; "Specials",
763
/*
 * Block boundaries packed into a string: getRange() reads the first and last
 * character of the block named blockNames[i] from indices 2*i and 2*i+1.
 *
 * NOTE(review): a Java unicode escape consumes exactly four hex digits and a
 * char cannot hold a value above FFFF, so the entries written with five hex
 * digits at the end of this string (10300 ... E007F) do not encode the
 * supplementary-plane code points they appear to; each one becomes a BMP
 * character followed by a literal digit or letter. A stray 's' also follows
 * the 1D0FF entry. Only the pairs up to FF00/FFEF are reliable.
 */
static final String JavaDoc blockRanges =
764        "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F"
765         +"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF"
766         +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF"
767         +"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF"
768         +"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF"
769         +"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF"
770         +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF"
771         +"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F"
772         +"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF"
773         +"\uAC00\uD7A3\uD800\uDB7F\uDB80\uDBFF\uDC00\uDFFF\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF"
774         +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF\u10300\u1032F\u10330\u1034F"
775         +"\u10400\u1044F\u1D000\u1D0FFs\u1D100\u1D1FF\u1D400\u1D7FF\u20000\u2A6D6\u2F800\u2FA1F\uE0000\uE007F";
776
777      static protected RangeToken getRange(String JavaDoc name, boolean positive) {
778         if (Token.categories.size() == 0) {
779             synchronized (Token.categories) {
780                 Token[] ranges = new Token[Token.categoryNames.length];
781                 for (int i = 0; i < ranges.length; i ++) {
782                     ranges[i] = Token.createRange();
783                 }
784                 int type;
785                 for (int i = 0; i < 0x10000; i ++) {
786                     type = Character.getType((char)i);
787                     if (type == Character.START_PUNCTUATION ||
788                         type == Character.END_PUNCTUATION) {
789                         //build table of Pi values
790
if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C ||
791                             i == 0x201F || i == 0x2039) {
792                             type = CHAR_INIT_QUOTE;
793                         }
794                         //build table of Pf values
795
if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) {
796                             type = CHAR_FINAL_QUOTE;
797                         }
798                     }
799                     ranges[type].addRange(i, i);
800                     switch (type) {
801                       case Character.UPPERCASE_LETTER:
802                       case Character.LOWERCASE_LETTER:
803                       case Character.TITLECASE_LETTER:
804                       case Character.MODIFIER_LETTER:
805                       case Character.OTHER_LETTER:
806                         type = CHAR_LETTER;
807                         break;
808                       case Character.NON_SPACING_MARK:
809                       case Character.COMBINING_SPACING_MARK:
810                       case Character.ENCLOSING_MARK:
811                         type = CHAR_MARK;
812                         break;
813                       case Character.DECIMAL_DIGIT_NUMBER:
814                       case Character.LETTER_NUMBER:
815                       case Character.OTHER_NUMBER:
816                         type = CHAR_NUMBER;
817                         break;
818                       case Character.SPACE_SEPARATOR:
819                       case Character.LINE_SEPARATOR:
820                       case Character.PARAGRAPH_SEPARATOR:
821                         type = CHAR_SEPARATOR;
822                         break;
823                       case Character.CONTROL:
824                       case Character.FORMAT:
825                       case Character.SURROGATE:
826                       case Character.PRIVATE_USE:
827                       case Character.UNASSIGNED:
828                         type = CHAR_OTHER;
829                         break;
830                       case Character.CONNECTOR_PUNCTUATION:
831                       case Character.DASH_PUNCTUATION:
832                       case Character.START_PUNCTUATION:
833                       case Character.END_PUNCTUATION:
834                       case CHAR_INIT_QUOTE:
835                       case CHAR_FINAL_QUOTE:
836                       case Character.OTHER_PUNCTUATION:
837                         type = CHAR_PUNCTUATION;
838                         break;
839                       case Character.MATH_SYMBOL:
840                       case Character.CURRENCY_SYMBOL:
841                       case Character.MODIFIER_SYMBOL:
842                       case Character.OTHER_SYMBOL:
843                         type = CHAR_SYMBOL;
844                         break;
845                       default:
846                         throw new RuntimeException JavaDoc("org.enhydra.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type);
847                     }
848                     ranges[type].addRange(i, i);
849                 } // for all characters
850
ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX);
851
852                 Token.categories2 = new Hashtable JavaDoc();
853                 for (int i = 0; i < ranges.length; i ++) {
854                     if (Token.categoryNames[i] != null) {
855                         if (i == Character.UNASSIGNED) { // Unassigned
856
ranges[i].addRange(0x10000, Token.UTF16_MAX);
857                         }
858                         Token.categories.put(Token.categoryNames[i], ranges[i]);
859                         Token.categories2.put(Token.categoryNames[i],
860                                               Token.complementRanges(ranges[i]));
861                     }
862                 }
863                 //REVISIT: do we really need to support block names as in Unicode 3.1
864
// or we can just create all the names in IsBLOCKNAME format (XML Schema REC)?
865
//
866
StringBuffer JavaDoc buffer = new StringBuffer JavaDoc(50);
867                 int location = 0;
868                 for (int i = 0; i < Token.blockNames.length; i ++) {
869                     Token r1 = Token.createRange();
870                     location = i*2;
871                     int rstart = Token.blockRanges.charAt(location);
872                     int rend = Token.blockRanges.charAt(location+1);
873                     String JavaDoc n = Token.blockNames[i];
874                     //DEBUGING
875
//System.out.println(n+" " +Integer.toHexString(rstart)
876
// +"-"+ Integer.toHexString(rend));
877
r1.addRange(rstart, rend);
878                     if (n.equals("Specials"))
879                         r1.addRange(0xfff0, 0xfffd);
880                     if (n.equals("Private Use")) {
881                         r1.addRange(0xF0000,0xFFFFD);
882                         r1.addRange(0x100000,0x10FFFD);
883                     }
884                     Token.categories.put(n, r1);
885                     Token.categories2.put(n, Token.complementRanges(r1));
886                     buffer.setLength(0);
887                     buffer.append("Is");
888                     if (n.indexOf(' ') >= 0) {
889                         for (int ci = 0; ci < n.length(); ci ++)
890                             if (n.charAt(ci) != ' ') buffer.append((char)n.charAt(ci));
891                     }
892                     else {
893                         buffer.append(n);
894                     }
895                     Token.setAlias(buffer.toString(), n, true);
896                 }
897
898                 // REVISIT: remove this code later
899
// the following does not match the XML Schema definition
900
// for Regular Expressions
901

902                 /*
903                 // TR#18 1.2
904                 Token.setAlias("ASSIGNED", "Cn", false);
905                 Token.setAlias("UNASSIGNED", "Cn", true);
906                 Token all = Token.createRange();
907                 all.addRange(0, Token.UTF16_MAX);
908                 Token.categories.put("ALL", all);
909                 Token.categories2.put("ALL", Token.complementRanges(all));
910                 */

911                 
912                 /*
913                 Token isalpha = Token.createRange();
914                 isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu
915                 isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll
916                 isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo
917                 Token.categories.put("IsAlpha", isalpha);
918                 Token.categories2.put("IsAlpha", Token.complementRanges(isalpha));
919                 
920                 Token isalnum = Token.createRange();
921                 isalnum.mergeRanges(isalpha); // Lu Ll Lo
922                 isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd
923                 Token.categories.put("IsAlnum", isalnum);
924                 Token.categories2.put("IsAlnum", Token.complementRanges(isalnum));
925
926                 Token isspace = Token.createRange();
927                 isspace.mergeRanges(Token.token_spaces);
928                 isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z
929                 Token.categories.put("IsSpace", isspace);
930                 Token.categories2.put("IsSpace", Token.complementRanges(isspace));
931
932                 Token isword = Token.createRange();
933                 isword.mergeRanges(isalnum); // Lu Ll Lo Nd
934                 isword.addRange('_', '_');
935                 Token.categories.put("IsWord", isword);
936                 Token.categories2.put("IsWord", Token.complementRanges(isword));
937
938                 Token isascii = Token.createRange();
939                 isascii.addRange(0, 127);
940                 Token.categories.put("IsASCII", isascii);
941                 Token.categories2.put("IsASCII", Token.complementRanges(isascii));
942
943                 Token isnotgraph = Token.createRange();
944                 isnotgraph.mergeRanges(ranges[CHAR_OTHER]);
945                 isnotgraph.addRange(' ', ' ');
946                 Token.categories.put("IsGraph", Token.complementRanges(isnotgraph));
947                 Token.categories2.put("IsGraph", isnotgraph);
948
949                 Token isxdigit = Token.createRange();
950                 isxdigit.addRange('0', '9');
951                 isxdigit.addRange('A', 'F');
952                 isxdigit.addRange('a', 'f');
953                 Token.categories.put("IsXDigit", Token.complementRanges(isxdigit));
954                 Token.categories2.put("IsXDigit", isxdigit);
955                 
956                 Token.setAlias("IsDigit", "Nd", true);
957                 Token.setAlias("IsUpper", "Lu", true);
958                 Token.setAlias("IsLower", "Ll", true);
959                 Token.setAlias("IsCntrl", "C", true);
960                 Token.setAlias("IsPrint", "C", false);
961                 Token.setAlias("IsPunct", "P", true);
962
963                 Token.setAlias("alpha", "IsAlpha", true);
964                 Token.setAlias("alnum", "IsAlnum", true);
965                 Token.setAlias("ascii", "IsASCII", true);
966                 Token.setAlias("cntrl", "IsCntrl", true);
967                 Token.setAlias("digit", "IsDigit", true);
968                 Token.setAlias("graph", "IsGraph", true);
969                 Token.setAlias("lower", "IsLower", true);
970                 Token.setAlias("print", "IsPrint", true);
971                 Token.setAlias("punct", "IsPunct", true);
972                 Token.setAlias("space", "IsSpace", true);
973                 Token.setAlias("upper", "IsUpper", true);
974                 Token.setAlias("word", "IsWord", true); // Perl extension
975                 Token.setAlias("xdigit", "IsXDigit", true);
976                  */

977             } // synchronized
978
} // if null
979
RangeToken tok = positive ? (RangeToken)Token.categories.get(name)
980             : (RangeToken)Token.categories2.get(name);
981         if (tok == null) System.out.println(name);
982         return tok;
983     }
984
985     private static void setAlias(String JavaDoc newName, String JavaDoc name, boolean positive) {
986         Token t1 = (Token)Token.categories.get(name);
987         Token t2 = (Token)Token.categories2.get(name);
988         if (positive) {
989             Token.categories.put(newName, t1);
990             Token.categories2.put(newName, t2);
991         } else {
992             Token.categories2.put(newName, t1);
993             Token.categories.put(newName, t2);
994         }
995     }
996
997     // ------------------------------------------------------
998

999     static final String JavaDoc viramaString =
1000    "\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1001
+"\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1002
+"\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1003
+"\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1004
+"\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1005
+"\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1006
+"\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1007
+"\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1008
+"\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1009
+"\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI VOWEL SIGN PHINTHU;;;;
1010
+"\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;;
1011

1012    static private Token token_grapheme = null;
1013    static synchronized protected Token getGraphemePattern() {
1014        if (Token.token_grapheme != null)
1015            return Token.token_grapheme;
1016
1017        Token base_char = Token.createRange(); // [{ASSIGNED}]-[{M},{C}]
1018
base_char.mergeRanges(Token.getRange("ASSIGNED", true));
1019        base_char.subtractRanges(Token.getRange("M", true));
1020        base_char.subtractRanges(Token.getRange("C", true));
1021
1022        Token virama = Token.createRange();
1023        for (int i = 0; i < Token.viramaString.length(); i ++) {
1024            int ch = viramaString.charAt(i);
1025            virama.addRange(i, i);
1026        }
1027
1028        Token combiner_wo_virama = Token.createRange();
1029        combiner_wo_virama.mergeRanges(Token.getRange("M", true));
1030        combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final
1031
combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras
1032

1033        Token left = Token.createUnion(); // base_char?
1034
left.addChild(base_char);
1035        left.addChild(Token.token_empty);
1036
1037        Token foo = Token.createUnion();
1038        foo.addChild(Token.createConcat(virama, Token.getRange("L", true)));
1039        foo.addChild(combiner_wo_virama);
1040
1041        foo = Token.createClosure(foo);
1042
1043        foo = Token.createConcat(left, foo);
1044
1045        Token.token_grapheme = foo;
1046        return Token.token_grapheme;
1047    }
1048
1049    /**
1050     * Combing Character Sequence in Perl 5.6.
1051     */

1052    static private Token token_ccs = null;
1053    static synchronized protected Token getCombiningCharacterSequence() {
1054        if (Token.token_ccs != null)
1055            return Token.token_ccs;
1056
1057        Token foo = Token.createClosure(Token.getRange("M", true)); // \pM*
1058
foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM*
1059
Token.token_ccs = foo;
1060        return Token.token_ccs;
1061    }
1062
1063    // ------------------------------------------------------
1064

1065    // ------------------------------------------------------
1066
/**
1067     * This class represents a node in parse tree.
1068     */

1069    static class StringToken extends Token implements java.io.Serializable JavaDoc {
1070        String JavaDoc string;
1071        int refNumber;
1072
1073        StringToken(int type, String JavaDoc str, int n) {
1074            super(type);
1075            this.string = str;
1076            this.refNumber = n;
1077        }
1078
1079        int getReferenceNumber() { // for STRING
1080
return this.refNumber;
1081        }
1082        String JavaDoc getString() { // for STRING
1083
return this.string;
1084        }
1085        
1086        public String JavaDoc toString(int options) {
1087            if (this.type == BACKREFERENCE)
1088                return "\\"+this.refNumber;
1089            else
1090                return REUtil.quoteMeta(this.string);
1091        }
1092    }
1093
1094    /**
1095     * This class represents a node in parse tree.
1096     */

1097    static class ConcatToken extends Token implements java.io.Serializable JavaDoc {
1098        Token child;
1099        Token child2;
1100        
1101        ConcatToken(Token t1, Token t2) {
1102            super(Token.CONCAT);
1103            this.child = t1;
1104            this.child2 = t2;
1105        }
1106
1107        int size() {
1108            return 2;
1109        }
1110        Token getChild(int index) {
1111            return index == 0 ? this.child : this.child2;
1112        }
1113
1114        public String JavaDoc toString(int options) {
1115            String JavaDoc ret;
1116            if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) {
1117                ret = this.child.toString(options)+"+";
1118            } else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) {
1119                ret = this.child.toString(options)+"+?";
1120            } else
1121                ret = this.child.toString(options)+this.child2.toString(options);
1122            return ret;
1123        }
1124    }
1125
1126    /**
1127     * This class represents a node in parse tree.
1128     */

1129    static class CharToken extends Token implements java.io.Serializable JavaDoc {
1130        int chardata;
1131
1132        CharToken(int type, int ch) {
1133            super(type);
1134            this.chardata = ch;
1135        }
1136
1137        int getChar() {
1138            return this.chardata;
1139        }
1140
1141        public String JavaDoc toString(int options) {
1142            String JavaDoc ret;
1143            switch (this.type) {
1144              case CHAR:
1145                switch (this.chardata) {
1146                  case '|': case '*': case '+': case '?':
1147                  case '(': case ')': case '.': case '[':
1148                  case '{': case '\\':
1149                    ret = "\\"+(char)this.chardata;
1150                    break;
1151                  case '\f': ret = "\\f"; break;
1152                  case '\n': ret = "\\n"; break;
1153                  case '\r': ret = "\\r"; break;
1154                  case '\t': ret = "\\t"; break;
1155                  case 0x1b: ret = "\\e"; break;
1156                    //case 0x0b: ret = "\\v"; break;
1157
default:
1158                    if (this.chardata >= 0x10000) {
1159                        String JavaDoc pre = "0"+Integer.toHexString(this.chardata);
1160                        ret = "\\v"+pre.substring(pre.length()-6, pre.length());
1161                    } else
1162                        ret = ""+(char)this.chardata;
1163                }
1164                break;
1165
1166              case ANCHOR:
1167                if (this == Token.token_linebeginning || this == Token.token_lineend)
1168                    ret = ""+(char)this.chardata;
1169                else
1170                    ret = "\\"+(char)this.chardata;
1171                break;
1172
1173              default:
1174                ret = null;
1175            }
1176            return ret;
1177        }
1178
1179        boolean match(int ch) {
1180            if (this.type == CHAR) {
1181                return ch == this.chardata;
1182            } else
1183                throw new RuntimeException JavaDoc("NFAArrow#match(): Internal error: "+this.type);
1184        }
1185    }
1186
1187    /**
1188     * This class represents a node in parse tree.
1189     */

1190    static class ClosureToken extends Token implements java.io.Serializable JavaDoc {
1191        int min;
1192        int max;
1193        Token child;
1194
1195        ClosureToken(int type, Token tok) {
1196            super(type);
1197            this.child = tok;
1198            this.setMin(-1);
1199            this.setMax(-1);
1200        }
1201
1202        int size() {
1203            return 1;
1204        }
1205        Token getChild(int index) {
1206            return this.child;
1207        }
1208
1209        final void setMin(int min) {
1210            this.min = min;
1211        }
1212        final void setMax(int max) {
1213            this.max = max;
1214        }
1215        final int getMin() {
1216            return this.min;
1217        }
1218        final int getMax() {
1219            return this.max;
1220        }
1221
1222        public String JavaDoc toString(int options) {
1223            String JavaDoc ret;
1224            if (this.type == CLOSURE) {
1225                if (this.getMin() < 0 && this.getMax() < 0) {
1226                    ret = this.child.toString(options)+"*";
1227                } else if (this.getMin() == this.getMax()) {
1228                    ret = this.child.toString(options)+"{"+this.getMin()+"}";
1229                } else if (this.getMin() >= 0 && this.getMax() >= 0) {
1230                    ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}";
1231                } else if (this.getMin() >= 0 && this.getMax() < 0) {
1232                    ret = this.child.toString(options)+"{"+this.getMin()+",}";
1233                } else
1234                    throw new RuntimeException JavaDoc("Token#toString(): CLOSURE "
1235                                               +this.getMin()+", "+this.getMax());
1236            } else {
1237                if (this.getMin() < 0 && this.getMax() < 0) {
1238                    ret = this.child.toString(options)+"*?";
1239                } else if (this.getMin() == this.getMax()) {
1240                    ret = this.child.toString(options)+"{"+this.getMin()+"}?";
1241                } else if (this.getMin() >= 0 && this.getMax() >= 0) {
1242                    ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?";
1243                } else if (this.getMin() >= 0 && this.getMax() < 0) {
1244                    ret = this.child.toString(options)+"{"+this.getMin()+",}?";
1245                } else
1246                    throw new RuntimeException JavaDoc("Token#toString(): NONGREEDYCLOSURE "
1247                                               +this.getMin()+", "+this.getMax());
1248            }
1249            return ret;
1250        }
1251    }
1252
1253    /**
1254     * This class represents a node in parse tree.
1255     */

1256    static class ParenToken extends Token implements java.io.Serializable JavaDoc {
1257        Token child;
1258        int parennumber;
1259
1260        ParenToken(int type, Token tok, int paren) {
1261            super(type);
1262            this.child = tok;
1263            this.parennumber = paren;
1264        }
1265
1266        int size() {
1267            return 1;
1268        }
1269        Token getChild(int index) {
1270            return this.child;
1271        }
1272
1273        int getParenNumber() {
1274            return this.parennumber;
1275        }
1276
1277        public String JavaDoc toString(int options) {
1278            String JavaDoc ret = null;
1279            switch (this.type) {
1280              case PAREN:
1281                if (this.parennumber == 0) {
1282                    ret = "(?:"+this.child.toString(options)+")";
1283                } else {
1284                    ret = "("+this.child.toString(options)+")";
1285                }
1286                break;
1287
1288              case LOOKAHEAD:
1289                ret = "(?="+this.child.toString(options)+")";
1290                break;
1291              case NEGATIVELOOKAHEAD:
1292                ret = "(?!"+this.child.toString(options)+")";
1293                break;
1294              case LOOKBEHIND:
1295                ret = "(?<="+this.child.toString(options)+")";
1296                break;
1297              case NEGATIVELOOKBEHIND:
1298                ret = "(?<!"+this.child.toString(options)+")";
1299                break;
1300              case INDEPENDENT:
1301                ret = "(?>"+this.child.toString(options)+")";
1302                break;
1303            }
1304            return ret;
1305        }
1306    }
1307
1308    /**
1309     * (?(condition)yes-pattern|no-pattern)
1310     */

1311    static class ConditionToken extends Token implements java.io.Serializable JavaDoc {
1312        int refNumber;
1313        Token condition;
1314        Token yes;
1315        Token no;
1316        ConditionToken(int refno, Token cond, Token yespat, Token nopat) {
1317            super(Token.CONDITION);
1318            this.refNumber = refno;
1319            this.condition = cond;
1320            this.yes = yespat;
1321            this.no = nopat;
1322        }
1323        int size() {
1324            return this.no == null ? 1 : 2;
1325        }
1326        Token getChild(int index) {
1327            if (index == 0) return this.yes;
1328            if (index == 1) return this.no;
1329            throw new RuntimeException JavaDoc("Internal Error: "+index);
1330        }
1331
1332        public String JavaDoc toString(int options) {
1333            String JavaDoc ret;
1334            if (refNumber > 0) {
1335                ret = "(?("+refNumber+")";
1336            } else if (this.condition.type == Token.ANCHOR) {
1337                ret = "(?("+this.condition+")";
1338            } else {
1339                ret = "(?"+this.condition;
1340            }
1341
1342            if (this.no == null) {
1343                ret += this.yes+")";
1344            } else {
1345                ret += this.yes+"|"+this.no+")";
1346            }
1347            return ret;
1348        }
1349    }
1350
1351    /**
1352     * (ims-ims: .... )
1353     */

1354    static class ModifierToken extends Token implements java.io.Serializable JavaDoc {
1355        Token child;
1356        int add;
1357        int mask;
1358
1359        ModifierToken(Token tok, int add, int mask) {
1360            super(Token.MODIFIERGROUP);
1361            this.child = tok;
1362            this.add = add;
1363            this.mask = mask;
1364        }
1365
1366        int size() {
1367            return 1;
1368        }
1369        Token getChild(int index) {
1370            return this.child;
1371        }
1372
1373        int getOptions() {
1374            return this.add;
1375        }
1376        int getOptionsMask() {
1377            return this.mask;
1378        }
1379
1380        public String JavaDoc toString(int options) {
1381            return "(?"
1382                +(this.add == 0 ? "" : REUtil.createOptionString(this.add))
1383                +(this.mask == 0 ? "" : REUtil.createOptionString(this.mask))
1384                +":"
1385                +this.child.toString(options)
1386                +")";
1387        }
1388    }
1389
1390    /**
1391     * This class represents a node in parse tree.
1392     * for UNION or CONCAT.
1393     */

1394    static class UnionToken extends Token implements java.io.Serializable JavaDoc {
1395        Vector JavaDoc children;
1396
1397        UnionToken(int type) {
1398            super(type);
1399        }
1400
1401        void addChild(Token tok) {
1402            if (tok == null) return;
1403            if (this.children == null) this.children = new Vector JavaDoc();
1404            if (this.type == UNION) {
1405                this.children.addElement(tok);
1406                return;
1407            }
1408                                                // This is CONCAT, and new child is CONCAT.
1409
if (tok.type == CONCAT) {
1410                for (int i = 0; i < tok.size(); i ++)
1411                    this.addChild(tok.getChild(i)); // Recursion
1412
return;
1413            }
1414            int size = this.children.size();
1415            if (size == 0) {
1416                this.children.addElement(tok);
1417                return;
1418            }
1419            Token previous = (Token)this.children.elementAt(size-1);
1420            if (!((previous.type == CHAR || previous.type == STRING)
1421                  && (tok.type == CHAR || tok.type == STRING))) {
1422                this.children.addElement(tok);
1423                return;
1424            }
1425            
1426            //System.err.println("Merge '"+previous+"' and '"+tok+"'.");
1427

1428            StringBuffer JavaDoc buffer;
1429            int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString().length());
1430            if (previous.type == CHAR) { // Replace previous token by STRING
1431
buffer = new StringBuffer JavaDoc(2 + nextMaxLength);
1432                int ch = previous.getChar();
1433                if (ch >= 0x10000)
1434                    buffer.append(REUtil.decomposeToSurrogates(ch));
1435                else
1436                    buffer.append((char)ch);
1437                previous = Token.createString(null);
1438                this.children.setElementAt(previous, size-1);
1439            } else { // STRING
1440
buffer = new StringBuffer JavaDoc(previous.getString().length() + nextMaxLength);
1441                buffer.append(previous.getString());
1442            }
1443
1444            if (tok.type == CHAR) {
1445                int ch = tok.getChar();
1446                if (ch >= 0x10000)
1447                    buffer.append(REUtil.decomposeToSurrogates(ch));
1448                else
1449                    buffer.append((char)ch);
1450            } else {
1451                buffer.append(tok.getString());
1452            }
1453
1454            ((StringToken)previous).string = new String JavaDoc(buffer);
1455        }
1456
1457        int size() {
1458            return this.children == null ? 0 : this.children.size();
1459        }
1460        Token getChild(int index) {
1461            return (Token)this.children.elementAt(index);
1462        }
1463
1464        public String JavaDoc toString(int options) {
1465            String JavaDoc ret;
1466            if (this.type == CONCAT) {
1467                if (this.children.size() == 2) {
1468                    Token ch = this.getChild(0);
1469                    Token ch2 = this.getChild(1);
1470                    if (ch2.type == CLOSURE && ch2.getChild(0) == ch) {
1471                        ret = ch.toString(options)+"+";
1472                    } else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) {
1473                        ret = ch.toString(options)+"+?";
1474                    } else
1475                        ret = ch.toString(options)+ch2.toString(options);
1476                } else {
1477                    StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
1478                    for (int i = 0; i < this.children.size(); i ++) {
1479                        sb.append(((Token)this.children.elementAt(i)).toString(options));
1480                    }
1481                    ret = new String JavaDoc(sb);
1482                }
1483                return ret;
1484            }
1485            if (this.children.size() == 2 && this.getChild(1).type == EMPTY) {
1486                ret = this.getChild(0).toString(options)+"?";
1487            } else if (this.children.size() == 2
1488                       && this.getChild(0).type == EMPTY) {
1489                ret = this.getChild(1).toString(options)+"??";
1490            } else {
1491                StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
1492                sb.append(((Token)this.children.elementAt(0)).toString(options));
1493                for (int i = 1; i < this.children.size(); i ++) {
1494                    sb.append((char)'|');
1495                    sb.append(((Token)this.children.elementAt(i)).toString(options));
1496                }
1497                ret = new String JavaDoc(sb);
1498            }
1499            return ret;
1500        }
1501    }
1502}
1503
Popular Tags