KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > xerces > impl > xpath > regex > RegexParser


1 /*
2  * Copyright 1999-2004 The Apache Software Foundation.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */

16
17 package org.apache.xerces.impl.xpath.regex;
18
19 import java.util.Locale JavaDoc;
20 import java.util.MissingResourceException JavaDoc;
21 import java.util.ResourceBundle JavaDoc;
22 import java.util.Vector JavaDoc;
23
24 /**
25  * A Regular Expression Parser.
26  *
27  * @xerces.internal
28  *
29  * @version $Id: RegexParser.java,v 1.10 2004/10/04 22:07:40 mrglavas Exp $
30  */

31 class RegexParser {
32     static final int T_CHAR = 0;
33     static final int T_EOF = 1;
34     static final int T_OR = 2; // '|'
35
static final int T_STAR = 3; // '*'
36
static final int T_PLUS = 4; // '+'
37
static final int T_QUESTION = 5; // '?'
38
static final int T_LPAREN = 6; // '('
39
static final int T_RPAREN = 7; // ')'
40
static final int T_DOT = 8; // '.'
41
static final int T_LBRACKET = 9; // '['
42
static final int T_BACKSOLIDUS = 10; // '\'
43
static final int T_CARET = 11; // '^'
44
static final int T_DOLLAR = 12; // '$'
45
static final int T_LPAREN2 = 13; // '(?:'
46
static final int T_LOOKAHEAD = 14; // '(?='
47
static final int T_NEGATIVELOOKAHEAD = 15; // '(?!'
48
static final int T_LOOKBEHIND = 16; // '(?<='
49
static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!'
50
static final int T_INDEPENDENT = 18; // '(?>'
51
static final int T_SET_OPERATIONS = 19; // '(?['
52
static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class
53
static final int T_COMMENT = 21; // '(?#'
54
static final int T_MODIFIERS = 22; // '(?' [\-,a-z,A-Z]
55
static final int T_CONDITION = 23; // '(?('
56
static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class
57

58     static class ReferencePosition {
59         int refNumber;
60         int position;
61         ReferencePosition(int n, int pos) {
62             this.refNumber = n;
63             this.position = pos;
64         }
65     }
66
67     int offset;
68     String JavaDoc regex;
69     int regexlen;
70     int options;
71     ResourceBundle JavaDoc resources;
72     int chardata;
73     int nexttoken;
74     static protected final int S_NORMAL = 0;
75     static protected final int S_INBRACKETS = 1;
76     static protected final int S_INXBRACKETS = 2;
77     int context = S_NORMAL;
78     int parennumber = 1;
79     boolean hasBackReferences;
80     Vector JavaDoc references = null;
81
/** Creates a parser whose error messages use the default locale. */
public RegexParser() {
    setLocale(Locale.getDefault());
}
/**
 * Creates a parser whose error messages use the given locale.
 *
 * @param locale locale passed on to {@link #setLocale(Locale)}
 */
public RegexParser(Locale locale) {
    setLocale(locale);
}
88
89     public void setLocale(Locale JavaDoc locale) {
90         try {
91             this.resources = ResourceBundle.getBundle("org.apache.xerces.impl.xpath.regex.message", locale);
92         } catch (MissingResourceException JavaDoc mre) {
93             throw new RuntimeException JavaDoc("Installation Problem??? Couldn't load messages: "
94                                        +mre.getMessage());
95         }
96     }
97
98     final ParseException ex(String JavaDoc key, int loc) {
99         return new ParseException(this.resources.getString(key), loc);
100     }
101
102     private final boolean isSet(int flag) {
103         return (this.options & flag) == flag;
104     }
105
106     synchronized Token parse(String JavaDoc regex, int options) throws ParseException {
107         this.options = options;
108         this.offset = 0;
109         this.setContext(S_NORMAL);
110         this.parennumber = 1;
111         this.hasBackReferences = false;
112         this.regex = regex;
113         if (this.isSet(RegularExpression.EXTENDED_COMMENT))
114             this.regex = REUtil.stripExtendedComment(this.regex);
115         this.regexlen = this.regex.length();
116
117
118         this.next();
119         Token ret = this.parseRegex();
120         if (this.offset != this.regexlen)
121             throw ex("parser.parse.1", this.offset);
122         if (this.references != null) {
123             for (int i = 0; i < this.references.size(); i ++) {
124                 ReferencePosition position = (ReferencePosition)this.references.elementAt(i);
125                 if (this.parennumber <= position.refNumber)
126                     throw ex("parser.parse.2", position.position);
127             }
128             this.references.removeAllElements();
129         }
130         return ret;
131     }
132
133     /*
134     public RegularExpression createRegex(String regex, int options) throws ParseException {
135         Token tok = this.parse(regex, options);
136         return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options);
137     }
138     */

139
140     protected final void setContext(int con) {
141         this.context = con;
142     }
143
144     final int read() {
145         return this.nexttoken;
146     }
147
148     final void next() {
149         if (this.offset >= this.regexlen) {
150             this.chardata = -1;
151             this.nexttoken = T_EOF;
152             return;
153         }
154
155         int ret;
156         int ch = this.regex.charAt(this.offset++);
157         this.chardata = ch;
158
159         if (this.context == S_INBRACKETS) {
160             // In a character class, this.chardata has one character, that is to say,
161
// a pair of surrogates is composed and stored to this.chardata.
162
switch (ch) {
163               case '\\':
164                 ret = T_BACKSOLIDUS;
165                 if (this.offset >= this.regexlen)
166                     throw ex("parser.next.1", this.offset-1);
167                 this.chardata = this.regex.charAt(this.offset++);
168                 break;
169
170               case '-':
171                 if (this.isSet(RegularExpression.XMLSCHEMA_MODE)
172                     && this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') {
173                     this.offset++;
174                     ret = T_XMLSCHEMA_CC_SUBTRACTION;
175                 } else
176                     ret = T_CHAR;
177                 break;
178
179               case '[':
180                 if (!this.isSet(RegularExpression.XMLSCHEMA_MODE)
181                     && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') {
182                     this.offset++;
183                     ret = T_POSIX_CHARCLASS_START;
184                     break;
185                 } // Through down
186
default:
187                 if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) {
188                     int low = this.regex.charAt(this.offset);
189                     if (REUtil.isLowSurrogate(low)) {
190                         this.chardata = REUtil.composeFromSurrogates(ch, low);
191                         this.offset ++;
192                     }
193                 }
194                 ret = T_CHAR;
195             }
196             this.nexttoken = ret;
197             return;
198         }
199
200         switch (ch) {
201           case '|': ret = T_OR; break;
202           case '*': ret = T_STAR; break;
203           case '+': ret = T_PLUS; break;
204           case '?': ret = T_QUESTION; break;
205           case ')': ret = T_RPAREN; break;
206           case '.': ret = T_DOT; break;
207           case '[': ret = T_LBRACKET; break;
208           case '^': ret = T_CARET; break;
209           case '$': ret = T_DOLLAR; break;
210           case '(':
211             ret = T_LPAREN;
212             if (this.offset >= this.regexlen)
213                 break;
214             if (this.regex.charAt(this.offset) != '?')
215                 break;
216             if (++this.offset >= this.regexlen)
217                 throw ex("parser.next.2", this.offset-1);
218             ch = this.regex.charAt(this.offset++);
219             switch (ch) {
220               case ':': ret = T_LPAREN2; break;
221               case '=': ret = T_LOOKAHEAD; break;
222               case '!': ret = T_NEGATIVELOOKAHEAD; break;
223               case '[': ret = T_SET_OPERATIONS; break;
224               case '>': ret = T_INDEPENDENT; break;
225               case '<':
226                 if (this.offset >= this.regexlen)
227                     throw ex("parser.next.2", this.offset-3);
228                 ch = this.regex.charAt(this.offset++);
229                 if (ch == '=') {
230                     ret = T_LOOKBEHIND;
231                 } else if (ch == '!') {
232                     ret = T_NEGATIVELOOKBEHIND;
233                 } else
234                     throw ex("parser.next.3", this.offset-3);
235                 break;
236               case '#':
237                 while (this.offset < this.regexlen) {
238                     ch = this.regex.charAt(this.offset++);
239                     if (ch == ')') break;
240                 }
241                 if (ch != ')')
242                     throw ex("parser.next.4", this.offset-1);
243                 ret = T_COMMENT;
244                 break;
245               default:
246                 if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') {// Options
247
this.offset --;
248                     ret = T_MODIFIERS;
249                     break;
250                 } else if (ch == '(') { // conditional
251
ret = T_CONDITION; // this.offsets points the next of '('.
252
break;
253                 }
254                 throw ex("parser.next.2", this.offset-2);
255             }
256             break;
257             
258           case '\\':
259             ret = T_BACKSOLIDUS;
260             if (this.offset >= this.regexlen)
261                 throw ex("parser.next.1", this.offset-1);
262             this.chardata = this.regex.charAt(this.offset++);
263             break;
264
265           default:
266             ret = T_CHAR;
267         }
268         this.nexttoken = ret;
269     }
270
271     /**
272      * regex ::= term (`|` term)*
273      * term ::= factor+
274      * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
275      * | atom (('*' | '+' | '?' | minmax ) '?'? )?)
276      * | '(?=' regex ')' | '(?!' regex ')' | '(?&lt;=' regex ')' | '(?&lt;!' regex ')'
277      * atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
278      * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
279      */

280     Token parseRegex() throws ParseException {
281         Token tok = this.parseTerm();
282         Token parent = null;
283         while (this.read() == T_OR) {
284             this.next(); // '|'
285
if (parent == null) {
286                 parent = Token.createUnion();
287                 parent.addChild(tok);
288                 tok = parent;
289             }
290             tok.addChild(this.parseTerm());
291         }
292         return tok;
293     }
294
295     /**
296      * term ::= factor+
297      */

298     Token parseTerm() throws ParseException {
299         int ch = this.read();
300         if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) {
301             return Token.createEmpty();
302         } else {
303             Token tok = this.parseFactor();
304             Token concat = null;
305             while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) {
306                 if (concat == null) {
307                     concat = Token.createConcat();
308                     concat.addChild(tok);
309                     tok = concat;
310                 }
311                 concat.addChild(this.parseFactor());
312                 //tok = Token.createConcat(tok, this.parseFactor());
313
}
314             return tok;
315         }
316     }
317
318     // ----------------------------------------------------------------
319

320     Token processCaret() throws ParseException {
321         this.next();
322         return Token.token_linebeginning;
323     }
324     Token processDollar() throws ParseException {
325         this.next();
326         return Token.token_lineend;
327     }
328     Token processLookahead() throws ParseException {
329         this.next();
330         Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex());
331         if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
332         this.next(); // ')'
333
return tok;
334     }
335     Token processNegativelookahead() throws ParseException {
336         this.next();
337         Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex());
338         if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
339         this.next(); // ')'
340
return tok;
341     }
342     Token processLookbehind() throws ParseException {
343         this.next();
344         Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex());
345         if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
346         this.next(); // ')'
347
return tok;
348     }
349     Token processNegativelookbehind() throws ParseException {
350         this.next();
351         Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex());
352         if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
353         this.next(); // ')'
354
return tok;
355     }
356     Token processBacksolidus_A() throws ParseException {
357         this.next();
358         return Token.token_stringbeginning;
359     }
360     Token processBacksolidus_Z() throws ParseException {
361         this.next();
362         return Token.token_stringend2;
363     }
364     Token processBacksolidus_z() throws ParseException {
365         this.next();
366         return Token.token_stringend;
367     }
368     Token processBacksolidus_b() throws ParseException {
369         this.next();
370         return Token.token_wordedge;
371     }
372     Token processBacksolidus_B() throws ParseException {
373         this.next();
374         return Token.token_not_wordedge;
375     }
376     Token processBacksolidus_lt() throws ParseException {
377         this.next();
378         return Token.token_wordbeginning;
379     }
380     Token processBacksolidus_gt() throws ParseException {
381         this.next();
382         return Token.token_wordend;
383     }
384     Token processStar(Token tok) throws ParseException {
385         this.next();
386         if (this.read() == T_QUESTION) {
387             this.next();
388             return Token.createNGClosure(tok);
389         } else
390             return Token.createClosure(tok);
391     }
392     Token processPlus(Token tok) throws ParseException {
393         // X+ -> XX*
394
this.next();
395         if (this.read() == T_QUESTION) {
396             this.next();
397             return Token.createConcat(tok, Token.createNGClosure(tok));
398         } else
399             return Token.createConcat(tok, Token.createClosure(tok));
400     }
401     Token processQuestion(Token tok) throws ParseException {
402         // X? -> X|
403
this.next();
404         Token par = Token.createUnion();
405         if (this.read() == T_QUESTION) {
406             this.next();
407             par.addChild(Token.createEmpty());
408             par.addChild(tok);
409         } else {
410             par.addChild(tok);
411             par.addChild(Token.createEmpty());
412         }
413         return par;
414     }
415     boolean checkQuestion(int off) {
416         return off < this.regexlen && this.regex.charAt(off) == '?';
417     }
418     Token processParen() throws ParseException {
419         this.next();
420         int p = this.parennumber++;
421         Token tok = Token.createParen(this.parseRegex(), p);
422         if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
423         this.next(); // Skips ')'
424
return tok;
425     }
426     Token processParen2() throws ParseException {
427         this.next();
428         Token tok = Token.createParen(this.parseRegex(), 0);
429         if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
430         this.next(); // Skips ')'
431
return tok;
432     }
433     Token processCondition() throws ParseException {
434                                                 // this.offset points the next of '('
435
if (this.offset+1 >= this.regexlen) throw ex("parser.factor.4", this.offset);
436                                                 // Parses a condition.
437
int refno = -1;
438         Token condition = null;
439         int ch = this.regex.charAt(this.offset);
440         if ('1' <= ch && ch <= '9') {
441             refno = ch-'0';
442             this.hasBackReferences = true;
443             if (this.references == null) this.references = new Vector JavaDoc();
444             this.references.addElement(new ReferencePosition(refno, this.offset));
445             this.offset ++;
446             if (this.regex.charAt(this.offset) != ')') throw ex("parser.factor.1", this.offset);
447             this.offset ++;
448         } else {
449             if (ch == '?') this.offset --; // Points '('.
450
this.next();
451             condition = this.parseFactor();
452             switch (condition.type) {
453               case Token.LOOKAHEAD:
454               case Token.NEGATIVELOOKAHEAD:
455               case Token.LOOKBEHIND:
456               case Token.NEGATIVELOOKBEHIND:
457                 break;
458               case Token.ANCHOR:
459                 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
460                 break;
461               default:
462                 throw ex("parser.factor.5", this.offset);
463             }
464         }
465                                                 // Parses yes/no-patterns.
466
this.next();
467         Token yesPattern = this.parseRegex();
468         Token noPattern = null;
469         if (yesPattern.type == Token.UNION) {
470             if (yesPattern.size() != 2) throw ex("parser.factor.6", this.offset);
471             noPattern = yesPattern.getChild(1);
472             yesPattern = yesPattern.getChild(0);
473         }
474         if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
475         this.next();
476         return Token.createCondition(refno, condition, yesPattern, noPattern);
477     }
478     Token processModifiers() throws ParseException {
479                                                 // this.offset points the next of '?'.
480
// modifiers ::= [imsw]* ('-' [imsw]*)? ':'
481
int add = 0, mask = 0, ch = -1;
482         while (this.offset < this.regexlen) {
483             ch = this.regex.charAt(this.offset);
484             int v = REUtil.getOptionValue(ch);
485             if (v == 0) break; // '-' or ':'?
486
add |= v;
487             this.offset ++;
488         }
489         if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1);
490         if (ch == '-') {
491             this.offset ++;
492             while (this.offset < this.regexlen) {
493                 ch = this.regex.charAt(this.offset);
494                 int v = REUtil.getOptionValue(ch);
495                 if (v == 0) break; // ':'?
496
mask |= v;
497                 this.offset ++;
498             }
499             if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1);
500         }
501         Token tok;
502         if (ch == ':') {
503             this.offset ++;
504             this.next();
505             tok = Token.createModifierGroup(this.parseRegex(), add, mask);
506             if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
507             this.next();
508         } else if (ch == ')') { // such as (?-i)
509
this.offset ++;
510             this.next();
511             tok = Token.createModifierGroup(this.parseRegex(), add, mask);
512         } else
513             throw ex("parser.factor.3", this.offset);
514
515         return tok;
516     }
517     Token processIndependent() throws ParseException {
518         this.next();
519         Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex());
520         if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
521         this.next(); // Skips ')'
522
return tok;
523     }
524     Token processBacksolidus_c() throws ParseException {
525         int ch2; // Must be in 0x0040-0x005f
526
if (this.offset >= this.regexlen
527             || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040)
528             throw ex("parser.atom.1", this.offset-1);
529         this.next();
530         return Token.createChar(ch2-0x40);
531     }
532     Token processBacksolidus_C() throws ParseException {
533         throw ex("parser.process.1", this.offset);
534     }
535     Token processBacksolidus_i() throws ParseException {
536         Token tok = Token.createChar('i');
537         this.next();
538         return tok;
539     }
540     Token processBacksolidus_I() throws ParseException {
541         throw ex("parser.process.1", this.offset);
542     }
543     Token processBacksolidus_g() throws ParseException {
544         this.next();
545         return Token.getGraphemePattern();
546     }
547     Token processBacksolidus_X() throws ParseException {
548         this.next();
549         return Token.getCombiningCharacterSequence();
550     }
551     Token processBackreference() throws ParseException {
552         int refnum = this.chardata-'0';
553         Token tok = Token.createBackReference(refnum);
554         this.hasBackReferences = true;
555         if (this.references == null) this.references = new Vector JavaDoc();
556         this.references.addElement(new ReferencePosition(refnum, this.offset-2));
557         this.next();
558         return tok;
559     }
560
561     // ----------------------------------------------------------------
562

563     /**
564      * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
565      * | atom (('*' | '+' | '?' | minmax ) '?'? )?)
566      * | '(?=' regex ')' | '(?!' regex ')' | '(?&lt;=' regex ')' | '(?&lt;!' regex ')'
567      * | '(?#' [^)]* ')'
568      * minmax ::= '{' min (',' max?)? '}'
569      * min ::= [0-9]+
570      * max ::= [0-9]+
571      */

572     Token parseFactor() throws ParseException {
573         int ch = this.read();
574         Token tok;
575         switch (ch) {
576           case T_CARET: return this.processCaret();
577           case T_DOLLAR: return this.processDollar();
578           case T_LOOKAHEAD: return this.processLookahead();
579           case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead();
580           case T_LOOKBEHIND: return this.processLookbehind();
581           case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind();
582
583           case T_COMMENT:
584             this.next();
585             return Token.createEmpty();
586
587           case T_BACKSOLIDUS:
588             switch (this.chardata) {
589               case 'A': return this.processBacksolidus_A();
590               case 'Z': return this.processBacksolidus_Z();
591               case 'z': return this.processBacksolidus_z();
592               case 'b': return this.processBacksolidus_b();
593               case 'B': return this.processBacksolidus_B();
594               case '<': return this.processBacksolidus_lt();
595               case '>': return this.processBacksolidus_gt();
596             }
597                                                 // through down
598
}
599         tok = this.parseAtom();
600         ch = this.read();
601         switch (ch) {
602           case T_STAR: return this.processStar(tok);
603           case T_PLUS: return this.processPlus(tok);
604           case T_QUESTION: return this.processQuestion(tok);
605           case T_CHAR:
606             if (this.chardata == '{' && this.offset < this.regexlen) {
607
608                 int off = this.offset; // this.offset -> next of '{'
609
int min = 0, max = -1;
610
611                 if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
612
613                     min = ch -'0';
614                     while (off < this.regexlen
615                            && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
616                         min = min*10 +ch-'0';
617                         if (min < 0)
618                             throw ex("parser.quantifier.5", this.offset);
619                     }
620                 }
621                 else {
622                     throw ex("parser.quantifier.1", this.offset);
623                 }
624
625                 max = min;
626                 if (ch == ',') {
627
628                    if (off >= this.regexlen) {
629                        throw ex("parser.quantifier.3", this.offset);
630                    }
631                    else if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
632
633                         max = ch -'0'; // {min,max}
634
while (off < this.regexlen
635                                && (ch = this.regex.charAt(off++)) >= '0'
636                                && ch <= '9') {
637                             max = max*10 +ch-'0';
638                             if (max < 0)
639                                 throw ex("parser.quantifier.5", this.offset);
640                         }
641
642                         if (min > max)
643                             throw ex("parser.quantifier.4", this.offset);
644                    }
645                    else { // assume {min,}
646
max = -1;
647                     }
648                 }
649
650                if (ch != '}')
651                    throw ex("parser.quantifier.2", this.offset);
652
653                if (this.checkQuestion(off)) { // off -> next of '}'
654
tok = Token.createNGClosure(tok);
655                     this.offset = off+1;
656                 } else {
657                     tok = Token.createClosure(tok);
658                     this.offset = off;
659                 }
660
661                 tok.setMin(min);
662                 tok.setMax(max);
663                 //System.err.println("CLOSURE: "+min+", "+max);
664
this.next();
665             }
666         }
667         return tok;
668     }
669
670     /**
671      * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
672      * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
673      * | '(?>' regex ')'
674      * char ::= '\\' | '\' [efnrt] | bmp-code | character-1
675      */

676     Token parseAtom() throws ParseException {
677         int ch = this.read();
678         Token tok = null;
679         switch (ch) {
680           case T_LPAREN: return this.processParen();
681           case T_LPAREN2: return this.processParen2(); // '(?:'
682
case T_CONDITION: return this.processCondition(); // '(?('
683
case T_MODIFIERS: return this.processModifiers(); // (?modifiers ... )
684
case T_INDEPENDENT: return this.processIndependent();
685           case T_DOT:
686             this.next(); // Skips '.'
687
tok = Token.token_dot;
688             break;
689
690             /**
691              * char-class ::= '[' ( '^'? range ','?)+ ']'
692              * range ::= '\d' | '\w' | '\s' | category-block | range-char
693              * | range-char '-' range-char
694              * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
695              * bmp-char ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
696              */

697           case T_LBRACKET: return this.parseCharacterClass(true);
698           case T_SET_OPERATIONS: return this.parseSetOperations();
699
700           case T_BACKSOLIDUS:
701             switch (this.chardata) {
702               case 'd': case 'D':
703               case 'w': case 'W':
704               case 's': case 'S':
705                 tok = this.getTokenForShorthand(this.chardata);
706                 this.next();
707                 return tok;
708
709               case 'e': case 'f': case 'n': case 'r':
710               case 't': case 'u': case 'v': case 'x':
711                 {
712                     int ch2 = this.decodeEscaped();
713                     if (ch2 < 0x10000) {
714                         tok = Token.createChar(ch2);
715                     } else {
716                         tok = Token.createString(REUtil.decomposeToSurrogates(ch2));
717                     }
718                 }
719                 break;
720
721               case 'c': return this.processBacksolidus_c();
722               case 'C': return this.processBacksolidus_C();
723               case 'i': return this.processBacksolidus_i();
724               case 'I': return this.processBacksolidus_I();
725               case 'g': return this.processBacksolidus_g();
726               case 'X': return this.processBacksolidus_X();
727               case '1': case '2': case '3': case '4':
728               case '5': case '6': case '7': case '8': case '9':
729                 return this.processBackreference();
730
731               case 'P':
732               case 'p':
733                 int pstart = this.offset;
734                 tok = processBacksolidus_pP(this.chardata);
735                 if (tok == null) throw this.ex("parser.atom.5", pstart);
736                 break;
737
738               default:
739                 tok = Token.createChar(this.chardata);
740             }
741             this.next();
742             break;
743
744           case T_CHAR:
745             if (this.chardata == ']' || this.chardata == '{' || this.chardata == '}')
746                 throw this.ex("parser.atom.4", this.offset-1);
747             tok = Token.createChar(this.chardata);
748             int high = this.chardata;
749             this.next();
750             if (REUtil.isHighSurrogate(high)
751                 && this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) {
752                 char[] sur = new char[2];
753                 sur[0] = (char)high;
754                 sur[1] = (char)this.chardata;
755                 tok = Token.createParen(Token.createString(new String JavaDoc(sur)), 0);
756                 this.next();
757             }
758             break;
759
760           default:
761             throw this.ex("parser.atom.4", this.offset-1);
762         }
763         return tok;
764     }
765
766     protected RangeToken processBacksolidus_pP(int c) throws ParseException {
767
768         this.next();
769         if (this.read() != T_CHAR || this.chardata != '{')
770             throw this.ex("parser.atom.2", this.offset-1);
771
772         // handle category escape
773
boolean positive = c == 'p';
774         int namestart = this.offset;
775         int nameend = this.regex.indexOf('}', namestart);
776
777         if (nameend < 0)
778             throw this.ex("parser.atom.3", this.offset);
779
780         String JavaDoc pname = this.regex.substring(namestart, nameend);
781         this.offset = nameend+1;
782
783         return Token.getRange(pname, positive, this.isSet(RegularExpression.XMLSCHEMA_MODE));
784     }
785
786     int processCIinCharacterClass(RangeToken tok, int c) {
787         return this.decodeEscaped();
788     }
789
790     /**
791      * char-class ::= '[' ( '^'? range ','?)+ ']'
792      * range ::= '\d' | '\w' | '\s' | category-block | range-char
793      * | range-char '-' range-char
794      * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
795      * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
796      */

797     protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
798         this.setContext(S_INBRACKETS);
799         this.next(); // '['
800
boolean nrange = false;
801         RangeToken base = null;
802         RangeToken tok;
803         if (this.read() == T_CHAR && this.chardata == '^') {
804             nrange = true;
805             this.next(); // '^'
806
if (useNrange) {
807                 tok = Token.createNRange();
808             } else {
809                 base = Token.createRange();
810                 base.addRange(0, Token.UTF16_MAX);
811                 tok = Token.createRange();
812             }
813         } else {
814             tok = Token.createRange();
815         }
816         int type;
817         boolean firstloop = true;
818         while ((type = this.read()) != T_EOF) {
819             if (type == T_CHAR && this.chardata == ']' && !firstloop)
820                 break;
821             firstloop = false;
822             int c = this.chardata;
823             boolean end = false;
824             if (type == T_BACKSOLIDUS) {
825                 switch (c) {
826                   case 'd': case 'D':
827                   case 'w': case 'W':
828                   case 's': case 'S':
829                     tok.mergeRanges(this.getTokenForShorthand(c));
830                     end = true;
831                     break;
832
833                   case 'i': case 'I':
834                   case 'c': case 'C':
835                     c = this.processCIinCharacterClass(tok, c);
836                     if (c < 0) end = true;
837                     break;
838                     
839                   case 'p':
840                   case 'P':
841                     int pstart = this.offset;
842                     RangeToken tok2 = this.processBacksolidus_pP(c);
843                     if (tok2 == null) throw this.ex("parser.atom.5", pstart);
844                     tok.mergeRanges(tok2);
845                     end = true;
846                     break;
847
848                   default:
849                     c = this.decodeEscaped();
850                 } // \ + c
851
} // backsolidus
852
// POSIX Character class such as [:alnum:]
853
else if (type == T_POSIX_CHARCLASS_START) {
854                 int nameend = this.regex.indexOf(':', this.offset);
855                 if (nameend < 0) throw this.ex("parser.cc.1", this.offset);
856                 boolean positive = true;
857                 if (this.regex.charAt(this.offset) == '^') {
858                     this.offset ++;
859                     positive = false;
860                 }
861                 String JavaDoc name = this.regex.substring(this.offset, nameend);
862                 RangeToken range = Token.getRange(name, positive,
863                                                   this.isSet(RegularExpression.XMLSCHEMA_MODE));
864                 if (range == null) throw this.ex("parser.cc.3", this.offset);
865                 tok.mergeRanges(range);
866                 end = true;
867                 if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']')
868                     throw this.ex("parser.cc.1", nameend);
869                 this.offset = nameend+2;
870             }
871             this.next();
872             if (!end) { // if not shorthands...
873
if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'.
874
tok.addRange(c, c);
875                 } else {
876                     this.next(); // Skips '-'
877
if ((type = this.read()) == T_EOF) throw this.ex("parser.cc.2", this.offset);
878                     if (type == T_CHAR && this.chardata == ']') {
879                         tok.addRange(c, c);
880                         tok.addRange('-', '-');
881                     } else {
882                         int rangeend = this.chardata;
883                         if (type == T_BACKSOLIDUS)
884                             rangeend = this.decodeEscaped();
885                         this.next();
886                         tok.addRange(c, rangeend);
887                     }
888                 }
889             }
890             if (this.isSet(RegularExpression.SPECIAL_COMMA)
891                 && this.read() == T_CHAR && this.chardata == ',')
892                 this.next();
893         }
894         if (this.read() == T_EOF)
895             throw this.ex("parser.cc.2", this.offset);
896         if (!useNrange && nrange) {
897             base.subtractRanges(tok);
898             tok = base;
899         }
900         tok.sortRanges();
901         tok.compactRanges();
902         //tok.dumpRanges();
903
/*
904         if (this.isSet(RegularExpression.IGNORE_CASE))
905             tok = RangeToken.createCaseInsensitiveToken(tok);
906         */

907         this.setContext(S_NORMAL);
908         this.next(); // Skips ']'
909

910         return tok;
911     }
912
913     /**
914      * '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')'
915      */

916     protected RangeToken parseSetOperations() throws ParseException {
917         RangeToken tok = this.parseCharacterClass(false);
918         int type;
919         while ((type = this.read()) != T_RPAREN) {
920             int ch = this.chardata;
921             if (type == T_CHAR && (ch == '-' || ch == '&')
922                 || type == T_PLUS) {
923                 this.next();
924                 if (this.read() != T_LBRACKET) throw ex("parser.ope.1", this.offset-1);
925                 RangeToken t2 = this.parseCharacterClass(false);
926                 if (type == T_PLUS)
927                     tok.mergeRanges(t2);
928                 else if (ch == '-')
929                     tok.subtractRanges(t2);
930                 else if (ch == '&')
931                     tok.intersectRanges(t2);
932                 else
933                     throw new RuntimeException JavaDoc("ASSERT");
934             } else {
935                 throw ex("parser.ope.2", this.offset-1);
936             }
937         }
938         this.next();
939         return tok;
940     }
941
942     Token getTokenForShorthand(int ch) {
943         Token tok;
944         switch (ch) {
945           case 'd':
946             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
947                 ? Token.getRange("Nd", true) : Token.token_0to9;
948             break;
949           case 'D':
950             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
951                 ? Token.getRange("Nd", false) : Token.token_not_0to9;
952             break;
953           case 'w':
954             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
955                 ? Token.getRange("IsWord", true) : Token.token_wordchars;
956             break;
957           case 'W':
958             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
959                 ? Token.getRange("IsWord", false) : Token.token_not_wordchars;
960             break;
961           case 's':
962             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
963                 ? Token.getRange("IsSpace", true) : Token.token_spaces;
964             break;
965           case 'S':
966             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
967                 ? Token.getRange("IsSpace", false) : Token.token_not_spaces;
968             break;
969
970           default:
971             throw new RuntimeException JavaDoc("Internal Error: shorthands: \\u"+Integer.toString(ch, 16));
972         }
973         return tok;
974     }
975
976     /**
977      */

978     int decodeEscaped() throws ParseException {
979         if (this.read() != T_BACKSOLIDUS) throw ex("parser.next.1", this.offset-1);
980         int c = this.chardata;
981         switch (c) {
982           case 'e': c = 0x1b; break; // ESCAPE U+001B
983
case 'f': c = '\f'; break; // FORM FEED U+000C
984
case 'n': c = '\n'; break; // LINE FEED U+000A
985
case 'r': c = '\r'; break; // CRRIAGE RETURN U+000D
986
case 't': c = '\t'; break; // HORIZONTAL TABULATION U+0009
987
//case 'v': c = 0x0b; break; // VERTICAL TABULATION U+000B
988
case 'x':
989             this.next();
990             if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1);
991             if (this.chardata == '{') {
992                 int v1 = 0;
993                 int uv = 0;
994                 do {
995                     this.next();
996                     if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1);
997                     if ((v1 = hexChar(this.chardata)) < 0)
998                         break;
999                     if (uv > uv*16) throw ex("parser.descape.2", this.offset-1);
1000                    uv = uv*16+v1;
1001                } while (true);
1002                if (this.chardata != '}') throw ex("parser.descape.3", this.offset-1);
1003                if (uv > Token.UTF16_MAX) throw ex("parser.descape.4", this.offset-1);
1004                c = uv;
1005            } else {
1006                int v1 = 0;
1007                if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1008                    throw ex("parser.descape.1", this.offset-1);
1009                int uv = v1;
1010                this.next();
1011                if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1012                    throw ex("parser.descape.1", this.offset-1);
1013                uv = uv*16+v1;
1014                c = uv;
1015            }
1016            break;
1017
1018          case 'u':
1019            int v1 = 0;
1020            this.next();
1021            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1022                throw ex("parser.descape.1", this.offset-1);
1023            int uv = v1;
1024            this.next();
1025            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1026                throw ex("parser.descape.1", this.offset-1);
1027            uv = uv*16+v1;
1028            this.next();
1029            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1030                throw ex("parser.descape.1", this.offset-1);
1031            uv = uv*16+v1;
1032            this.next();
1033            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1034                throw ex("parser.descape.1", this.offset-1);
1035            uv = uv*16+v1;
1036            c = uv;
1037            break;
1038
1039          case 'v':
1040            this.next();
1041            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1042                throw ex("parser.descape.1", this.offset-1);
1043            uv = v1;
1044            this.next();
1045            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1046                throw ex("parser.descape.1", this.offset-1);
1047            uv = uv*16+v1;
1048            this.next();
1049            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1050                throw ex("parser.descape.1", this.offset-1);
1051            uv = uv*16+v1;
1052            this.next();
1053            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1054                throw ex("parser.descape.1", this.offset-1);
1055            uv = uv*16+v1;
1056            this.next();
1057            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1058                throw ex("parser.descape.1", this.offset-1);
1059            uv = uv*16+v1;
1060            this.next();
1061            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1062                throw ex("parser.descape.1", this.offset-1);
1063            uv = uv*16+v1;
1064            if (uv > Token.UTF16_MAX) throw ex("parser.descappe.4", this.offset-1);
1065            c = uv;
1066            break;
1067          case 'A':
1068          case 'Z':
1069          case 'z':
1070            throw ex("parser.descape.5", this.offset-2);
1071          default:
1072        }
1073        return c;
1074    }
1075
1076    static private final int hexChar(int ch) {
1077        if (ch < '0') return -1;
1078        if (ch > 'f') return -1;
1079        if (ch <= '9') return ch-'0';
1080        if (ch < 'A') return -1;
1081        if (ch <= 'F') return ch-'A'+10;
1082        if (ch < 'a') return -1;
1083        return ch-'a'+10;
1084    }
1085}
1086
Popular Tags