KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > sun > org > apache > xerces > internal > impl > xpath > regex > RegexParser


1 /*
2  * The Apache Software License, Version 1.1
3  *
4  *
5  * Copyright (c) 1999-2003 The Apache Software Foundation. All rights
6  * reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  * notice, this list of conditions and the following disclaimer.
14  *
15  * 2. Redistributions in binary form must reproduce the above copyright
16  * notice, this list of conditions and the following disclaimer in
17  * the documentation and/or other materials provided with the
18  * distribution.
19  *
20  * 3. The end-user documentation included with the redistribution,
21  * if any, must include the following acknowledgment:
22  * "This product includes software developed by the
23  * Apache Software Foundation (http://www.apache.org/)."
24  * Alternately, this acknowledgment may appear in the software itself,
25  * if and wherever such third-party acknowledgments normally appear.
26  *
27  * 4. The names "Xerces" and "Apache Software Foundation" must
28  * not be used to endorse or promote products derived from this
29  * software without prior written permission. For written
30  * permission, please contact apache@apache.org.
31  *
32  * 5. Products derived from this software may not be called "Apache",
33  * nor may "Apache" appear in their name, without prior written
34  * permission of the Apache Software Foundation.
35  *
36  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
37  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
39  * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
40  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
42  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
43  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
44  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
45  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
46  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
47  * SUCH DAMAGE.
48  * ====================================================================
49  *
50  * This software consists of voluntary contributions made by many
51  * individuals on behalf of the Apache Software Foundation and was
52  * originally based on software copyright (c) 1999, International
53  * Business Machines, Inc., http://www.apache.org. For more
54  * information on the Apache Software Foundation, please see
55  * <http://www.apache.org/>.
56  */

57
58 package com.sun.org.apache.xerces.internal.impl.xpath.regex;
59
60 import java.util.Locale JavaDoc;
61 import java.util.MissingResourceException JavaDoc;
62 import java.util.ResourceBundle JavaDoc;
63 import java.util.Vector JavaDoc;
64
65 /**
66  * A Regular Expression Parser.
67  *
68  * @version $Id: RegexParser.java,v 1.8 2003/03/25 14:47:06 sandygao Exp $
69  */

70 class RegexParser {
71     static final int T_CHAR = 0;
72     static final int T_EOF = 1;
73     static final int T_OR = 2; // '|'
74
static final int T_STAR = 3; // '*'
75
static final int T_PLUS = 4; // '+'
76
static final int T_QUESTION = 5; // '?'
77
static final int T_LPAREN = 6; // '('
78
static final int T_RPAREN = 7; // ')'
79
static final int T_DOT = 8; // '.'
80
static final int T_LBRACKET = 9; // '['
81
static final int T_BACKSOLIDUS = 10; // '\'
82
static final int T_CARET = 11; // '^'
83
static final int T_DOLLAR = 12; // '$'
84
static final int T_LPAREN2 = 13; // '(?:'
85
static final int T_LOOKAHEAD = 14; // '(?='
86
static final int T_NEGATIVELOOKAHEAD = 15; // '(?!'
87
static final int T_LOOKBEHIND = 16; // '(?<='
88
static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!'
89
static final int T_INDEPENDENT = 18; // '(?>'
90
static final int T_SET_OPERATIONS = 19; // '(?['
91
static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class
92
static final int T_COMMENT = 21; // '(?#'
93
static final int T_MODIFIERS = 22; // '(?' [\-,a-z,A-Z]
94
static final int T_CONDITION = 23; // '(?('
95
static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class
96

97     static class ReferencePosition {
98         int refNumber;
99         int position;
100         ReferencePosition(int n, int pos) {
101             this.refNumber = n;
102             this.position = pos;
103         }
104     }
105
106     int offset;
107     String JavaDoc regex;
108     int regexlen;
109     int options;
110     ResourceBundle JavaDoc resources;
111     int chardata;
112     int nexttoken;
113     static protected final int S_NORMAL = 0;
114     static protected final int S_INBRACKETS = 1;
115     static protected final int S_INXBRACKETS = 2;
116     int context = S_NORMAL;
117     int parennumber = 1;
118     boolean hasBackReferences;
119     Vector JavaDoc references = null;
120
/** Creates a parser whose error messages use the default locale. */
public RegexParser() {
    this(Locale.getDefault());
}
/**
 * Creates a parser whose error messages use the given locale.
 *
 * @param locale locale passed on to {@code setLocale}
 */
public RegexParser(Locale locale) {
    setLocale(locale);
}
127
128     public void setLocale(Locale JavaDoc locale) {
129         try {
130             this.resources = ResourceBundle.getBundle("com.sun.org.apache.xerces.internal.impl.xpath.regex.message", locale);
131         } catch (MissingResourceException JavaDoc mre) {
132             throw new RuntimeException JavaDoc("Installation Problem??? Couldn't load messages: "
133                                        +mre.getMessage());
134         }
135     }
136
137     final ParseException ex(String JavaDoc key, int loc) {
138         return new ParseException(this.resources.getString(key), loc);
139     }
140
141     private final boolean isSet(int flag) {
142         return (this.options & flag) == flag;
143     }
144
145     synchronized Token parse(String JavaDoc regex, int options) throws ParseException {
146         this.options = options;
147         this.offset = 0;
148         this.setContext(S_NORMAL);
149         this.parennumber = 1;
150         this.hasBackReferences = false;
151         this.regex = regex;
152         if (this.isSet(RegularExpression.EXTENDED_COMMENT))
153             this.regex = REUtil.stripExtendedComment(this.regex);
154         this.regexlen = this.regex.length();
155
156
157         this.next();
158         Token ret = this.parseRegex();
159         if (this.offset != this.regexlen)
160             throw ex("parser.parse.1", this.offset);
161         if (this.references != null) {
162             for (int i = 0; i < this.references.size(); i ++) {
163                 ReferencePosition position = (ReferencePosition)this.references.elementAt(i);
164                 if (this.parennumber <= position.refNumber)
165                     throw ex("parser.parse.2", position.position);
166             }
167             this.references.removeAllElements();
168         }
169         return ret;
170     }
171
172     /*
173     public RegularExpression createRegex(String regex, int options) throws ParseException {
174         Token tok = this.parse(regex, options);
175         return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options);
176     }
177     */

178
179     protected final void setContext(int con) {
180         this.context = con;
181     }
182
183     final int read() {
184         return this.nexttoken;
185     }
186
187     final void next() {
188         if (this.offset >= this.regexlen) {
189             this.chardata = -1;
190             this.nexttoken = T_EOF;
191             return;
192         }
193
194         int ret;
195         int ch = this.regex.charAt(this.offset++);
196         this.chardata = ch;
197
198         if (this.context == S_INBRACKETS) {
199             // In a character class, this.chardata has one character, that is to say,
200
// a pair of surrogates is composed and stored to this.chardata.
201
switch (ch) {
202               case '\\':
203                 ret = T_BACKSOLIDUS;
204                 if (this.offset >= this.regexlen)
205                     throw ex("parser.next.1", this.offset-1);
206                 this.chardata = this.regex.charAt(this.offset++);
207                 break;
208
209               case '-':
210                 if (this.isSet(RegularExpression.XMLSCHEMA_MODE)
211                     && this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') {
212                     this.offset++;
213                     ret = T_XMLSCHEMA_CC_SUBTRACTION;
214                 } else
215                     ret = T_CHAR;
216                 break;
217
218               case '[':
219                 if (!this.isSet(RegularExpression.XMLSCHEMA_MODE)
220                     && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') {
221                     this.offset++;
222                     ret = T_POSIX_CHARCLASS_START;
223                     break;
224                 } // Through down
225
default:
226                 if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) {
227                     int low = this.regex.charAt(this.offset);
228                     if (REUtil.isLowSurrogate(low)) {
229                         this.chardata = REUtil.composeFromSurrogates(ch, low);
230                         this.offset ++;
231                     }
232                 }
233                 ret = T_CHAR;
234             }
235             this.nexttoken = ret;
236             return;
237         }
238
239         switch (ch) {
240           case '|': ret = T_OR; break;
241           case '*': ret = T_STAR; break;
242           case '+': ret = T_PLUS; break;
243           case '?': ret = T_QUESTION; break;
244           case ')': ret = T_RPAREN; break;
245           case '.': ret = T_DOT; break;
246           case '[': ret = T_LBRACKET; break;
247           case '^': ret = T_CARET; break;
248           case '$': ret = T_DOLLAR; break;
249           case '(':
250             ret = T_LPAREN;
251             if (this.offset >= this.regexlen)
252                 break;
253             if (this.regex.charAt(this.offset) != '?')
254                 break;
255             if (++this.offset >= this.regexlen)
256                 throw ex("parser.next.2", this.offset-1);
257             ch = this.regex.charAt(this.offset++);
258             switch (ch) {
259               case ':': ret = T_LPAREN2; break;
260               case '=': ret = T_LOOKAHEAD; break;
261               case '!': ret = T_NEGATIVELOOKAHEAD; break;
262               case '[': ret = T_SET_OPERATIONS; break;
263               case '>': ret = T_INDEPENDENT; break;
264               case '<':
265                 if (this.offset >= this.regexlen)
266                     throw ex("parser.next.2", this.offset-3);
267                 ch = this.regex.charAt(this.offset++);
268                 if (ch == '=') {
269                     ret = T_LOOKBEHIND;
270                 } else if (ch == '!') {
271                     ret = T_NEGATIVELOOKBEHIND;
272                 } else
273                     throw ex("parser.next.3", this.offset-3);
274                 break;
275               case '#':
276                 while (this.offset < this.regexlen) {
277                     ch = this.regex.charAt(this.offset++);
278                     if (ch == ')') break;
279                 }
280                 if (ch != ')')
281                     throw ex("parser.next.4", this.offset-1);
282                 ret = T_COMMENT;
283                 break;
284               default:
285                 if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') {// Options
286
this.offset --;
287                     ret = T_MODIFIERS;
288                     break;
289                 } else if (ch == '(') { // conditional
290
ret = T_CONDITION; // this.offsets points the next of '('.
291
break;
292                 }
293                 throw ex("parser.next.2", this.offset-2);
294             }
295             break;
296             
297           case '\\':
298             ret = T_BACKSOLIDUS;
299             if (this.offset >= this.regexlen)
300                 throw ex("parser.next.1", this.offset-1);
301             this.chardata = this.regex.charAt(this.offset++);
302             break;
303
304           default:
305             ret = T_CHAR;
306         }
307         this.nexttoken = ret;
308     }
309
310     /**
311      * regex ::= term (`|` term)*
312      * term ::= factor+
313      * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
314      * | atom (('*' | '+' | '?' | minmax ) '?'? )?)
315      * | '(?=' regex ')' | '(?!' regex ')' | '(?&lt;=' regex ')' | '(?&lt;!' regex ')'
316      * atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
317      * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
318      */

319     Token parseRegex() throws ParseException {
320         Token tok = this.parseTerm();
321         Token parent = null;
322         while (this.read() == T_OR) {
323             this.next(); // '|'
324
if (parent == null) {
325                 parent = Token.createUnion();
326                 parent.addChild(tok);
327                 tok = parent;
328             }
329             tok.addChild(this.parseTerm());
330         }
331         return tok;
332     }
333
334     /**
335      * term ::= factor+
336      */

337     Token parseTerm() throws ParseException {
338         int ch = this.read();
339         if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) {
340             return Token.createEmpty();
341         } else {
342             Token tok = this.parseFactor();
343             Token concat = null;
344             while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) {
345                 if (concat == null) {
346                     concat = Token.createConcat();
347                     concat.addChild(tok);
348                     tok = concat;
349                 }
350                 concat.addChild(this.parseFactor());
351                 //tok = Token.createConcat(tok, this.parseFactor());
352
}
353             return tok;
354         }
355     }
356
357     // ----------------------------------------------------------------
358

359     Token processCaret() throws ParseException {
360         this.next();
361         return Token.token_linebeginning;
362     }
363     Token processDollar() throws ParseException {
364         this.next();
365         return Token.token_lineend;
366     }
367     Token processLookahead() throws ParseException {
368         this.next();
369         Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex());
370         if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
371         this.next(); // ')'
372
return tok;
373     }
374     Token processNegativelookahead() throws ParseException {
375         this.next();
376         Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex());
377         if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
378         this.next(); // ')'
379
return tok;
380     }
381     Token processLookbehind() throws ParseException {
382         this.next();
383         Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex());
384         if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
385         this.next(); // ')'
386
return tok;
387     }
388     Token processNegativelookbehind() throws ParseException {
389         this.next();
390         Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex());
391         if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
392         this.next(); // ')'
393
return tok;
394     }
395     Token processBacksolidus_A() throws ParseException {
396         this.next();
397         return Token.token_stringbeginning;
398     }
399     Token processBacksolidus_Z() throws ParseException {
400         this.next();
401         return Token.token_stringend2;
402     }
403     Token processBacksolidus_z() throws ParseException {
404         this.next();
405         return Token.token_stringend;
406     }
407     Token processBacksolidus_b() throws ParseException {
408         this.next();
409         return Token.token_wordedge;
410     }
411     Token processBacksolidus_B() throws ParseException {
412         this.next();
413         return Token.token_not_wordedge;
414     }
415     Token processBacksolidus_lt() throws ParseException {
416         this.next();
417         return Token.token_wordbeginning;
418     }
419     Token processBacksolidus_gt() throws ParseException {
420         this.next();
421         return Token.token_wordend;
422     }
423     Token processStar(Token tok) throws ParseException {
424         this.next();
425         if (this.read() == T_QUESTION) {
426             this.next();
427             return Token.createNGClosure(tok);
428         } else
429             return Token.createClosure(tok);
430     }
431     Token processPlus(Token tok) throws ParseException {
432         // X+ -> XX*
433
this.next();
434         if (this.read() == T_QUESTION) {
435             this.next();
436             return Token.createConcat(tok, Token.createNGClosure(tok));
437         } else
438             return Token.createConcat(tok, Token.createClosure(tok));
439     }
440     Token processQuestion(Token tok) throws ParseException {
441         // X? -> X|
442
this.next();
443         Token par = Token.createUnion();
444         if (this.read() == T_QUESTION) {
445             this.next();
446             par.addChild(Token.createEmpty());
447             par.addChild(tok);
448         } else {
449             par.addChild(tok);
450             par.addChild(Token.createEmpty());
451         }
452         return par;
453     }
454     boolean checkQuestion(int off) {
455         return off < this.regexlen && this.regex.charAt(off) == '?';
456     }
457     Token processParen() throws ParseException {
458         this.next();
459         int p = this.parennumber++;
460         Token tok = Token.createParen(this.parseRegex(), p);
461         if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
462         this.next(); // Skips ')'
463
return tok;
464     }
465     Token processParen2() throws ParseException {
466         this.next();
467         Token tok = Token.createParen(this.parseRegex(), 0);
468         if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
469         this.next(); // Skips ')'
470
return tok;
471     }
472     Token processCondition() throws ParseException {
473                                                 // this.offset points the next of '('
474
if (this.offset+1 >= this.regexlen) throw ex("parser.factor.4", this.offset);
475                                                 // Parses a condition.
476
int refno = -1;
477         Token condition = null;
478         int ch = this.regex.charAt(this.offset);
479         if ('1' <= ch && ch <= '9') {
480             refno = ch-'0';
481             this.hasBackReferences = true;
482             if (this.references == null) this.references = new Vector JavaDoc();
483             this.references.addElement(new ReferencePosition(refno, this.offset));
484             this.offset ++;
485             if (this.regex.charAt(this.offset) != ')') throw ex("parser.factor.1", this.offset);
486             this.offset ++;
487         } else {
488             if (ch == '?') this.offset --; // Points '('.
489
this.next();
490             condition = this.parseFactor();
491             switch (condition.type) {
492               case Token.LOOKAHEAD:
493               case Token.NEGATIVELOOKAHEAD:
494               case Token.LOOKBEHIND:
495               case Token.NEGATIVELOOKBEHIND:
496                 break;
497               case Token.ANCHOR:
498                 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
499                 break;
500               default:
501                 throw ex("parser.factor.5", this.offset);
502             }
503         }
504                                                 // Parses yes/no-patterns.
505
this.next();
506         Token yesPattern = this.parseRegex();
507         Token noPattern = null;
508         if (yesPattern.type == Token.UNION) {
509             if (yesPattern.size() != 2) throw ex("parser.factor.6", this.offset);
510             noPattern = yesPattern.getChild(1);
511             yesPattern = yesPattern.getChild(0);
512         }
513         if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
514         this.next();
515         return Token.createCondition(refno, condition, yesPattern, noPattern);
516     }
517     Token processModifiers() throws ParseException {
518                                                 // this.offset points the next of '?'.
519
// modifiers ::= [imsw]* ('-' [imsw]*)? ':'
520
int add = 0, mask = 0, ch = -1;
521         while (this.offset < this.regexlen) {
522             ch = this.regex.charAt(this.offset);
523             int v = REUtil.getOptionValue(ch);
524             if (v == 0) break; // '-' or ':'?
525
add |= v;
526             this.offset ++;
527         }
528         if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1);
529         if (ch == '-') {
530             this.offset ++;
531             while (this.offset < this.regexlen) {
532                 ch = this.regex.charAt(this.offset);
533                 int v = REUtil.getOptionValue(ch);
534                 if (v == 0) break; // ':'?
535
mask |= v;
536                 this.offset ++;
537             }
538             if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1);
539         }
540         Token tok;
541         if (ch == ':') {
542             this.offset ++;
543             this.next();
544             tok = Token.createModifierGroup(this.parseRegex(), add, mask);
545             if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
546             this.next();
547         } else if (ch == ')') { // such as (?-i)
548
this.offset ++;
549             this.next();
550             tok = Token.createModifierGroup(this.parseRegex(), add, mask);
551         } else
552             throw ex("parser.factor.3", this.offset);
553
554         return tok;
555     }
556     Token processIndependent() throws ParseException {
557         this.next();
558         Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex());
559         if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
560         this.next(); // Skips ')'
561
return tok;
562     }
563     Token processBacksolidus_c() throws ParseException {
564         int ch2; // Must be in 0x0040-0x005f
565
if (this.offset >= this.regexlen
566             || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040)
567             throw ex("parser.atom.1", this.offset-1);
568         this.next();
569         return Token.createChar(ch2-0x40);
570     }
571     Token processBacksolidus_C() throws ParseException {
572         throw ex("parser.process.1", this.offset);
573     }
574     Token processBacksolidus_i() throws ParseException {
575         Token tok = Token.createChar('i');
576         this.next();
577         return tok;
578     }
579     Token processBacksolidus_I() throws ParseException {
580         throw ex("parser.process.1", this.offset);
581     }
582     Token processBacksolidus_g() throws ParseException {
583         this.next();
584         return Token.getGraphemePattern();
585     }
586     Token processBacksolidus_X() throws ParseException {
587         this.next();
588         return Token.getCombiningCharacterSequence();
589     }
590     Token processBackreference() throws ParseException {
591         int refnum = this.chardata-'0';
592         Token tok = Token.createBackReference(refnum);
593         this.hasBackReferences = true;
594         if (this.references == null) this.references = new Vector JavaDoc();
595         this.references.addElement(new ReferencePosition(refnum, this.offset-2));
596         this.next();
597         return tok;
598     }
599
600     // ----------------------------------------------------------------
601

602     /**
603      * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
604      * | atom (('*' | '+' | '?' | minmax ) '?'? )?)
605      * | '(?=' regex ')' | '(?!' regex ')' | '(?&lt;=' regex ')' | '(?&lt;!' regex ')'
606      * | '(?#' [^)]* ')'
607      * minmax ::= '{' min (',' max?)? '}'
608      * min ::= [0-9]+
609      * max ::= [0-9]+
610      */

611     Token parseFactor() throws ParseException {
612         int ch = this.read();
613         Token tok;
614         switch (ch) {
615           case T_CARET: return this.processCaret();
616           case T_DOLLAR: return this.processDollar();
617           case T_LOOKAHEAD: return this.processLookahead();
618           case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead();
619           case T_LOOKBEHIND: return this.processLookbehind();
620           case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind();
621
622           case T_COMMENT:
623             this.next();
624             return Token.createEmpty();
625
626           case T_BACKSOLIDUS:
627             switch (this.chardata) {
628               case 'A': return this.processBacksolidus_A();
629               case 'Z': return this.processBacksolidus_Z();
630               case 'z': return this.processBacksolidus_z();
631               case 'b': return this.processBacksolidus_b();
632               case 'B': return this.processBacksolidus_B();
633               case '<': return this.processBacksolidus_lt();
634               case '>': return this.processBacksolidus_gt();
635             }
636                                                 // through down
637
}
638         tok = this.parseAtom();
639         ch = this.read();
640         switch (ch) {
641           case T_STAR: return this.processStar(tok);
642           case T_PLUS: return this.processPlus(tok);
643           case T_QUESTION: return this.processQuestion(tok);
644           case T_CHAR:
645             if (this.chardata == '{' && this.offset < this.regexlen) {
646
647                 int off = this.offset; // this.offset -> next of '{'
648
int min = 0, max = -1;
649
650                 if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
651
652                     min = ch -'0';
653                     while (off < this.regexlen
654                            && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
655                         min = min*10 +ch-'0';
656                         if (min < 0)
657                             throw ex("parser.quantifier.5", this.offset);
658                     }
659                 }
660                 else {
661                     throw ex("parser.quantifier.1", this.offset);
662                 }
663
664                 max = min;
665                 if (ch == ',') {
666
667                    if (off >= this.regexlen) {
668                        throw ex("parser.quantifier.3", this.offset);
669                    }
670                    else if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
671
672                         max = ch -'0'; // {min,max}
673
while (off < this.regexlen
674                                && (ch = this.regex.charAt(off++)) >= '0'
675                                && ch <= '9') {
676                             max = max*10 +ch-'0';
677                             if (max < 0)
678                                 throw ex("parser.quantifier.5", this.offset);
679                         }
680
681                         if (min > max)
682                             throw ex("parser.quantifier.4", this.offset);
683                    }
684                    else { // assume {min,}
685
max = -1;
686                     }
687                 }
688
689                if (ch != '}')
690                    throw ex("parser.quantifier.2", this.offset);
691
692                if (this.checkQuestion(off)) { // off -> next of '}'
693
tok = Token.createNGClosure(tok);
694                     this.offset = off+1;
695                 } else {
696                     tok = Token.createClosure(tok);
697                     this.offset = off;
698                 }
699
700                 tok.setMin(min);
701                 tok.setMax(max);
702                 //System.err.println("CLOSURE: "+min+", "+max);
703
this.next();
704             }
705         }
706         return tok;
707     }
708
709     /**
710      * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
711      * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
712      * | '(?>' regex ')'
713      * char ::= '\\' | '\' [efnrt] | bmp-code | character-1
714      */

715     Token parseAtom() throws ParseException {
716         int ch = this.read();
717         Token tok = null;
718         switch (ch) {
719           case T_LPAREN: return this.processParen();
720           case T_LPAREN2: return this.processParen2(); // '(?:'
721
case T_CONDITION: return this.processCondition(); // '(?('
722
case T_MODIFIERS: return this.processModifiers(); // (?modifiers ... )
723
case T_INDEPENDENT: return this.processIndependent();
724           case T_DOT:
725             this.next(); // Skips '.'
726
tok = Token.token_dot;
727             break;
728
729             /**
730              * char-class ::= '[' ( '^'? range ','?)+ ']'
731              * range ::= '\d' | '\w' | '\s' | category-block | range-char
732              * | range-char '-' range-char
733              * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
734              * bmp-char ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
735              */

736           case T_LBRACKET: return this.parseCharacterClass(true);
737           case T_SET_OPERATIONS: return this.parseSetOperations();
738
739           case T_BACKSOLIDUS:
740             switch (this.chardata) {
741               case 'd': case 'D':
742               case 'w': case 'W':
743               case 's': case 'S':
744                 tok = this.getTokenForShorthand(this.chardata);
745                 this.next();
746                 return tok;
747
748               case 'e': case 'f': case 'n': case 'r':
749               case 't': case 'u': case 'v': case 'x':
750                 {
751                     int ch2 = this.decodeEscaped();
752                     if (ch2 < 0x10000) {
753                         tok = Token.createChar(ch2);
754                     } else {
755                         tok = Token.createString(REUtil.decomposeToSurrogates(ch2));
756                     }
757                 }
758                 break;
759
760               case 'c': return this.processBacksolidus_c();
761               case 'C': return this.processBacksolidus_C();
762               case 'i': return this.processBacksolidus_i();
763               case 'I': return this.processBacksolidus_I();
764               case 'g': return this.processBacksolidus_g();
765               case 'X': return this.processBacksolidus_X();
766               case '1': case '2': case '3': case '4':
767               case '5': case '6': case '7': case '8': case '9':
768                 return this.processBackreference();
769
770               case 'P':
771               case 'p':
772                 int pstart = this.offset;
773                 tok = processBacksolidus_pP(this.chardata);
774                 if (tok == null) throw this.ex("parser.atom.5", pstart);
775                 break;
776
777               default:
778                 tok = Token.createChar(this.chardata);
779             }
780             this.next();
781             break;
782
783           case T_CHAR:
784             if (this.chardata == ']' || this.chardata == '{' || this.chardata == '}')
785                 throw this.ex("parser.atom.4", this.offset-1);
786             tok = Token.createChar(this.chardata);
787             int high = this.chardata;
788             this.next();
789             if (REUtil.isHighSurrogate(high)
790                 && this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) {
791                 char[] sur = new char[2];
792                 sur[0] = (char)high;
793                 sur[1] = (char)this.chardata;
794                 tok = Token.createParen(Token.createString(new String JavaDoc(sur)), 0);
795                 this.next();
796             }
797             break;
798
799           default:
800             throw this.ex("parser.atom.4", this.offset-1);
801         }
802         return tok;
803     }
804
805     protected RangeToken processBacksolidus_pP(int c) throws ParseException {
806
807         this.next();
808         if (this.read() != T_CHAR || this.chardata != '{')
809             throw this.ex("parser.atom.2", this.offset-1);
810
811         // handle category escape
812
boolean positive = c == 'p';
813         int namestart = this.offset;
814         int nameend = this.regex.indexOf('}', namestart);
815
816         if (nameend < 0)
817             throw this.ex("parser.atom.3", this.offset);
818
819         String JavaDoc pname = this.regex.substring(namestart, nameend);
820         this.offset = nameend+1;
821
822         return Token.getRange(pname, positive, this.isSet(RegularExpression.XMLSCHEMA_MODE));
823     }
824
825     int processCIinCharacterClass(RangeToken tok, int c) {
826         return this.decodeEscaped();
827     }
828
829     /**
830      * char-class ::= '[' ( '^'? range ','?)+ ']'
831      * range ::= '\d' | '\w' | '\s' | category-block | range-char
832      * | range-char '-' range-char
833      * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
834      * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
835      */

836     protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
837         this.setContext(S_INBRACKETS);
838         this.next(); // '['
839
boolean nrange = false;
840         RangeToken base = null;
841         RangeToken tok;
842         if (this.read() == T_CHAR && this.chardata == '^') {
843             nrange = true;
844             this.next(); // '^'
845
if (useNrange) {
846                 tok = Token.createNRange();
847             } else {
848                 base = Token.createRange();
849                 base.addRange(0, Token.UTF16_MAX);
850                 tok = Token.createRange();
851             }
852         } else {
853             tok = Token.createRange();
854         }
855         int type;
856         boolean firstloop = true;
857         while ((type = this.read()) != T_EOF) {
858             if (type == T_CHAR && this.chardata == ']' && !firstloop)
859                 break;
860             firstloop = false;
861             int c = this.chardata;
862             boolean end = false;
863             if (type == T_BACKSOLIDUS) {
864                 switch (c) {
865                   case 'd': case 'D':
866                   case 'w': case 'W':
867                   case 's': case 'S':
868                     tok.mergeRanges(this.getTokenForShorthand(c));
869                     end = true;
870                     break;
871
872                   case 'i': case 'I':
873                   case 'c': case 'C':
874                     c = this.processCIinCharacterClass(tok, c);
875                     if (c < 0) end = true;
876                     break;
877                     
878                   case 'p':
879                   case 'P':
880                     int pstart = this.offset;
881                     RangeToken tok2 = this.processBacksolidus_pP(c);
882                     if (tok2 == null) throw this.ex("parser.atom.5", pstart);
883                     tok.mergeRanges(tok2);
884                     end = true;
885                     break;
886
887                   default:
888                     c = this.decodeEscaped();
889                 } // \ + c
890
} // backsolidus
891
// POSIX Character class such as [:alnum:]
892
else if (type == T_POSIX_CHARCLASS_START) {
893                 int nameend = this.regex.indexOf(':', this.offset);
894                 if (nameend < 0) throw this.ex("parser.cc.1", this.offset);
895                 boolean positive = true;
896                 if (this.regex.charAt(this.offset) == '^') {
897                     this.offset ++;
898                     positive = false;
899                 }
900                 String JavaDoc name = this.regex.substring(this.offset, nameend);
901                 RangeToken range = Token.getRange(name, positive,
902                                                   this.isSet(RegularExpression.XMLSCHEMA_MODE));
903                 if (range == null) throw this.ex("parser.cc.3", this.offset);
904                 tok.mergeRanges(range);
905                 end = true;
906                 if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']')
907                     throw this.ex("parser.cc.1", nameend);
908                 this.offset = nameend+2;
909             }
910             this.next();
911             if (!end) { // if not shorthands...
912
if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'.
913
tok.addRange(c, c);
914                 } else {
915                     this.next(); // Skips '-'
916
if ((type = this.read()) == T_EOF) throw this.ex("parser.cc.2", this.offset);
917                     if (type == T_CHAR && this.chardata == ']') {
918                         tok.addRange(c, c);
919                         tok.addRange('-', '-');
920                     } else {
921                         int rangeend = this.chardata;
922                         if (type == T_BACKSOLIDUS)
923                             rangeend = this.decodeEscaped();
924                         this.next();
925                         tok.addRange(c, rangeend);
926                     }
927                 }
928             }
929             if (this.isSet(RegularExpression.SPECIAL_COMMA)
930                 && this.read() == T_CHAR && this.chardata == ',')
931                 this.next();
932         }
933         if (this.read() == T_EOF)
934             throw this.ex("parser.cc.2", this.offset);
935         if (!useNrange && nrange) {
936             base.subtractRanges(tok);
937             tok = base;
938         }
939         tok.sortRanges();
940         tok.compactRanges();
941         //tok.dumpRanges();
942
/*
943         if (this.isSet(RegularExpression.IGNORE_CASE))
944             tok = RangeToken.createCaseInsensitiveToken(tok);
945         */

946         this.setContext(S_NORMAL);
947         this.next(); // Skips ']'
948

949         return tok;
950     }
951
952     /**
953      * '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')'
954      */

955     protected RangeToken parseSetOperations() throws ParseException {
956         RangeToken tok = this.parseCharacterClass(false);
957         int type;
958         while ((type = this.read()) != T_RPAREN) {
959             int ch = this.chardata;
960             if (type == T_CHAR && (ch == '-' || ch == '&')
961                 || type == T_PLUS) {
962                 this.next();
963                 if (this.read() != T_LBRACKET) throw ex("parser.ope.1", this.offset-1);
964                 RangeToken t2 = this.parseCharacterClass(false);
965                 if (type == T_PLUS)
966                     tok.mergeRanges(t2);
967                 else if (ch == '-')
968                     tok.subtractRanges(t2);
969                 else if (ch == '&')
970                     tok.intersectRanges(t2);
971                 else
972                     throw new RuntimeException JavaDoc("ASSERT");
973             } else {
974                 throw ex("parser.ope.2", this.offset-1);
975             }
976         }
977         this.next();
978         return tok;
979     }
980
981     Token getTokenForShorthand(int ch) {
982         Token tok;
983         switch (ch) {
984           case 'd':
985             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
986                 ? Token.getRange("Nd", true) : Token.token_0to9;
987             break;
988           case 'D':
989             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
990                 ? Token.getRange("Nd", false) : Token.token_not_0to9;
991             break;
992           case 'w':
993             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
994                 ? Token.getRange("IsWord", true) : Token.token_wordchars;
995             break;
996           case 'W':
997             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
998                 ? Token.getRange("IsWord", false) : Token.token_not_wordchars;
999             break;
1000          case 's':
1001            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1002                ? Token.getRange("IsSpace", true) : Token.token_spaces;
1003            break;
1004          case 'S':
1005            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1006                ? Token.getRange("IsSpace", false) : Token.token_not_spaces;
1007            break;
1008
1009          default:
1010            throw new RuntimeException JavaDoc("Internal Error: shorthands: \\u"+Integer.toString(ch, 16));
1011        }
1012        return tok;
1013    }
1014
1015    /**
1016     */

1017    int decodeEscaped() throws ParseException {
1018        if (this.read() != T_BACKSOLIDUS) throw ex("parser.next.1", this.offset-1);
1019        int c = this.chardata;
1020        switch (c) {
1021          case 'e': c = 0x1b; break; // ESCAPE U+001B
1022
case 'f': c = '\f'; break; // FORM FEED U+000C
1023
case 'n': c = '\n'; break; // LINE FEED U+000A
1024
case 'r': c = '\r'; break; // CRRIAGE RETURN U+000D
1025
case 't': c = '\t'; break; // HORIZONTAL TABULATION U+0009
1026
//case 'v': c = 0x0b; break; // VERTICAL TABULATION U+000B
1027
case 'x':
1028            this.next();
1029            if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1);
1030            if (this.chardata == '{') {
1031                int v1 = 0;
1032                int uv = 0;
1033                do {
1034                    this.next();
1035                    if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1);
1036                    if ((v1 = hexChar(this.chardata)) < 0)
1037                        break;
1038                    if (uv > uv*16) throw ex("parser.descape.2", this.offset-1);
1039                    uv = uv*16+v1;
1040                } while (true);
1041                if (this.chardata != '}') throw ex("parser.descape.3", this.offset-1);
1042                if (uv > Token.UTF16_MAX) throw ex("parser.descape.4", this.offset-1);
1043                c = uv;
1044            } else {
1045                int v1 = 0;
1046                if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1047                    throw ex("parser.descape.1", this.offset-1);
1048                int uv = v1;
1049                this.next();
1050                if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1051                    throw ex("parser.descape.1", this.offset-1);
1052                uv = uv*16+v1;
1053                c = uv;
1054            }
1055            break;
1056
1057          case 'u':
1058            int v1 = 0;
1059            this.next();
1060            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1061                throw ex("parser.descape.1", this.offset-1);
1062            int uv = v1;
1063            this.next();
1064            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1065                throw ex("parser.descape.1", this.offset-1);
1066            uv = uv*16+v1;
1067            this.next();
1068            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1069                throw ex("parser.descape.1", this.offset-1);
1070            uv = uv*16+v1;
1071            this.next();
1072            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1073                throw ex("parser.descape.1", this.offset-1);
1074            uv = uv*16+v1;
1075            c = uv;
1076            break;
1077
1078          case 'v':
1079            this.next();
1080            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1081                throw ex("parser.descape.1", this.offset-1);
1082            uv = v1;
1083            this.next();
1084            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1085                throw ex("parser.descape.1", this.offset-1);
1086            uv = uv*16+v1;
1087            this.next();
1088            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1089                throw ex("parser.descape.1", this.offset-1);
1090            uv = uv*16+v1;
1091            this.next();
1092            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1093                throw ex("parser.descape.1", this.offset-1);
1094            uv = uv*16+v1;
1095            this.next();
1096            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1097                throw ex("parser.descape.1", this.offset-1);
1098            uv = uv*16+v1;
1099            this.next();
1100            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1101                throw ex("parser.descape.1", this.offset-1);
1102            uv = uv*16+v1;
1103            if (uv > Token.UTF16_MAX) throw ex("parser.descappe.4", this.offset-1);
1104            c = uv;
1105            break;
1106          case 'A':
1107          case 'Z':
1108          case 'z':
1109            throw ex("parser.descape.5", this.offset-2);
1110          default:
1111        }
1112        return c;
1113    }
1114
1115    static private final int hexChar(int ch) {
1116        if (ch < '0') return -1;
1117        if (ch > 'f') return -1;
1118        if (ch <= '9') return ch-'0';
1119        if (ch < 'A') return -1;
1120        if (ch <= 'F') return ch-'A'+10;
1121        if (ch < 'a') return -1;
1122        return ch-'a'+10;
1123    }
1124}
1125
Popular Tags