KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > enhydra > apache > xerces > utils > regex > RegexParser


1 /*
2  * The Apache Software License, Version 1.1
3  *
4  *
5  * Copyright (c) 1999,2000 The Apache Software Foundation. All rights
6  * reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  * notice, this list of conditions and the following disclaimer.
14  *
15  * 2. Redistributions in binary form must reproduce the above copyright
16  * notice, this list of conditions and the following disclaimer in
17  * the documentation and/or other materials provided with the
18  * distribution.
19  *
20  * 3. The end-user documentation included with the redistribution,
21  * if any, must include the following acknowledgment:
22  * "This product includes software developed by the
23  * Apache Software Foundation (http://www.apache.org/)."
24  * Alternately, this acknowledgment may appear in the software itself,
25  * if and wherever such third-party acknowledgments normally appear.
26  *
27  * 4. The names "Xerces" and "Apache Software Foundation" must
28  * not be used to endorse or promote products derived from this
29  * software without prior written permission. For written
30  * permission, please contact apache@apache.org.
31  *
32  * 5. Products derived from this software may not be called "Apache",
33  * nor may "Apache" appear in their name, without prior written
34  * permission of the Apache Software Foundation.
35  *
36  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
37  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
39  * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
40  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
42  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
43  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
44  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
45  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
46  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
47  * SUCH DAMAGE.
48  * ====================================================================
49  *
50  * This software consists of voluntary contributions made by many
51  * individuals on behalf of the Apache Software Foundation and was
52  * originally based on software copyright (c) 1999, International
53  * Business Machines, Inc., http://www.apache.org. For more
54  * information on the Apache Software Foundation, please see
55  * <http://www.apache.org/>.
56  */

57
58 package org.enhydra.apache.xerces.utils.regex;
59
60
61 import java.util.Locale JavaDoc;
62 import java.util.MissingResourceException JavaDoc;
63 import java.util.ResourceBundle JavaDoc;
64 import java.util.Vector JavaDoc;
65
66 /**
67  * A Regular Expression Parser.
68  */

69 class RegexParser {
70     static final int T_CHAR = 0;
71     static final int T_EOF = 1;
72     static final int T_OR = 2; // '|'
73
static final int T_STAR = 3; // '*'
74
static final int T_PLUS = 4; // '+'
75
static final int T_QUESTION = 5; // '?'
76
static final int T_LPAREN = 6; // '('
77
static final int T_RPAREN = 7; // ')'
78
static final int T_DOT = 8; // '.'
79
static final int T_LBRACKET = 9; // '['
80
static final int T_BACKSOLIDUS = 10; // '\'
81
static final int T_CARET = 11; // '^'
82
static final int T_DOLLAR = 12; // '$'
83
static final int T_LPAREN2 = 13; // '(?:'
84
static final int T_LOOKAHEAD = 14; // '(?='
85
static final int T_NEGATIVELOOKAHEAD = 15; // '(?!'
86
static final int T_LOOKBEHIND = 16; // '(?<='
87
static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!'
88
static final int T_INDEPENDENT = 18; // '(?>'
89
static final int T_SET_OPERATIONS = 19; // '(?['
90
static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class
91
static final int T_COMMENT = 21; // '(?#'
92
static final int T_MODIFIERS = 22; // '(?' [\-,a-z,A-Z]
93
static final int T_CONDITION = 23; // '(?('
94
static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class
95

96     static class ReferencePosition {
97         int refNumber;
98         int position;
99         ReferencePosition(int n, int pos) {
100             this.refNumber = n;
101             this.position = pos;
102         }
103     }
104
105     int offset;
106     String JavaDoc regex;
107     int regexlen;
108     int options;
109     ResourceBundle JavaDoc resources;
110     int chardata;
111     int nexttoken;
112     static protected final int S_NORMAL = 0;
113     static protected final int S_INBRACKETS = 1;
114     static protected final int S_INXBRACKETS = 2;
115     int context = S_NORMAL;
116     int parennumber = 1;
117     boolean hasBackReferences;
118     Vector JavaDoc references = null;
119
120     public RegexParser() {
121         this.setLocale(Locale.getDefault());
122     }
123     public RegexParser(Locale JavaDoc locale) {
124         this.setLocale(locale);
125     }
126
127     public void setLocale(Locale JavaDoc locale) {
128         try {
129             this.resources = ResourceBundle.getBundle("org.enhydra.apache.xerces.utils.regex.message", locale);
130         } catch (MissingResourceException JavaDoc mre) {
131             throw new RuntimeException JavaDoc("Installation Problem??? Couldn't load messages: "
132                                        +mre.getMessage());
133         }
134     }
135
136     final ParseException ex(String JavaDoc key, int loc) {
137         return new ParseException(this.resources.getString(key), loc);
138     }
139
140     private final boolean isSet(int flag) {
141         return (this.options & flag) == flag;
142     }
143
144     synchronized Token parse(String JavaDoc regex, int options) throws ParseException {
145
146         this.options = options;
147         this.offset = 0;
148         this.setContext(S_NORMAL);
149         this.parennumber = 1;
150         this.hasBackReferences = false;
151         this.regex = regex;
152         if (this.isSet(RegularExpression.EXTENDED_COMMENT))
153             this.regex = REUtil.stripExtendedComment(this.regex);
154         this.regexlen = this.regex.length();
155
156
157         this.next();
158         Token ret = this.parseRegex();
159         if (this.offset != this.regexlen)
160             throw ex("parser.parse.1", this.offset);
161         if (this.references != null) {
162             for (int i = 0; i < this.references.size(); i ++) {
163                 ReferencePosition position = (ReferencePosition)this.references.elementAt(i);
164                 if (this.parennumber <= position.refNumber)
165                     throw ex("parser.parse.2", position.position);
166             }
167             this.references.removeAllElements();
168         }
169         return ret;
170     }
171
172     /*
173     public RegularExpression createRegex(String regex, int options) throws ParseException {
174         Token tok = this.parse(regex, options);
175         return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options);
176     }
177     */

178
179     protected final void setContext(int con) {
180         this.context = con;
181     }
182
183     final int read() {
184         return this.nexttoken;
185     }
186
187     final void next() {
188         if (this.offset >= this.regexlen) {
189             this.chardata = -1;
190             this.nexttoken = T_EOF;
191             return;
192         }
193
194         int ret;
195         int ch = this.regex.charAt(this.offset++);
196         this.chardata = ch;
197
198         if (this.context == S_INBRACKETS) {
199             // In a character class, this.chardata has one character, that is to say,
200
// a pair of surrogates is composed and stored to this.chardata.
201
switch (ch) {
202               case '\\':
203                 ret = T_BACKSOLIDUS;
204                 if (this.offset >= this.regexlen)
205                     throw ex("parser.next.1", this.offset-1);
206                 this.chardata = this.regex.charAt(this.offset++);
207                 break;
208
209               case '-':
210                 if (this.isSet(RegularExpression.XMLSCHEMA_MODE)
211                     && this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') {
212                     this.offset++;
213                     ret = T_XMLSCHEMA_CC_SUBTRACTION;
214                 } else
215                     ret = T_CHAR;
216                 break;
217
218               case '[':
219                 if (!this.isSet(RegularExpression.XMLSCHEMA_MODE)
220                     && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') {
221                     this.offset++;
222                     ret = T_POSIX_CHARCLASS_START;
223                     break;
224                 } // Through down
225
default:
226                 if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) {
227                     int low = this.regex.charAt(this.offset);
228                     if (REUtil.isLowSurrogate(low)) {
229                         this.chardata = REUtil.composeFromSurrogates(ch, low);
230                         this.offset ++;
231                     }
232                 }
233                 ret = T_CHAR;
234             }
235             this.nexttoken = ret;
236             return;
237         }
238
239         switch (ch) {
240           case '|': ret = T_OR; break;
241           case '*': ret = T_STAR; break;
242           case '+': ret = T_PLUS; break;
243           case '?': ret = T_QUESTION; break;
244           case ')': ret = T_RPAREN; break;
245           case '.': ret = T_DOT; break;
246           case '[': ret = T_LBRACKET; break;
247           case '^': ret = T_CARET; break;
248           case '$': ret = T_DOLLAR; break;
249           case '(':
250             ret = T_LPAREN;
251             if (this.offset >= this.regexlen)
252                 break;
253             if (this.regex.charAt(this.offset) != '?')
254                 break;
255             if (++this.offset >= this.regexlen)
256                 throw ex("parser.next.2", this.offset-1);
257             ch = this.regex.charAt(this.offset++);
258             switch (ch) {
259               case ':': ret = T_LPAREN2; break;
260               case '=': ret = T_LOOKAHEAD; break;
261               case '!': ret = T_NEGATIVELOOKAHEAD; break;
262               case '[': ret = T_SET_OPERATIONS; break;
263               case '>': ret = T_INDEPENDENT; break;
264               case '<':
265                 if (this.offset >= this.regexlen)
266                     throw ex("parser.next.2", this.offset-3);
267                 ch = this.regex.charAt(this.offset++);
268                 if (ch == '=') {
269                     ret = T_LOOKBEHIND;
270                 } else if (ch == '!') {
271                     ret = T_NEGATIVELOOKBEHIND;
272                 } else
273                     throw ex("parser.next.3", this.offset-3);
274                 break;
275               case '#':
276                 while (this.offset < this.regexlen) {
277                     ch = this.regex.charAt(this.offset++);
278                     if (ch == ')') break;
279                 }
280                 if (ch != ')')
281                     throw ex("parser.next.4", this.offset-1);
282                 ret = T_COMMENT;
283                 break;
284               default:
285                 if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') {// Options
286
this.offset --;
287                     ret = T_MODIFIERS;
288                     break;
289                 } else if (ch == '(') { // conditional
290
ret = T_CONDITION; // this.offsets points the next of '('.
291
break;
292                 }
293                 throw ex("parser.next.2", this.offset-2);
294             }
295             break;
296             
297           case '\\':
298             ret = T_BACKSOLIDUS;
299             if (this.offset >= this.regexlen)
300                 throw ex("parser.next.1", this.offset-1);
301             this.chardata = this.regex.charAt(this.offset++);
302             break;
303
304           default:
305             ret = T_CHAR;
306             if (REUtil.isHighSurrogate(this.chardata) && this.offset < this.regexlen)
307                 this.chardata = REUtil.composeFromSurrogates(this.chardata,
308                                                              this.regex.charAt(this.offset++));
309         }
310         this.nexttoken = ret;
311     }
312
313     /**
314      * regex ::= term (`|` term)*
315      * term ::= factor+
316      * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
317      * | atom (('*' | '+' | '?' | minmax ) '?'? )?)
318      * | '(?=' regex ')' | '(?!' regex ')' | '(?&lt;=' regex ')' | '(?&lt;!' regex ')'
319      * atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
320      * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
321      */

322     Token parseRegex() throws ParseException {
323         Token tok = this.parseTerm();
324         Token parent = null;
325         while (this.read() == T_OR) {
326             this.next(); // '|'
327
if (parent == null) {
328                 parent = Token.createUnion();
329                 parent.addChild(tok);
330                 tok = parent;
331             }
332             tok.addChild(this.parseTerm());
333         }
334
335         return tok;
336     }
337
338     /**
339      * term ::= factor+
340      */

341     Token parseTerm() throws ParseException {
342         int ch = this.read();
343         if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) {
344             return Token.createEmpty();
345         } else {
346             Token tok = this.parseFactor();
347             Token concat = null;
348             while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) {
349                 if (concat == null) {
350                     concat = Token.createConcat();
351                     concat.addChild(tok);
352                     tok = concat;
353                 }
354                 concat.addChild(this.parseFactor());
355                 //tok = Token.createConcat(tok, this.parseFactor());
356
}
357             return tok;
358         }
359     }
360
361     // ----------------------------------------------------------------
362

363     Token processCaret() throws ParseException {
364         this.next();
365         return Token.token_linebeginning;
366     }
367     Token processDollar() throws ParseException {
368         this.next();
369         return Token.token_lineend;
370     }
371     Token processLookahead() throws ParseException {
372         this.next();
373         Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex());
374         if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
375         this.next(); // ')'
376
return tok;
377     }
378     Token processNegativelookahead() throws ParseException {
379         this.next();
380         Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex());
381         if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
382         this.next(); // ')'
383
return tok;
384     }
385     Token processLookbehind() throws ParseException {
386         this.next();
387         Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex());
388         if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
389         this.next(); // ')'
390
return tok;
391     }
392     Token processNegativelookbehind() throws ParseException {
393         this.next();
394         Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex());
395         if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
396         this.next(); // ')'
397
return tok;
398     }
399     Token processBacksolidus_A() throws ParseException {
400         this.next();
401         return Token.token_stringbeginning;
402     }
403     Token processBacksolidus_Z() throws ParseException {
404         this.next();
405         return Token.token_stringend2;
406     }
407     Token processBacksolidus_z() throws ParseException {
408         this.next();
409         return Token.token_stringend;
410     }
411     Token processBacksolidus_b() throws ParseException {
412         this.next();
413         return Token.token_wordedge;
414     }
415     Token processBacksolidus_B() throws ParseException {
416         this.next();
417         return Token.token_not_wordedge;
418     }
419     Token processBacksolidus_lt() throws ParseException {
420         this.next();
421         return Token.token_wordbeginning;
422     }
423     Token processBacksolidus_gt() throws ParseException {
424         this.next();
425         return Token.token_wordend;
426     }
427     Token processStar(Token tok) throws ParseException {
428         this.next();
429         if (this.read() == T_QUESTION) {
430             this.next();
431             return Token.createNGClosure(tok);
432         } else
433             return Token.createClosure(tok);
434     }
435     Token processPlus(Token tok) throws ParseException {
436         // X+ -> XX*
437
this.next();
438         if (this.read() == T_QUESTION) {
439             this.next();
440             return Token.createConcat(tok, Token.createNGClosure(tok));
441         } else
442             return Token.createConcat(tok, Token.createClosure(tok));
443     }
444     Token processQuestion(Token tok) throws ParseException {
445         // X? -> X|
446
this.next();
447         Token par = Token.createUnion();
448         if (this.read() == T_QUESTION) {
449             this.next();
450             par.addChild(Token.createEmpty());
451             par.addChild(tok);
452         } else {
453             par.addChild(tok);
454             par.addChild(Token.createEmpty());
455         }
456         return par;
457     }
458     boolean checkQuestion(int off) {
459         return off < this.regexlen && this.regex.charAt(off) == '?';
460     }
461     Token processParen() throws ParseException {
462         this.next();
463         int p = this.parennumber++;
464         Token tok = Token.createParen(this.parseRegex(), p);
465         if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
466         this.next(); // Skips ')'
467
return tok;
468     }
469     Token processParen2() throws ParseException {
470         this.next();
471         Token tok = Token.createParen(this.parseRegex(), 0);
472         if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
473         this.next(); // Skips ')'
474
return tok;
475     }
476     Token processCondition() throws ParseException {
477                                                 // this.offset points the next of '('
478
if (this.offset+1 >= this.regexlen) throw ex("parser.factor.4", this.offset);
479                                                 // Parses a condition.
480
int refno = -1;
481         Token condition = null;
482         int ch = this.regex.charAt(this.offset);
483         if ('1' <= ch && ch <= '9') {
484             refno = ch-'0';
485             this.hasBackReferences = true;
486             if (this.references == null) this.references = new Vector JavaDoc();
487             this.references.addElement(new ReferencePosition(refno, this.offset));
488             this.offset ++;
489             if (this.regex.charAt(this.offset) != ')') throw ex("parser.factor.1", this.offset);
490             this.offset ++;
491         } else {
492             if (ch == '?') this.offset --; // Points '('.
493
this.next();
494             condition = this.parseFactor();
495             switch (condition.type) {
496               case Token.LOOKAHEAD:
497               case Token.NEGATIVELOOKAHEAD:
498               case Token.LOOKBEHIND:
499               case Token.NEGATIVELOOKBEHIND:
500                 break;
501               case Token.ANCHOR:
502                 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
503                 break;
504               default:
505                 throw ex("parser.factor.5", this.offset);
506             }
507         }
508                                                 // Parses yes/no-patterns.
509
this.next();
510         Token yesPattern = this.parseRegex();
511         Token noPattern = null;
512         if (yesPattern.type == Token.UNION) {
513             if (yesPattern.size() != 2) throw ex("parser.factor.6", this.offset);
514             noPattern = yesPattern.getChild(1);
515             yesPattern = yesPattern.getChild(0);
516         }
517         if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
518         this.next();
519         return Token.createCondition(refno, condition, yesPattern, noPattern);
520     }
521     Token processModifiers() throws ParseException {
522                                                 // this.offset points the next of '?'.
523
// modifiers ::= [imsw]* ('-' [imsw]*)? ':'
524
int add = 0, mask = 0, ch = -1;
525         while (this.offset < this.regexlen) {
526             ch = this.regex.charAt(this.offset);
527             int v = REUtil.getOptionValue(ch);
528             if (v == 0) break; // '-' or ':'?
529
add |= v;
530             this.offset ++;
531         }
532         if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1);
533         if (ch == '-') {
534             this.offset ++;
535             while (this.offset < this.regexlen) {
536                 ch = this.regex.charAt(this.offset);
537                 int v = REUtil.getOptionValue(ch);
538                 if (v == 0) break; // ':'?
539
mask |= v;
540                 this.offset ++;
541             }
542             if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1);
543         }
544         Token tok;
545         if (ch == ':') {
546             this.offset ++;
547             this.next();
548             tok = Token.createModifierGroup(this.parseRegex(), add, mask);
549             if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
550             this.next();
551         } else if (ch == ')') { // such as (?-i)
552
this.offset ++;
553             this.next();
554             tok = Token.createModifierGroup(this.parseRegex(), add, mask);
555         } else
556             throw ex("parser.factor.3", this.offset);
557
558         return tok;
559     }
560     Token processIndependent() throws ParseException {
561         this.next();
562         Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex());
563         if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
564         this.next(); // Skips ')'
565
return tok;
566     }
567     Token processBacksolidus_c() throws ParseException {
568         int ch2; // Must be in 0x0040-0x005f
569
if (this.offset >= this.regexlen
570             || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040)
571             throw ex("parser.atom.1", this.offset-1);
572         this.next();
573         return Token.createChar(ch2-0x40);
574     }
575     Token processBacksolidus_C() throws ParseException {
576         throw ex("parser.process.1", this.offset);
577     }
578     Token processBacksolidus_i() throws ParseException {
579         Token tok = Token.createChar('i');
580         this.next();
581         return tok;
582     }
583     Token processBacksolidus_I() throws ParseException {
584         throw ex("parser.process.1", this.offset);
585     }
586     Token processBacksolidus_g() throws ParseException {
587         this.next();
588         return Token.getGraphemePattern();
589     }
590     Token processBacksolidus_X() throws ParseException {
591         this.next();
592         return Token.getCombiningCharacterSequence();
593     }
594     Token processBackreference() throws ParseException {
595         int refnum = this.chardata-'0';
596         Token tok = Token.createBackReference(refnum);
597         this.hasBackReferences = true;
598         if (this.references == null) this.references = new Vector JavaDoc();
599         this.references.addElement(new ReferencePosition(refnum, this.offset-2));
600         this.next();
601         return tok;
602     }
603
604     // ----------------------------------------------------------------
605

606     /**
607      * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
608      * | atom (('*' | '+' | '?' | minmax ) '?'? )?)
609      * | '(?=' regex ')' | '(?!' regex ')' | '(?&lt;=' regex ')' | '(?&lt;!' regex ')'
610      * | '(?#' [^)]* ')'
611      * minmax ::= '{' min (',' max?)? '}'
612      * min ::= [0-9]+
613      * max ::= [0-9]+
614      */

615     Token parseFactor() throws ParseException {
616         int ch = this.read();
617         Token tok;
618         switch (ch) {
619           case T_CARET: return this.processCaret();
620           case T_DOLLAR: return this.processDollar();
621           case T_LOOKAHEAD: return this.processLookahead();
622           case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead();
623           case T_LOOKBEHIND: return this.processLookbehind();
624           case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind();
625
626           case T_COMMENT:
627             this.next();
628             return Token.createEmpty();
629
630           case T_BACKSOLIDUS:
631             switch (this.chardata) {
632               case 'A': return this.processBacksolidus_A();
633               case 'Z': return this.processBacksolidus_Z();
634               case 'z': return this.processBacksolidus_z();
635               case 'b': return this.processBacksolidus_b();
636               case 'B': return this.processBacksolidus_B();
637               case '<': return this.processBacksolidus_lt();
638               case '>': return this.processBacksolidus_gt();
639             }
640                                                 // through down
641
}
642         tok = this.parseAtom();
643         ch = this.read();
644         switch (ch) {
645           case T_STAR: return this.processStar(tok);
646           case T_PLUS: return this.processPlus(tok);
647           case T_QUESTION: return this.processQuestion(tok);
648           case T_CHAR:
649             if (this.chardata == '{') {
650                                                 // this.offset -> next of '{'
651
int off = this.offset;
652                 int min = 0, max = -1;
653                 if (off >= this.regexlen) break;
654                 ch = this.regex.charAt(off++);
655                 if (ch < '0' || ch > '9') {
656                     throw new RuntimeException JavaDoc("Invalid quantifier '"+(char)ch+"' in " + regex);
657                 }
658                 min = ch-'0';
659                 while (off < this.regexlen
660                         && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
661                       min = min*10 +ch-'0';
662                       ch = -1;
663                 }
664                 max = min;
665                 if (ch!='}' && ch !=',' && (ch < '0' || ch > '9')) {
666                     throw new RuntimeException JavaDoc("Invalid quantifier '"+(char)ch+"' in " + regex);
667                 }
668                 //REVISIT: check for invalid quantifiers!
669
//
670

671                 else if (ch == ',') {
672                     if (ch == '}') {
673                           max = -1; // {min,}
674
} else {
675                         max = ch-'0'; // {min,max}
676
while (off < this.regexlen
677                                && (ch = this.regex.charAt(off++)) >= '0'
678                                && ch <= '9') {
679                             max = max*10 +ch-'0';
680                             ch = -1;
681                         }
682                         //if (min > max)
683
// throw new ParseException("parseFactor(): min > max: "+min+", "+max);
684

685                         if (ch !='}' && (ch < '0' || ch > '9')) {
686                             throw new RuntimeException JavaDoc( "Invalid quantifier '"+(char)ch+"' in" + regex);
687                         }
688                     }
689                 }
690                                                 // off -> next of '}'
691
if (this.checkQuestion(off)) {
692                     tok = Token.createNGClosure(tok);
693                     this.offset = off+1;
694                 } else {
695                     tok = Token.createClosure(tok);
696                     this.offset = off;
697                 }
698                 tok.setMin(min);
699                 tok.setMax(max);
700                 //System.err.println("CLOSURE: "+min+", "+max);
701
this.next();
702             }
703         }
704         return tok;
705     }
706
707     /**
708      * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
709      * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
710      * | '(?>' regex ')'
711      * char ::= '\\' | '\' [efnrt] | bmp-code | character-1
712      */

713     Token parseAtom() throws ParseException {
714         int ch = this.read();
715         Token tok = null;
716         switch (ch) {
717           case T_LPAREN: return this.processParen();
718           case T_LPAREN2: return this.processParen2(); // '(?:'
719
case T_CONDITION: return this.processCondition(); // '(?('
720
case T_MODIFIERS: return this.processModifiers(); // (?modifiers ... )
721
case T_INDEPENDENT: return this.processIndependent();
722           case T_DOT:
723             this.next(); // Skips '.'
724
tok = Token.token_dot;
725             break;
726
727             /**
728              * char-class ::= '[' ( '^'? range ','?)+ ']'
729              * range ::= '\d' | '\w' | '\s' | category-block | range-char
730              * | range-char '-' range-char
731              * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
732              * bmp-char ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
733              */

734           case T_LBRACKET: return this.parseCharacterClass(true);
735           case T_SET_OPERATIONS: return this.parseSetOperations();
736
737           case T_BACKSOLIDUS:
738             switch (this.chardata) {
739               case 'd': case 'D':
740               case 'w': case 'W':
741               case 's': case 'S':
742                 tok = this.getTokenForShorthand(this.chardata);
743                 this.next();
744                 return tok;
745
746               case 'e': case 'f': case 'n': case 'r':
747               case 't': case 'u': case 'v': case 'x':
748                 {
749                     int ch2 = this.decodeEscaped();
750                     if (ch2 < 0x10000) {
751                         tok = Token.createChar(ch2);
752                     } else {
753                         tok = Token.createString(REUtil.decomposeToSurrogates(ch2));
754                     }
755                 }
756                 break;
757
758               case 'c': return this.processBacksolidus_c();
759               case 'C': return this.processBacksolidus_C();
760               case 'i': return this.processBacksolidus_i();
761               case 'I': return this.processBacksolidus_I();
762               case 'g': return this.processBacksolidus_g();
763               case 'X': return this.processBacksolidus_X();
764               case '1': case '2': case '3': case '4':
765               case '5': case '6': case '7': case '8': case '9':
766                 return this.processBackreference();
767
768               case 'P':
769               case 'p':
770                 int pstart = this.offset;
771                 tok = processBacksolidus_pP(this.chardata);
772                 if (tok == null) throw this.ex("parser.atom.5", pstart);
773                 break;
774
775               default:
776                 tok = Token.createChar(this.chardata);
777             }
778             this.next();
779             break;
780
781           case T_CHAR:
782             tok = Token.createChar(this.chardata);
783             this.next();
784             break;
785
786           default:
787             throw this.ex("parser.atom.4", this.offset-1);
788         }
789         return tok;
790     }
791
792     protected RangeToken processBacksolidus_pP(int c) throws ParseException {
793         boolean positive = c == 'p';
794         this.next();
795         if (this.read() != T_CHAR) throw this.ex("parser.atom.2", this.offset-1);
796         RangeToken tok;
797         switch (this.chardata) {
798           case 'L': // Letter
799
tok = Token.getRange("L", positive); break;
800           case 'M': // Mark
801
tok = Token.getRange("M", positive); break;
802           case 'N': // Number
803
tok = Token.getRange("N", positive); break;
804           case 'Z': // Separator
805
tok = Token.getRange("Z", positive); break;
806           case 'C': // Other
807
tok = Token.getRange("C", positive); break;
808           case 'P': // Punctuation
809
tok = Token.getRange("P", positive); break;
810           case 'S': // Symbol
811
tok = Token.getRange("S", positive); break;
812           case '{':
813             // this.offset points the next of '{'.
814
//pstart = this.offset;
815
int namestart = this.offset;
816             int nameend = this.regex.indexOf('}', namestart);
817             if (nameend < 0) throw this.ex("parser.atom.3", this.offset);
818             this.offset = nameend+1;
819             tok = Token.getRange(this.regex.substring(namestart, nameend), positive);
820             /*
821               if (this.isSet(RegularExpression.IGNORE_CASE))
822               tok = RangeToken.createCaseInsensitiveToken(tok);
823             */

824             break;
825
826           default:
827             throw this.ex("parser.atom.2", this.offset-1);
828         }
829         return tok;
830     }
831
832     int processCIinCharacterClass(RangeToken tok, int c) {
833         return this.decodeEscaped();
834     }
835
836     /**
837      * char-class ::= '[' ( '^'? range ','?)+ ']'
838      * range ::= '\d' | '\w' | '\s' | category-block | range-char
839      * | range-char '-' range-char
840      * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
841      * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
842      */

843     protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
844         this.setContext(S_INBRACKETS);
845         this.next(); // '['
846
boolean nrange = false;
847         RangeToken base = null;
848         RangeToken tok;
849         if (this.read() == T_CHAR && this.chardata == '^') {
850             nrange = true;
851             this.next(); // '^'
852
if (useNrange) {
853                 tok = Token.createNRange();
854             } else {
855                 base = Token.createRange();
856                 base.addRange(0, Token.UTF16_MAX);
857                 tok = Token.createRange();
858             }
859         } else {
860             tok = Token.createRange();
861         }
862         int type;
863         boolean firstloop = true;
864         while ((type = this.read()) != T_EOF) {
865             if (type == T_CHAR && this.chardata == ']' && !firstloop)
866                 break;
867             firstloop = false;
868             int c = this.chardata;
869             boolean end = false;
870             if (type == T_BACKSOLIDUS) {
871                 switch (c) {
872                   case 'd': case 'D':
873                   case 'w': case 'W':
874                   case 's': case 'S':
875                     tok.mergeRanges(this.getTokenForShorthand(c));
876                     end = true;
877                     break;
878
879                   case 'i': case 'I':
880                   case 'c': case 'C':
881                     c = this.processCIinCharacterClass(tok, c);
882                     if (c < 0) end = true;
883                     break;
884                     
885                   case 'p':
886                   case 'P':
887                     int pstart = this.offset;
888                     RangeToken tok2 = this.processBacksolidus_pP(c);
889                     if (tok2 == null) throw this.ex("parser.atom.5", pstart);
890                     tok.mergeRanges(tok2);
891                     end = true;
892                     break;
893
894                   default:
895                     c = this.decodeEscaped();
896                 } // \ + c
897
} // backsolidus
898
// POSIX Character class such as [:alnum:]
899
else if (type == T_POSIX_CHARCLASS_START) {
900                 int nameend = this.regex.indexOf(':', this.offset);
901                 if (nameend < 0) throw this.ex("parser.cc.1", this.offset);
902                 boolean positive = true;
903                 if (this.regex.charAt(this.offset) == '^') {
904                     this.offset ++;
905                     positive = false;
906                 }
907                 String JavaDoc name = this.regex.substring(this.offset, nameend);
908                 RangeToken range = Token.getRange(name, positive);
909                 if (range == null) throw this.ex("parser.cc.3", this.offset);
910                 tok.mergeRanges(range);
911                 end = true;
912                 if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']')
913                     throw this.ex("parser.cc.1", nameend);
914                 this.offset = nameend+2;
915             }
916             this.next();
917             if (!end) { // if not shorthands...
918
if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'.
919
tok.addRange(c, c);
920                 } else {
921                     this.next(); // Skips '-'
922
if ((type = this.read()) == T_EOF) throw this.ex("parser.cc.2", this.offset);
923                     if (type == T_CHAR && this.chardata == ']') {
924                         tok.addRange(c, c);
925                         tok.addRange('-', '-');
926                     } else {
927                         int rangeend = this.chardata;
928                         if (type == T_BACKSOLIDUS)
929                             rangeend = this.decodeEscaped();
930                         this.next();
931                         tok.addRange(c, rangeend);
932                     }
933                 }
934             }
935             if (this.isSet(RegularExpression.SPECIAL_COMMA)
936                 && this.read() == T_CHAR && this.chardata == ',')
937                 this.next();
938         }
939         if (this.read() == T_EOF)
940             throw this.ex("parser.cc.2", this.offset);
941         if (!useNrange && nrange) {
942             base.subtractRanges(tok);
943             tok = base;
944         }
945         tok.sortRanges();
946         tok.compactRanges();
947         //tok.dumpRanges();
948
/*
949         if (this.isSet(RegularExpression.IGNORE_CASE))
950             tok = RangeToken.createCaseInsensitiveToken(tok);
951         */

952         this.setContext(S_NORMAL);
953         this.next(); // Skips ']'
954

955         return tok;
956     }
957     private RangeToken parseCharacterClass_old(boolean useNrange) throws ParseException {
958         this.setContext(S_INBRACKETS);
959         this.next(); // '['
960
boolean nrange = false;
961         RangeToken base = null;
962         RangeToken tok;
963         if (this.read() == T_CHAR && this.chardata == '^') {
964             nrange = true;
965             this.next(); // '^'
966
if (useNrange) {
967                 tok = Token.createNRange();
968             } else {
969                 base = Token.createRange();
970                 base.addRange(0, Token.UTF16_MAX);
971                 tok = Token.createRange();
972             }
973         } else {
974             tok = Token.createRange();
975         }
976         int type;
977         while ((type = this.read()) != T_EOF
978                && !(type == T_CHAR && this.chardata == ']')) {
979             int c = this.chardata;
980             /*
981             if (type == T_CHAR && c == '^') {
982                 this.next();
983                 type = this.read();
984                 c = this.chardata;
985                 if (type == T_EOF) break;
986
987                 nrange = !nrange;
988                 if (nrange)
989                     tok = Token.createRange();
990                 else {
991                     base.subtractRanges(tok);
992                     tok = base;
993                 }
994             }
995             */

996             boolean end = false;
997             if (type == T_BACKSOLIDUS) {
998                 switch (c) {
999                   case 'd': case 'D':
1000                  case 'w': case 'W':
1001                  case 's': case 'S':
1002                    tok.mergeRanges(this.getTokenForShorthand(c));
1003                    end = true;
1004                    break;
1005
1006                  case 'i': case 'I':
1007                  case 'c': case 'C':
1008                    c = this.processCIinCharacterClass(tok, c);
1009                    if (c < 0) end = true;
1010                    break;
1011                    
1012                  case 'p':
1013                  case 'P':
1014                    boolean positive = c == 'p';
1015                    int pstart = this.offset;
1016                    this.next();
1017                    if (this.read() != T_CHAR) throw ex("parser.atom.2", this.offset-1);
1018                    RangeToken tok2 = null;
1019                    switch (this.chardata) {
1020                      case 'L': // Letter
1021
tok2 = Token.getRange("L", positive); break;
1022                      case 'M': // Mark
1023
tok2 = Token.getRange("M", positive); break;
1024                      case 'N': // Number
1025
tok2 = Token.getRange("N", positive); break;
1026                      case 'Z': // Separator
1027
tok2 = Token.getRange("Z", positive); break;
1028                      case 'C': // Other
1029
tok2 = Token.getRange("C", positive); break;
1030                      case 'P': // Punctuation
1031
tok2 = Token.getRange("P", positive); break;
1032                      case 'S': // Symbol
1033
tok2 = Token.getRange("S", positive); break;
1034                      case '{':
1035                        // this.offset points the next of '{'.
1036
pstart = this.offset;
1037                        int namestart = this.offset;
1038                        int nameend = this.regex.indexOf('}', namestart);
1039                        if (nameend < 0) throw ex("parser.atom.3", this.offset);
1040                        this.offset = nameend+1;
1041                        tok2 = Token.getRange(this.regex.substring(namestart, nameend), positive);
1042                        break;
1043
1044                      default:
1045                        throw ex("parser.atom.2", this.offset-1);
1046                    }
1047                    if (tok2 == null) throw ex("parser.atom.5", pstart);
1048                    tok.mergeRanges(tok2);
1049                    end = true;
1050                    break;
1051
1052                  default:
1053                    c = this.decodeEscaped();
1054                } // \ + c
1055
} // backsolidus
1056
// POSIX Character class such as [:alnum:]
1057
else if (type == T_POSIX_CHARCLASS_START) {
1058                int nameend = this.regex.indexOf(':', this.offset);
1059                if (nameend < 0) throw ex("parser.cc.1", this.offset);
1060                String JavaDoc name = this.regex.substring(this.offset, nameend);
1061                RangeToken range = Token.getRange(name, true);
1062                if (range == null) throw ex("parser.cc.3", this.offset);
1063                tok.mergeRanges(range);
1064                end = true;
1065                if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']')
1066                    throw ex("parser.cc.1", nameend);
1067                this.offset = nameend+2;
1068            }
1069            this.next();
1070            if (!end) {
1071                if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'.
1072
tok.addRange(c, c);
1073                } else {
1074                    this.next(); // Skips '-'
1075
if ((type = this.read()) == T_EOF) throw ex("parser.cc.2", this.offset);
1076                    int rangeend = this.chardata;
1077                    if (type == T_BACKSOLIDUS)
1078                        rangeend = this.decodeEscaped();
1079                    this.next();
1080                    tok.addRange(c, rangeend);
1081                }
1082            }
1083            if (this.read() == T_CHAR && this.chardata == ',')
1084                this.next();
1085        }
1086        if (this.read() == T_EOF)
1087            throw ex("parser.cc.2", this.offset);
1088        if (!useNrange && nrange) {
1089            base.subtractRanges(tok);
1090            tok = base;
1091        }
1092        tok.sortRanges();
1093        tok.compactRanges();
1094        //tok.dumpRanges();
1095
/*
1096        if (this.isSet(RegularExpression.IGNORE_CASE))
1097            tok = RangeToken.createCaseInsensitiveToken(tok);
1098        */

1099        this.setContext(S_NORMAL);
1100        this.next(); // Skips ']'
1101

1102        return tok;
1103    }
1104
1105    /**
1106     * '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')'
1107     */

1108    protected RangeToken parseSetOperations() throws ParseException {
1109        RangeToken tok = this.parseCharacterClass(false);
1110        int type;
1111        while ((type = this.read()) != T_RPAREN) {
1112            int ch = this.chardata;
1113            if (type == T_CHAR && (ch == '-' || ch == '&')
1114                || type == T_PLUS) {
1115                this.next();
1116                if (this.read() != T_LBRACKET) throw ex("parser.ope.1", this.offset-1);
1117                RangeToken t2 = this.parseCharacterClass(false);
1118                if (type == T_PLUS)
1119                    tok.mergeRanges(t2);
1120                else if (ch == '-')
1121                    tok.subtractRanges(t2);
1122                else if (ch == '&')
1123                    tok.intersectRanges(t2);
1124                else
1125                    throw new RuntimeException JavaDoc("ASSERT");
1126            } else {
1127                throw ex("parser.ope.2", this.offset-1);
1128            }
1129        }
1130        this.next();
1131        return tok;
1132    }
1133
1134    Token getTokenForShorthand(int ch) {
1135        Token tok;
1136        switch (ch) {
1137          case 'd':
1138            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1139                ? Token.getRange("Nd", true) : Token.token_0to9;
1140            break;
1141          case 'D':
1142            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1143                ? Token.getRange("Nd", false) : Token.token_not_0to9;
1144            break;
1145          case 'w':
1146            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1147                ? Token.getRange("IsWord", true) : Token.token_wordchars;
1148            break;
1149          case 'W':
1150            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1151                ? Token.getRange("IsWord", false) : Token.token_not_wordchars;
1152            break;
1153          case 's':
1154            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1155                ? Token.getRange("IsSpace", true) : Token.token_spaces;
1156            break;
1157          case 'S':
1158            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1159                ? Token.getRange("IsSpace", false) : Token.token_not_spaces;
1160            break;
1161
1162          default:
1163            throw new RuntimeException JavaDoc("Internal Error: shorthands: \\u"+Integer.toString(ch, 16));
1164        }
1165        return tok;
1166    }
1167
1168    /**
1169     */

1170    int decodeEscaped() throws ParseException {
1171        if (this.read() != T_BACKSOLIDUS) throw ex("parser.next.1", this.offset-1);
1172        int c = this.chardata;
1173        switch (c) {
1174          case 'e': c = 0x1b; break; // ESCAPE U+001B
1175
case 'f': c = '\f'; break; // FORM FEED U+000C
1176
case 'n': c = '\n'; break; // LINE FEED U+000A
1177
case 'r': c = '\r'; break; // CRRIAGE RETURN U+000D
1178
case 't': c = '\t'; break; // HORIZONTAL TABULATION U+0009
1179
//case 'v': c = 0x0b; break; // VERTICAL TABULATION U+000B
1180
case 'x':
1181            this.next();
1182            if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1);
1183            if (this.chardata == '{') {
1184                int v1 = 0;
1185                int uv = 0;
1186                do {
1187                    this.next();
1188                    if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1);
1189                    if ((v1 = hexChar(this.chardata)) < 0)
1190                        break;
1191                    if (uv > uv*16) throw ex("parser.descape.2", this.offset-1);
1192                    uv = uv*16+v1;
1193                } while (true);
1194                if (this.chardata != '}') throw ex("parser.descape.3", this.offset-1);
1195                if (uv > Token.UTF16_MAX) throw ex("parser.descape.4", this.offset-1);
1196                c = uv;
1197            } else {
1198                int v1 = 0;
1199                if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1200                    throw ex("parser.descape.1", this.offset-1);
1201                int uv = v1;
1202                this.next();
1203                if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1204                    throw ex("parser.descape.1", this.offset-1);
1205                uv = uv*16+v1;
1206                c = uv;
1207            }
1208            break;
1209
1210          case 'u':
1211            int v1 = 0;
1212            this.next();
1213            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1214                throw ex("parser.descape.1", this.offset-1);
1215            int uv = v1;
1216            this.next();
1217            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1218                throw ex("parser.descape.1", this.offset-1);
1219            uv = uv*16+v1;
1220            this.next();
1221            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1222                throw ex("parser.descape.1", this.offset-1);
1223            uv = uv*16+v1;
1224            this.next();
1225            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1226                throw ex("parser.descape.1", this.offset-1);
1227            uv = uv*16+v1;
1228            c = uv;
1229            break;
1230
1231          case 'v':
1232            this.next();
1233            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1234                throw ex("parser.descape.1", this.offset-1);
1235            uv = v1;
1236            this.next();
1237            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1238                throw ex("parser.descape.1", this.offset-1);
1239            uv = uv*16+v1;
1240            this.next();
1241            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1242                throw ex("parser.descape.1", this.offset-1);
1243            uv = uv*16+v1;
1244            this.next();
1245            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1246                throw ex("parser.descape.1", this.offset-1);
1247            uv = uv*16+v1;
1248            this.next();
1249            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1250                throw ex("parser.descape.1", this.offset-1);
1251            uv = uv*16+v1;
1252            this.next();
1253            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1254                throw ex("parser.descape.1", this.offset-1);
1255            uv = uv*16+v1;
1256            if (uv > Token.UTF16_MAX) throw ex("parser.descappe.4", this.offset-1);
1257            c = uv;
1258            break;
1259          case 'A':
1260          case 'Z':
1261          case 'z':
1262            throw ex("parser.descape.5", this.offset-2);
1263          default:
1264        }
1265        return c;
1266    }
1267
1268    static private final int hexChar(int ch) {
1269        if (ch < '0') return -1;
1270        if (ch > 'f') return -1;
1271        if (ch <= '9') return ch-'0';
1272        if (ch < 'A') return -1;
1273        if (ch <= 'F') return ch-'A'+10;
1274        if (ch < 'a') return -1;
1275        return ch-'a'+10;
1276    }
1277}
1278
Popular Tags