RegexParser


1   /*
2    * The Apache Software License, Version 1.1
3    *
4    *
5    * Copyright (c) 1999-2003 The Apache Software Foundation.  All rights 
6    * reserved.
7    *
8    * Redistribution and use in source and binary forms, with or without
9    * modification, are permitted provided that the following conditions
10   * are met:
11   *
12   * 1. Redistributions of source code must retain the above copyright
13   *    notice, this list of conditions and the following disclaimer. 
14   *
15   * 2. Redistributions in binary form must reproduce the above copyright
16   *    notice, this list of conditions and the following disclaimer in
17   *    the documentation and/or other materials provided with the
18   *    distribution.
19   *
20   * 3. The end-user documentation included with the redistribution,
21   *    if any, must include the following acknowledgment:  
22   *       "This product includes software developed by the
23   *        Apache Software Foundation (http://www.apache.org/)."
24   *    Alternately, this acknowledgment may appear in the software itself,
25   *    if and wherever such third-party acknowledgments normally appear.
26   *
27   * 4. The names "Xerces" and "Apache Software Foundation" must
28   *    not be used to endorse or promote products derived from this
29   *    software without prior written permission. For written 
30   *    permission, please contact apache@apache.org.
31   *
32   * 5. Products derived from this software may not be called "Apache",
33   *    nor may "Apache" appear in their name, without prior written
34   *    permission of the Apache Software Foundation.
35   *
36   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
37   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
39   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
40   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
42   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
43   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
44   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
45   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
46   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
47   * SUCH DAMAGE.
48   * ====================================================================
49   *
50   * This software consists of voluntary contributions made by many
51   * individuals on behalf of the Apache Software Foundation and was
52   * originally based on software copyright (c) 1999, International
53   * Business Machines, Inc., http://www.apache.org.  For more
54   * information on the Apache Software Foundation, please see
55   * <http://www.apache.org/>.
56   */
57  
58  package com.sun.org.apache.xerces.internal.impl.xpath.regex;
59  
60  import java.util.Locale  ;
61  import java.util.MissingResourceException  ;
62  import java.util.ResourceBundle  ;
63  import java.util.Vector  ;
64  
65  /**
66   * A Regular Expression Parser.
67   *
68   * @version $Id: RegexParser.java,v 1.8 2003/03/25 14:47:06 sandygao Exp $
69   */
70  class RegexParser {
// Token types stored in nexttoken by next() and returned by read().
// Where present, the trailing comment shows the regex text that
// produces the token.
static final int T_CHAR = 0;                // a character next() gives no special meaning; value is in chardata
static final int T_EOF = 1;                 // end of the regex; chardata is -1
static final int T_OR = 2;                  // '|'
static final int T_STAR = 3;                // '*'
static final int T_PLUS = 4;                // '+'
static final int T_QUESTION = 5;            // '?'
static final int T_LPAREN = 6;              // '('
static final int T_RPAREN = 7;              // ')'
static final int T_DOT = 8;                 // '.'
static final int T_LBRACKET = 9;            // '['
static final int T_BACKSOLIDUS = 10;        // '\'
static final int T_CARET = 11;              // '^'
static final int T_DOLLAR = 12;             // '$'
static final int T_LPAREN2 = 13;            // '(?:'
static final int T_LOOKAHEAD = 14;          // '(?='
static final int T_NEGATIVELOOKAHEAD = 15;  // '(?!'
static final int T_LOOKBEHIND = 16;         // '(?<='
static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!'
static final int T_INDEPENDENT = 18;        // '(?>'
static final int T_SET_OPERATIONS = 19;     // '(?['
static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class
static final int T_COMMENT = 21;            // '(?#'
static final int T_MODIFIERS = 22;          // '(?' [\-,a-z,A-Z]
static final int T_CONDITION = 23;          // '(?('
static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class
96  
97      static class ReferencePosition {
98          int refNumber;
99          int position;
100         ReferencePosition(int n, int pos) {
101             this.refNumber = n;
102             this.position = pos;
103         }
104     }
105 
// ---- Tokenizer and parser state ----
int offset;                 // index in regex of the next character next() will read
String   regex;             // expression being parsed (after extended-comment stripping when that option is set)
int regexlen;               // regex.length(), cached by parse()
int options;                // option flags given to parse(); tested through isSet()
ResourceBundle   resources; // message texts looked up by ex(); loaded in setLocale()
int chardata;               // character value attached to the current token; -1 at end of input
int nexttoken;              // type (T_*) of the current token, as returned by read()
// Tokenizer contexts; next() switches to character-class rules when
// context is S_INBRACKETS.
static protected final int S_NORMAL = 0;
static protected final int S_INBRACKETS = 1;
static protected final int S_INXBRACKETS = 2; // NOTE(review): not referenced in the part of the class visible here
int context = S_NORMAL;
int parennumber = 1;        // number the next capturing group will receive
boolean hasBackReferences;  // set when a back reference or a numbered condition is parsed
Vector   references = null; // ReferencePosition entries checked at the end of parse(); created on first use
120 
121     public RegexParser() {
122         this.setLocale(Locale.getDefault());
123     }
/**
 * Creates a parser whose messages are loaded for the given locale.
 *
 * @param locale locale handed to {@link #setLocale(Locale)}
 */
public RegexParser(Locale locale) {
    setLocale(locale);
}
127 
128     public void setLocale(Locale   locale) {
129         try {
130             this.resources = ResourceBundle.getBundle("com.sun.org.apache.xerces.internal.impl.xpath.regex.message", locale);
131         } catch (MissingResourceException   mre) {
132             throw new RuntimeException  ("Installation Problem???  Couldn't load messages: "
133                                        +mre.getMessage());
134         }
135     }
136 
137     final ParseException ex(String   key, int loc) {
138         return new ParseException(this.resources.getString(key), loc);
139     }
140 
141     private final boolean isSet(int flag) {
142         return (this.options & flag) == flag;
143     }
144 
145     synchronized Token parse(String   regex, int options) throws ParseException {
146         this.options = options;
147         this.offset = 0;
148         this.setContext(S_NORMAL);
149         this.parennumber = 1;
150         this.hasBackReferences = false;
151         this.regex = regex;
152         if (this.isSet(RegularExpression.EXTENDED_COMMENT))
153             this.regex = REUtil.stripExtendedComment(this.regex);
154         this.regexlen = this.regex.length();
155 
156 
157         this.next();
158         Token ret = this.parseRegex();
159         if (this.offset != this.regexlen)
160             throw ex("parser.parse.1", this.offset);
161         if (this.references != null) {
162             for (int i = 0;  i < this.references.size();  i ++) {
163                 ReferencePosition position = (ReferencePosition)this.references.elementAt(i);
164                 if (this.parennumber <= position.refNumber)
165                     throw ex("parser.parse.2", position.position);
166             }
167             this.references.removeAllElements();
168         }
169         return ret;
170     }
171 
172     /*
173     public RegularExpression createRegex(String regex, int options) throws ParseException {
174         Token tok = this.parse(regex, options);
175         return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options);
176     }
177     */
178 
179     protected final void setContext(int con) {
180         this.context = con;
181     }
182 
183     final int read() {
184         return this.nexttoken;
185     }
186 
187     final void next() {
188         if (this.offset >= this.regexlen) {
189             this.chardata = -1;
190             this.nexttoken = T_EOF;
191             return;
192         }
193 
194         int ret;
195         int ch = this.regex.charAt(this.offset++);
196         this.chardata = ch;
197 
198         if (this.context == S_INBRACKETS) {
199             // In a character class, this.chardata has one character, that is to say,
200             // a pair of surrogates is composed and stored to this.chardata.
201             switch (ch) {
202               case '\\':
203                 ret = T_BACKSOLIDUS;
204                 if (this.offset >= this.regexlen)
205                     throw ex("parser.next.1", this.offset-1);
206                 this.chardata = this.regex.charAt(this.offset++);
207                 break;
208 
209               case '-':
210                 if (this.isSet(RegularExpression.XMLSCHEMA_MODE)
211                     && this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') {
212                     this.offset++;
213                     ret = T_XMLSCHEMA_CC_SUBTRACTION;
214                 } else
215                     ret = T_CHAR;
216                 break;
217 
218               case '[':
219                 if (!this.isSet(RegularExpression.XMLSCHEMA_MODE)
220                     && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') {
221                     this.offset++;
222                     ret = T_POSIX_CHARCLASS_START;
223                     break;
224                 } // Through down
225               default:
226                 if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) {
227                     int low = this.regex.charAt(this.offset);
228                     if (REUtil.isLowSurrogate(low)) {
229                         this.chardata = REUtil.composeFromSurrogates(ch, low);
230                         this.offset ++;
231                     }
232                 }
233                 ret = T_CHAR;
234             }
235             this.nexttoken = ret;
236             return;
237         }
238 
239         switch (ch) {
240           case '|': ret = T_OR;             break;
241           case '*': ret = T_STAR;           break;
242           case '+': ret = T_PLUS;           break;
243           case '?': ret = T_QUESTION;       break;
244           case ')': ret = T_RPAREN;         break;
245           case '.': ret = T_DOT;            break;
246           case '[': ret = T_LBRACKET;       break;
247           case '^': ret = T_CARET;          break;
248           case '$': ret = T_DOLLAR;         break;
249           case '(':
250             ret = T_LPAREN;
251             if (this.offset >= this.regexlen)
252                 break;
253             if (this.regex.charAt(this.offset) != '?')
254                 break;
255             if (++this.offset >= this.regexlen)
256                 throw ex("parser.next.2", this.offset-1);
257             ch = this.regex.charAt(this.offset++);
258             switch (ch) {
259               case ':':  ret = T_LPAREN2;            break;
260               case '=':  ret = T_LOOKAHEAD;          break;
261               case '!':  ret = T_NEGATIVELOOKAHEAD;  break;
262               case '[':  ret = T_SET_OPERATIONS;     break;
263               case '>':  ret = T_INDEPENDENT;        break;
264               case '<':
265                 if (this.offset >= this.regexlen)
266                     throw ex("parser.next.2", this.offset-3);
267                 ch = this.regex.charAt(this.offset++);
268                 if (ch == '=') {
269                     ret = T_LOOKBEHIND;
270                 } else if (ch == '!') {
271                     ret = T_NEGATIVELOOKBEHIND;
272                 } else
273                     throw ex("parser.next.3", this.offset-3);
274                 break;
275               case '#':
276                 while (this.offset < this.regexlen) {
277                     ch = this.regex.charAt(this.offset++);
278                     if (ch == ')')  break;
279                 }
280                 if (ch != ')')
281                     throw ex("parser.next.4", this.offset-1);
282                 ret = T_COMMENT;
283                 break;
284               default:
285                 if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') {// Options
286                     this.offset --;
287                     ret = T_MODIFIERS;
288                     break;
289                 } else if (ch == '(') {         // conditional
290                     ret = T_CONDITION;          // this.offsets points the next of '('.
291                     break;
292                 }
293                 throw ex("parser.next.2", this.offset-2);
294             }
295             break;
296             
297           case '\\':
298             ret = T_BACKSOLIDUS;
299             if (this.offset >= this.regexlen)
300                 throw ex("parser.next.1", this.offset-1);
301             this.chardata = this.regex.charAt(this.offset++);
302             break;
303 
304           default:
305             ret = T_CHAR;
306         }
307         this.nexttoken = ret;
308     }
309 
310     /**
311      * regex ::= term (`|` term)*
312      * term ::= factor+
313      * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
314      *            | atom (('*' | '+' | '?' | minmax ) '?'? )?)
315      *            | '(?=' regex ')'  | '(?!' regex ')'  | '(?&lt;=' regex ')'  | '(?&lt;!' regex ')'
316      * atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
317      *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block 
318      */
319     Token parseRegex() throws ParseException {
320         Token tok = this.parseTerm();
321         Token parent = null;
322         while (this.read() == T_OR) {
323             this.next();                    // '|'
324             if (parent == null) {
325                 parent = Token.createUnion();
326                 parent.addChild(tok);
327                 tok = parent;
328             }
329             tok.addChild(this.parseTerm());
330         }
331         return tok;
332     }
333 
334     /**
335      * term ::= factor+
336      */
337     Token parseTerm() throws ParseException {
338         int ch = this.read();
339         if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) {
340             return Token.createEmpty();
341         } else {
342             Token tok = this.parseFactor();
343             Token concat = null;
344             while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) {
345                 if (concat == null) {
346                     concat = Token.createConcat();
347                     concat.addChild(tok);
348                     tok = concat;
349                 }
350                 concat.addChild(this.parseFactor());
351                 //tok = Token.createConcat(tok, this.parseFactor());
352             }
353             return tok;
354         }
355     }
356 
357     // ----------------------------------------------------------------
358 
359     Token processCaret() throws ParseException {
360         this.next();
361         return Token.token_linebeginning;
362     }
363     Token processDollar() throws ParseException {
364         this.next();
365         return Token.token_lineend;
366     }
367     Token processLookahead() throws ParseException {
368         this.next();
369         Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex());
370         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
371         this.next();                            // ')'
372         return tok;
373     }
374     Token processNegativelookahead() throws ParseException {
375         this.next();
376         Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex());
377         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
378         this.next();                            // ')'
379         return tok;
380     }
381     Token processLookbehind() throws ParseException {
382         this.next();
383         Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex());
384         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
385         this.next();                            // ')'
386         return tok;
387     }
388     Token processNegativelookbehind() throws ParseException {
389         this.next();
390         Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex());
391         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
392         this.next();                    // ')'
393         return tok;
394     }
395     Token processBacksolidus_A() throws ParseException {
396         this.next();
397         return Token.token_stringbeginning;
398     }
399     Token processBacksolidus_Z() throws ParseException {
400         this.next();
401         return Token.token_stringend2;
402     }
403     Token processBacksolidus_z() throws ParseException {
404         this.next();
405         return Token.token_stringend;
406     }
407     Token processBacksolidus_b() throws ParseException {
408         this.next();
409         return Token.token_wordedge;
410     }
411     Token processBacksolidus_B() throws ParseException {
412         this.next();
413         return Token.token_not_wordedge;
414     }
415     Token processBacksolidus_lt() throws ParseException {
416         this.next();
417         return Token.token_wordbeginning;
418     }
419     Token processBacksolidus_gt() throws ParseException {
420         this.next();
421         return Token.token_wordend;
422     }
423     Token processStar(Token tok) throws ParseException {
424         this.next();
425         if (this.read() == T_QUESTION) {
426             this.next();
427             return Token.createNGClosure(tok);
428         } else
429             return Token.createClosure(tok);
430     }
431     Token processPlus(Token tok) throws ParseException {
432         // X+ -> XX*
433         this.next();
434         if (this.read() == T_QUESTION) {
435             this.next();
436             return Token.createConcat(tok, Token.createNGClosure(tok));
437         } else
438             return Token.createConcat(tok, Token.createClosure(tok));
439     }
440     Token processQuestion(Token tok) throws ParseException {
441         // X? -> X|
442         this.next();
443         Token par = Token.createUnion();
444         if (this.read() == T_QUESTION) {
445             this.next();
446             par.addChild(Token.createEmpty());
447             par.addChild(tok);
448         } else {
449             par.addChild(tok);
450             par.addChild(Token.createEmpty());
451         }
452         return par;
453     }
454     boolean checkQuestion(int off) {
455         return off < this.regexlen && this.regex.charAt(off) == '?';
456     }
457     Token processParen() throws ParseException {
458         this.next();
459         int p = this.parennumber++;
460         Token tok = Token.createParen(this.parseRegex(), p);
461         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
462         this.next();                            // Skips ')'
463         return tok;
464     }
465     Token processParen2() throws ParseException {
466         this.next();
467         Token tok = Token.createParen(this.parseRegex(), 0);
468         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
469         this.next();                            // Skips ')'
470         return tok;
471     }
472     Token processCondition() throws ParseException {
473                                                 // this.offset points the next of '('
474         if (this.offset+1 >= this.regexlen)  throw ex("parser.factor.4", this.offset);
475                                                 // Parses a condition.
476         int refno = -1;
477         Token condition = null;
478         int ch = this.regex.charAt(this.offset);
479         if ('1' <= ch && ch <= '9') {
480             refno = ch-'0';
481             this.hasBackReferences = true;
482             if (this.references == null)  this.references = new Vector  ();
483             this.references.addElement(new ReferencePosition(refno, this.offset));
484             this.offset ++;
485             if (this.regex.charAt(this.offset) != ')')  throw ex("parser.factor.1", this.offset);
486             this.offset ++;
487         } else {
488             if (ch == '?')  this.offset --; // Points '('.
489             this.next();
490             condition = this.parseFactor();
491             switch (condition.type) {
492               case Token.LOOKAHEAD:
493               case Token.NEGATIVELOOKAHEAD:
494               case Token.LOOKBEHIND:
495               case Token.NEGATIVELOOKBEHIND:
496                 break;
497               case Token.ANCHOR:
498                 if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
499                 break;
500               default:
501                 throw ex("parser.factor.5", this.offset);
502             }
503         }
504                                                 // Parses yes/no-patterns.
505         this.next();
506         Token yesPattern = this.parseRegex();
507         Token noPattern = null;
508         if (yesPattern.type == Token.UNION) {
509             if (yesPattern.size() != 2)  throw ex("parser.factor.6", this.offset);
510             noPattern = yesPattern.getChild(1);
511             yesPattern = yesPattern.getChild(0);
512         }
513         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
514         this.next();
515         return Token.createCondition(refno, condition, yesPattern, noPattern);
516     }
517     Token processModifiers() throws ParseException {
518                                                 // this.offset points the next of '?'.
519                                                 // modifiers ::= [imsw]* ('-' [imsw]*)? ':'
520         int add = 0, mask = 0, ch = -1;
521         while (this.offset < this.regexlen) {
522             ch = this.regex.charAt(this.offset);
523             int v = REUtil.getOptionValue(ch);
524             if (v == 0)  break;                 // '-' or ':'?
525             add |= v;
526             this.offset ++;
527         }
528         if (this.offset >= this.regexlen)  throw ex("parser.factor.2", this.offset-1);
529         if (ch == '-') {
530             this.offset ++;
531             while (this.offset < this.regexlen) {
532                 ch = this.regex.charAt(this.offset);
533                 int v = REUtil.getOptionValue(ch);
534                 if (v == 0)  break;             // ':'?
535                 mask |= v;
536                 this.offset ++;
537             }
538             if (this.offset >= this.regexlen)  throw ex("parser.factor.2", this.offset-1);
539         }
540         Token tok;
541         if (ch == ':') {
542             this.offset ++;
543             this.next();
544             tok = Token.createModifierGroup(this.parseRegex(), add, mask);
545             if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
546             this.next();
547         } else if (ch == ')') {                 // such as (?-i)
548             this.offset ++;
549             this.next();
550             tok = Token.createModifierGroup(this.parseRegex(), add, mask);
551         } else
552             throw ex("parser.factor.3", this.offset);
553 
554         return tok;
555     }
556     Token processIndependent() throws ParseException {
557         this.next();
558         Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex());
559         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
560         this.next();                            // Skips ')'
561         return tok;
562     }
563     Token processBacksolidus_c() throws ParseException {
564         int ch2;                                // Must be in 0x0040-0x005f
565         if (this.offset >= this.regexlen
566             || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040)
567             throw ex("parser.atom.1", this.offset-1);
568         this.next();
569         return Token.createChar(ch2-0x40);
570     }
571     Token processBacksolidus_C() throws ParseException {
572         throw ex("parser.process.1", this.offset);
573     }
574     Token processBacksolidus_i() throws ParseException {
575         Token tok = Token.createChar('i');
576         this.next();
577         return tok;
578     }
579     Token processBacksolidus_I() throws ParseException {
580         throw ex("parser.process.1", this.offset);
581     }
582     Token processBacksolidus_g() throws ParseException {
583         this.next();
584         return Token.getGraphemePattern();
585     }
586     Token processBacksolidus_X() throws ParseException {
587         this.next();
588         return Token.getCombiningCharacterSequence();
589     }
590     Token processBackreference() throws ParseException {
591         int refnum = this.chardata-'0';
592         Token tok = Token.createBackReference(refnum);
593         this.hasBackReferences = true;
594         if (this.references == null)  this.references = new Vector  ();
595         this.references.addElement(new ReferencePosition(refnum, this.offset-2));
596         this.next();
597         return tok;
598     }
599 
600     // ----------------------------------------------------------------
601 
602     /**
603      * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
604      *            | atom (('*' | '+' | '?' | minmax ) '?'? )?)
605      *            | '(?=' regex ')'  | '(?!' regex ')'  | '(?&lt;=' regex ')'  | '(?&lt;!' regex ')'
606      *            | '(?#' [^)]* ')'
607      * minmax ::= '{' min (',' max?)? '}'
608      * min ::= [0-9]+
609      * max ::= [0-9]+
610      */
611     Token parseFactor() throws ParseException {        
612         int ch = this.read();
613         Token tok;
614         switch (ch) {
615           case T_CARET:         return this.processCaret();
616           case T_DOLLAR:        return this.processDollar();
617           case T_LOOKAHEAD:     return this.processLookahead();
618           case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead();
619           case T_LOOKBEHIND:    return this.processLookbehind();
620           case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind();
621 
622           case T_COMMENT:
623             this.next();
624             return Token.createEmpty();
625 
626           case T_BACKSOLIDUS:
627             switch (this.chardata) {
628               case 'A': return this.processBacksolidus_A();
629               case 'Z': return this.processBacksolidus_Z();
630               case 'z': return this.processBacksolidus_z();
631               case 'b': return this.processBacksolidus_b();
632               case 'B': return this.processBacksolidus_B();
633               case '<': return this.processBacksolidus_lt();
634               case '>': return this.processBacksolidus_gt();
635             }
636                                                 // through down
637         }
638         tok = this.parseAtom();
639         ch = this.read();
640         switch (ch) {
641           case T_STAR:  return this.processStar(tok);
642           case T_PLUS:  return this.processPlus(tok);
643           case T_QUESTION: return this.processQuestion(tok);
644           case T_CHAR:
645             if (this.chardata == '{' && this.offset < this.regexlen) {
646 
647                 int off = this.offset;          // this.offset -> next of '{'
648                 int min = 0, max = -1;
649 
650                 if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
651 
652                     min = ch -'0';
653                     while (off < this.regexlen
654                            && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
655                         min = min*10 +ch-'0';
656                         if (min < 0)
657                             throw ex("parser.quantifier.5", this.offset);
658                     }
659                 }
660                 else {
661                     throw ex("parser.quantifier.1", this.offset);
662                 }
663 
664                 max = min;
665                 if (ch == ',') {
666 
667                    if (off >= this.regexlen) {
668                        throw ex("parser.quantifier.3", this.offset);
669                    }
670                    else if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {                       
671 
672                         max = ch -'0';       // {min,max}
673                         while (off < this.regexlen
674                                && (ch = this.regex.charAt(off++)) >= '0'
675                                && ch <= '9') {
676                             max = max*10 +ch-'0';
677                             if (max < 0)
678                                 throw ex("parser.quantifier.5", this.offset);
679                         }
680 
681                         if (min > max)
682                             throw ex("parser.quantifier.4", this.offset);
683                    }
684                    else { // assume {min,}
685                         max = -1;           
686                     }
687                 }
688 
689                if (ch != '}')
690                    throw ex("parser.quantifier.2", this.offset);
691 
692                if (this.checkQuestion(off)) {  // off -> next of '}'
693                     tok = Token.createNGClosure(tok);
694                     this.offset = off+1;
695                 } else {
696                     tok = Token.createClosure(tok);
697                     this.offset = off;
698                 }
699 
700                 tok.setMin(min);
701                 tok.setMax(max);
702                 //System.err.println("CLOSURE: "+min+", "+max);
703                 this.next();
704             }
705         }
706         return tok;
707     }
708 
709     /**
710      * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
711      *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
712      *          | '(?>' regex ')'
713      * char ::= '\\' | '\' [efnrt] | bmp-code | character-1
714      */
715     Token parseAtom() throws ParseException {
716         int ch = this.read();
717         Token tok = null;
718         switch (ch) {
719           case T_LPAREN:        return this.processParen();
720           case T_LPAREN2:       return this.processParen2(); // '(?:'
721           case T_CONDITION:     return this.processCondition(); // '(?('
722           case T_MODIFIERS:     return this.processModifiers(); // (?modifiers ... )
723           case T_INDEPENDENT:   return this.processIndependent();
724           case T_DOT:
725             this.next();                    // Skips '.'
726             tok = Token.token_dot;
727             break;
728 
729             /**
730              * char-class ::= '[' ( '^'? range ','?)+ ']'
731              * range ::= '\d' | '\w' | '\s' | category-block | range-char
732              *           | range-char '-' range-char
733              * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
734              * bmp-char ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
735              */
736           case T_LBRACKET:      return this.parseCharacterClass(true);
737           case T_SET_OPERATIONS: return this.parseSetOperations();
738 
739           case T_BACKSOLIDUS:
740             switch (this.chardata) {
741               case 'd':  case 'D':
742               case 'w':  case 'W':
743               case 's':  case 'S':
744                 tok = this.getTokenForShorthand(this.chardata);
745                 this.next();
746                 return tok;
747 
748               case 'e':  case 'f':  case 'n':  case 'r':
749               case 't':  case 'u':  case 'v':  case 'x':
750                 {
751                     int ch2 = this.decodeEscaped();
752                     if (ch2 < 0x10000) {
753                         tok = Token.createChar(ch2);
754                     } else {
755                         tok = Token.createString(REUtil.decomposeToSurrogates(ch2));
756                     }
757                 }
758                 break;
759 
760               case 'c': return this.processBacksolidus_c();
761               case 'C': return this.processBacksolidus_C();
762               case 'i': return this.processBacksolidus_i();
763               case 'I': return this.processBacksolidus_I();
764               case 'g': return this.processBacksolidus_g();
765               case 'X': return this.processBacksolidus_X();
766               case '1':  case '2':  case '3':  case '4':
767               case '5':  case '6':  case '7':  case '8':  case '9':
768                 return this.processBackreference();
769 
770               case 'P':
771               case 'p':
772                 int pstart = this.offset;
773                 tok = processBacksolidus_pP(this.chardata);
774                 if (tok == null)  throw this.ex("parser.atom.5", pstart);
775                 break;
776 
777               default:
778                 tok = Token.createChar(this.chardata);
779             }
780             this.next();
781             break;
782 
783           case T_CHAR:
784             if (this.chardata == ']' || this.chardata == '{' || this.chardata == '}')
785                 throw this.ex("parser.atom.4", this.offset-1);
786             tok = Token.createChar(this.chardata);
787             int high = this.chardata;
788             this.next();
789             if (REUtil.isHighSurrogate(high)
790                 && this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) {
791                 char[] sur = new char[2];
792                 sur[0] = (char)high;
793                 sur[1] = (char)this.chardata;
794                 tok = Token.createParen(Token.createString(new String  (sur)), 0);
795                 this.next();
796             }
797             break;
798 
799           default:
800             throw this.ex("parser.atom.4", this.offset-1);
801         }
802         return tok;
803     }
804 
805     protected RangeToken processBacksolidus_pP(int c) throws ParseException {
806 
807         this.next();
808         if (this.read() != T_CHAR || this.chardata != '{')
809             throw this.ex("parser.atom.2", this.offset-1);
810 
811         // handle category escape
812         boolean positive = c == 'p';
813         int namestart = this.offset;
814         int nameend = this.regex.indexOf('}', namestart);
815 
816         if (nameend < 0)
817             throw this.ex("parser.atom.3", this.offset);
818 
819         String   pname = this.regex.substring(namestart, nameend);
820         this.offset = nameend+1;
821 
822         return Token.getRange(pname, positive, this.isSet(RegularExpression.XMLSCHEMA_MODE));
823     }
824 
825     int processCIinCharacterClass(RangeToken tok, int c) {
826         return this.decodeEscaped();
827     }
828 
829     /**
830      * char-class ::= '[' ( '^'? range ','?)+ ']'
831      * range ::= '\d' | '\w' | '\s' | category-block | range-char
832      *           | range-char '-' range-char
833      * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
834      * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
835      */
836     protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
837         this.setContext(S_INBRACKETS);
838         this.next();                            // '['
839         boolean nrange = false;
840         RangeToken base = null;
841         RangeToken tok;
842         if (this.read() == T_CHAR && this.chardata == '^') {
843             nrange = true;
844             this.next();                        // '^'
845             if (useNrange) {
846                 tok = Token.createNRange();
847             } else {
848                 base = Token.createRange();
849                 base.addRange(0, Token.UTF16_MAX);
850                 tok = Token.createRange();
851             }
852         } else {
853             tok = Token.createRange();
854         }
855         int type;
856         boolean firstloop = true;
857         while ((type = this.read()) != T_EOF) {
858             if (type == T_CHAR && this.chardata == ']' && !firstloop)
859                 break;
860             firstloop = false;
861             int c = this.chardata;
862             boolean end = false;
863             if (type == T_BACKSOLIDUS) {
864                 switch (c) {
865                   case 'd':  case 'D':
866                   case 'w':  case 'W':
867                   case 's':  case 'S':
868                     tok.mergeRanges(this.getTokenForShorthand(c));
869                     end = true;
870                     break;
871 
872                   case 'i':  case 'I':
873                   case 'c':  case 'C':
874                     c = this.processCIinCharacterClass(tok, c);
875                     if (c < 0)  end = true;
876                     break;
877                     
878                   case 'p':
879                   case 'P':
880                     int pstart = this.offset;
881                     RangeToken tok2 = this.processBacksolidus_pP(c);
882                     if (tok2 == null)  throw this.ex("parser.atom.5", pstart);
883                     tok.mergeRanges(tok2);
884                     end = true;
885                     break;
886 
887                   default:
888                     c = this.decodeEscaped();
889                 } // \ + c
890             } // backsolidus
891                                                 // POSIX Character class such as [:alnum:]
892             else if (type == T_POSIX_CHARCLASS_START) {
893                 int nameend = this.regex.indexOf(':', this.offset);
894                 if (nameend < 0) throw this.ex("parser.cc.1", this.offset);
895                 boolean positive = true;
896                 if (this.regex.charAt(this.offset) == '^') {
897                     this.offset ++;
898                     positive = false;
899                 }
900                 String   name = this.regex.substring(this.offset, nameend);
901                 RangeToken range = Token.getRange(name, positive,
902                                                   this.isSet(RegularExpression.XMLSCHEMA_MODE));
903                 if (range == null)  throw this.ex("parser.cc.3", this.offset);
904                 tok.mergeRanges(range);
905                 end = true;
906                 if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']')
907                     throw this.ex("parser.cc.1", nameend);
908                 this.offset = nameend+2;
909             }
910             this.next();
911             if (!end) {                         // if not shorthands...
912                 if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'.
913                     tok.addRange(c, c);
914                 } else {
915                     this.next(); // Skips '-'
916                     if ((type = this.read()) == T_EOF)  throw this.ex("parser.cc.2", this.offset);
917                     if (type == T_CHAR && this.chardata == ']') {
918                         tok.addRange(c, c);
919                         tok.addRange('-', '-');
920                     } else {
921                         int rangeend = this.chardata;
922                         if (type == T_BACKSOLIDUS)
923                             rangeend = this.decodeEscaped();
924                         this.next();
925                         tok.addRange(c, rangeend);
926                     }
927                 }
928             }
929             if (this.isSet(RegularExpression.SPECIAL_COMMA)
930                 && this.read() == T_CHAR && this.chardata == ',')
931                 this.next();
932         }
933         if (this.read() == T_EOF)
934             throw this.ex("parser.cc.2", this.offset);
935         if (!useNrange && nrange) {
936             base.subtractRanges(tok);
937             tok = base;
938         }
939         tok.sortRanges();
940         tok.compactRanges();
941         //tok.dumpRanges();
942         /*
943         if (this.isSet(RegularExpression.IGNORE_CASE))
944             tok = RangeToken.createCaseInsensitiveToken(tok);
945         */
946         this.setContext(S_NORMAL);
947         this.next();                    // Skips ']'
948 
949         return tok;
950     }
951 
952     /**
953      * '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')'
954      */
955     protected RangeToken parseSetOperations() throws ParseException {
956         RangeToken tok = this.parseCharacterClass(false);
957         int type;
958         while ((type = this.read()) != T_RPAREN) {
959             int ch = this.chardata;
960             if (type == T_CHAR && (ch == '-' || ch == '&')
961                 || type == T_PLUS) {
962                 this.next();
963                 if (this.read() != T_LBRACKET) throw ex("parser.ope.1", this.offset-1);
964                 RangeToken t2 = this.parseCharacterClass(false);
965                 if (type == T_PLUS)
966                     tok.mergeRanges(t2);
967                 else if (ch == '-')
968                     tok.subtractRanges(t2);
969                 else if (ch == '&')
970                     tok.intersectRanges(t2);
971                 else
972                     throw new RuntimeException  ("ASSERT");
973             } else {
974                 throw ex("parser.ope.2", this.offset-1);
975             }
976         }
977         this.next();
978         return tok;
979     }
980 
981     Token getTokenForShorthand(int ch) {
982         Token tok;
983         switch (ch) {
984           case 'd':
985             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
986                 ? Token.getRange("Nd", true) : Token.token_0to9;
987             break;
988           case 'D':
989             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
990                 ? Token.getRange("Nd", false) : Token.token_not_0to9;
991             break;
992           case 'w':
993             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
994                 ? Token.getRange("IsWord", true) : Token.token_wordchars;
995             break;
996           case 'W':
997             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
998                 ? Token.getRange("IsWord", false) : Token.token_not_wordchars;
999             break;
1000          case 's':
1001            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1002                ? Token.getRange("IsSpace", true) : Token.token_spaces;
1003            break;
1004          case 'S':
1005            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1006                ? Token.getRange("IsSpace", false) : Token.token_not_spaces;
1007            break;
1008
1009          default:
1010            throw new RuntimeException  ("Internal Error: shorthands: \\u"+Integer.toString(ch, 16));
1011        }
1012        return tok;
1013    }
1014
1015    /**
1016     */
1017    int decodeEscaped() throws ParseException {
1018        if (this.read() != T_BACKSOLIDUS)  throw ex("parser.next.1", this.offset-1);
1019        int c = this.chardata;
1020        switch (c) {
1021          case 'e':  c = 0x1b;  break; // ESCAPE U+001B
1022          case 'f':  c = '\f';  break; // FORM FEED U+000C
1023          case 'n':  c = '\n';  break; // LINE FEED U+000A
1024          case 'r':  c = '\r';  break; // CRRIAGE RETURN U+000D
1025          case 't':  c = '\t';  break; // HORIZONTAL TABULATION U+0009
1026          //case 'v':  c = 0x0b;  break; // VERTICAL TABULATION U+000B
1027          case 'x':
1028            this.next();
1029            if (this.read() != T_CHAR)  throw ex("parser.descape.1", this.offset-1);
1030            if (this.chardata == '{') {
1031                int v1 = 0;
1032                int uv = 0;
1033                do {
1034                    this.next();
1035                    if (this.read() != T_CHAR)  throw ex("parser.descape.1", this.offset-1);
1036                    if ((v1 = hexChar(this.chardata)) < 0)
1037                        break;
1038                    if (uv > uv*16) throw ex("parser.descape.2", this.offset-1);
1039                    uv = uv*16+v1;
1040                } while (true);
1041                if (this.chardata != '}')  throw ex("parser.descape.3", this.offset-1);
1042                if (uv > Token.UTF16_MAX)  throw ex("parser.descape.4", this.offset-1);
1043                c = uv;
1044            } else {
1045                int v1 = 0;
1046                if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1047                    throw ex("parser.descape.1", this.offset-1);
1048                int uv = v1;
1049                this.next();
1050                if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1051                    throw ex("parser.descape.1", this.offset-1);
1052                uv = uv*16+v1;
1053                c = uv;
1054            }
1055            break;
1056
1057          case 'u':
1058            int v1 = 0;
1059            this.next();
1060            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1061                throw ex("parser.descape.1", this.offset-1);
1062            int uv = v1;
1063            this.next();
1064            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1065                throw ex("parser.descape.1", this.offset-1);
1066            uv = uv*16+v1;
1067            this.next();
1068            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1069                throw ex("parser.descape.1", this.offset-1);
1070            uv = uv*16+v1;
1071            this.next();
1072            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1073                throw ex("parser.descape.1", this.offset-1);
1074            uv = uv*16+v1;
1075            c = uv;
1076            break;
1077
1078          case 'v':
1079            this.next();
1080            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1081                throw ex("parser.descape.1", this.offset-1);
1082            uv = v1;
1083            this.next();
1084            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1085                throw ex("parser.descape.1", this.offset-1);
1086            uv = uv*16+v1;
1087            this.next();
1088            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1089                throw ex("parser.descape.1", this.offset-1);
1090            uv = uv*16+v1;
1091            this.next();
1092            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1093                throw ex("parser.descape.1", this.offset-1);
1094            uv = uv*16+v1;
1095            this.next();
1096            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1097                throw ex("parser.descape.1", this.offset-1);
1098            uv = uv*16+v1;
1099            this.next();
1100            if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1101                throw ex("parser.descape.1", this.offset-1);
1102            uv = uv*16+v1;
1103            if (uv > Token.UTF16_MAX)  throw ex("parser.descappe.4", this.offset-1);
1104            c = uv;
1105            break;
1106          case 'A':
1107          case 'Z':
1108          case 'z':
1109            throw ex("parser.descape.5", this.offset-2);
1110          default:
1111        }
1112        return c;
1113    }
1114
1115    static private final int hexChar(int ch) {
1116        if (ch < '0')  return -1;
1117        if (ch > 'f')  return -1;
1118        if (ch <= '9')  return ch-'0';
1119        if (ch < 'A')  return -1;
1120        if (ch <= 'F')  return ch-'A'+10;
1121        if (ch < 'a')  return -1;
1122        return ch-'a'+10;
1123    }
1124}
1125
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags