AwkCompiler


1   package org.apache.oro.text.awk;
2   
3   /* ====================================================================
4    * The Apache Software License, Version 1.1
5    *
6    * Copyright (c) 2000 The Apache Software Foundation.  All rights
7    * reserved.
8    *
9    * Redistribution and use in source and binary forms, with or without
10   * modification, are permitted provided that the following conditions
11   * are met:
12   *
13   * 1. Redistributions of source code must retain the above copyright
14   *    notice, this list of conditions and the following disclaimer.
15   *
16   * 2. Redistributions in binary form must reproduce the above copyright
17   *    notice, this list of conditions and the following disclaimer in
18   *    the documentation and/or other materials provided with the
19   *    distribution.
20   *
21   * 3. The end-user documentation included with the redistribution,
22   *    if any, must include the following acknowledgment:
23   *       "This product includes software developed by the
24   *        Apache Software Foundation (http://www.apache.org/)."
25   *    Alternately, this acknowledgment may appear in the software itself,
26   *    if and wherever such third-party acknowledgments normally appear.
27   *
28   * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro" 
29   *    must not be used to endorse or promote products derived from this
30   *    software without prior written permission. For written
31   *    permission, please contact apache@apache.org.
32   *
33   * 5. Products derived from this software may not be called "Apache" 
34   *    or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their 
35   *    name, without prior written permission of the Apache Software Foundation.
36   *
37   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
38   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
39   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
40   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
41   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
44   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
46   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
47   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48   * SUCH DAMAGE.
49   * ====================================================================
50   *
51   * This software consists of voluntary contributions made by many
52   * individuals on behalf of the Apache Software Foundation.  For more
53   * information on the Apache Software Foundation, please see
54   * <http://www.apache.org/>.
55   *
56   * Portions of this software are based upon software originally written 
57   * by Daniel F. Savarese. We appreciate his contributions.
58   */
59  
60  import org.apache.oro.text.regex.*;
61  
62  /**
63   * The AwkCompiler class is used to create compiled regular expressions
64   * conforming to the Awk regular expression syntax.  It generates
65   * AwkPattern instances upon compilation to be used in conjunction
66   * with an AwkMatcher instance.  AwkMatcher finds true leftmost-longest
67   * matches, so you must take care with how you formulate your regular
68   * expression to avoid matching more than you really want.
69   * <p>
70   * The supported regular expression syntax is a superset of traditional AWK,
71   * but NOT to be confused with GNU AWK or other AWK variants.  Additionally,
72   * this AWK implementation is DFA-based and only supports 8-bit ASCII.
73   * Consequently, these classes can perform very fast pattern matches in
74   * most cases.
75   * <p>
76   * This is the traditional Awk syntax that is supported:
77   * <ul>
78   * <li> Alternatives separated by |
79   * <li> Quantified atoms
80   * <dl compact>
81   *      <dt> *     <dd> Match 0 or more times.
82   *      <dt> +     <dd> Match 1 or more times.
83   *      <dt> ?     <dd> Match 0 or 1 times.
84   * </dl>
85   * <li> Atoms
86   * <ul>
87   *     <li> regular expression within parentheses
88   *     <li> a . matches everything including newline
89   *     <li> a ^ is a null token matching the beginning of a string
90   *          but has no relation to newlines (and is only valid at the
91   *          beginning of a regex; this differs from traditional awk
92   *          for the sake of efficiency in Java).
93   *     <li> a $ is a null token matching the end of a string but has
94   *          no relation to newlines (and is only valid at the
95   *          end of a regex; this differs from traditional awk for the
96   *          sake of efficiency in Java).
97   *     <li> Character classes (e.g., [abcd]) and ranges (e.g. [a-z])
98   *     <ul>
99   *         <li> Special backslashed characters work within a character class
100  *     </ul>
101  *     <li> Special backslashed characters
102  *     <dl compact>
103  *         <dt> \b <dd> backspace
104  *         <dt> \n <dd> newline
105  *         <dt> \r <dd> carriage return
106  *         <dt> \t <dd> tab
107  *         <dt> \f <dd> formfeed
108  *         <dt> \xnn <dd> hexadecimal representation of character
109  *         <dt> \nn or \nnn <dd> octal representation of character
110  *         <dt> Any other backslashed character matches itself
111  *     </dl>
112  * </ul></ul>
113  * <p>
114  * This is the extended syntax that is supported:
115  * <ul>
116  * <li> Quantified atoms
117  * <dl compact>
118  *      <dt> {n,m} <dd> Match at least n but not more than m times.
119  *  <dt> {n,}  <dd> Match at least n times.
120  *      <dt> {n}   <dd> Match exactly n times.  
121  * </dl>
122  * <li> Atoms
123  * <ul>
124  *     <li> Special backslashed characters
125  *     <dl compact>
126  *         <dt> \d <dd> digit [0-9]
127  *         <dt> \D <dd> non-digit [^0-9]
128  *         <dt> \w <dd> word character [0-9a-z_A-Z]
129  *         <dt> \W <dd> a non-word character [^0-9a-z_A-Z]
130  *         <dt> \s <dd> a whitespace character [ \t\n\r\f]
131  *         <dt> \S <dd> a non-whitespace character [^ \t\n\r\f]
132  *         <dt> \cD <dd> matches the corresponding control character
133  *         <dt> \0 <dd> matches null character
134  *     </dl>
135  * </ul></ul>
136 
137  @author <a HREF="dfs@savarese.org">Daniel F. Savarese</a>
138  @version $Id: AwkCompiler.java,v 1.1.1.1 2000/07/23 23:08:49 jon Exp $
139 
140  * @see org.apache.oro.text.regex.PatternCompiler
141  * @see org.apache.oro.text.regex.MalformedPatternException
142  * @see AwkPattern
143  * @see AwkMatcher
144  */                        
145 public final class AwkCompiler implements PatternCompiler {
146   public static final int DEFAULT_MASK          = 0;
147   public static final int CASE_INSENSITIVE_MASK = 0x0001;
148 
149   static final char _END_OF_INPUT = '\uFFFF';
150   
151   private boolean __inCharacterClass, __caseSensitive;
152   private boolean __beginAnchor, __endAnchor;
153   private char __lookahead;
154   private int __position, __bytesRead, __expressionLength;
155   private char[] __regularExpression;
156   private int __openParen, __closeParen;
157 
158   public AwkCompiler() { }
159 
160   private static boolean __isMetachar(char token) {
161     return (token == '*' || token == '?' || token == '+' ||
162         token == '[' || token == ']' || token == '(' ||
163         token == ')' || token == '|' || /* token == '^' ||
164         token == '$' || */ token == '.');
165   }
166 
167   static boolean _isWordCharacter(char token) {
168     return ((token >= 'a' && token <= 'z') || 
169         (token >= 'A' && token <= 'Z') || 
170         (token >= '0' && token <= '9') || 
171         (token == '_'));
172   }
173 
174   static boolean _isLowerCase(char token){
175     return (token >= 'a' && token <= 'z');
176   }
177 
178   static boolean _isUpperCase(char token){
179     return (token >= 'A' && token <= 'Z');
180   }
181 
182   static char _toggleCase(char token){
183     if(_isUpperCase(token))
184       return (char)(token + 32);
185     else if(_isLowerCase(token))
186       return (char)(token - 32);
187 
188     return token;
189   }
190 
191 
192   private void __match(char token) throws MalformedPatternException {
193     if(token == __lookahead){
194       if(__bytesRead < __expressionLength)
195     __lookahead = __regularExpression[__bytesRead++];
196       else
197     __lookahead = _END_OF_INPUT;
198     }
199     else
200       throw new MalformedPatternException("token: " + token + 
201                     " does not match lookahead: " +
202                     __lookahead + " at position: " +
203                          __bytesRead);
204   }
205 
206   private void __putback() {
207     if(__lookahead != _END_OF_INPUT)
208       --__bytesRead;
209     __lookahead = __regularExpression[__bytesRead - 1];
210   }
211 
212   private SyntaxNode __regex() throws MalformedPatternException {
213     SyntaxNode left;
214 
215     left = __branch();
216 
217     if(__lookahead == '|') {
218       __match('|');
219       return (new OrNode(left, __regex()));
220     } 
221 
222     return left;
223   }
224 
225 
226   private SyntaxNode __branch() throws MalformedPatternException {
227     CatNode current;
228     SyntaxNode left, root;
229 
230     left = __piece();
231 
232     if(__lookahead == ')'){
233       if(__openParen > __closeParen)
234     return left;
235       else
236       throw
237         new MalformedPatternException("Parse error: close parenthesis"
238          + " without matching open parenthesis at position " + __bytesRead);
239     } else if(__lookahead == '|' || __lookahead == _END_OF_INPUT)
240       return left;
241 
242     root = current = new CatNode();
243     current._left = left;
244 
245     while(true) {
246       left = __piece();
247 
248       if(__lookahead == ')'){
249     if(__openParen > __closeParen){
250       current._right = left;
251       break;
252     }
253     else
254       throw
255         new MalformedPatternException("Parse error: close parenthesis"
256          + " without matching open parenthesis at position " + __bytesRead);
257       } else  if(__lookahead == '|' || __lookahead == _END_OF_INPUT){
258     current._right = left;
259     break;
260       }
261 
262       current._right = new CatNode();
263       current = (CatNode)current._right;
264       current._left   = left;
265     }
266 
267     return root;
268   }
269 
270 
271   private SyntaxNode __piece() throws MalformedPatternException {
272     SyntaxNode left;
273 
274     left = __atom();
275 
276     switch(__lookahead){
277     case '+' : __match('+'); return (new PlusNode(left));
278     case '?' : __match('?'); return (new QuestionNode(left));
279     case '*' : __match('*'); return (new StarNode(left));
280     case '{' : return __repetition(left);
281     }
282 
283     return left;
284   }
285 
286   // if numChars is 0, this means match as many as you want
287   private int __parseUnsignedInteger(int radix, int minDigits, int maxDigits)
288     throws MalformedPatternException {
289     int num, digits = 0;
290     StringBuffer   buf;
291 
292     // We don't expect huge numbers, so an initial buffer of 4 is fine.
293     buf = new StringBuffer  (4);
294 
295     while(Character.digit(__lookahead, radix) != -1 && digits < maxDigits){
296       buf.append((char)__lookahead);
297       __match(__lookahead);
298       ++digits;
299     }
300 
301     if(digits < minDigits || digits > maxDigits)
302       throw
303     new MalformedPatternException(
304         "Parse error: unexpected number of digits at position " + __bytesRead);
305 
306     try {
307       num = Integer.parseInt(buf.toString(), radix);
308     } catch(NumberFormatException   e) {
309       throw
310     new MalformedPatternException("Parse error: numeric value at " +
311                 "position " + __bytesRead + " is invalid");
312     }
313 
314     return num;
315   }
316 
317   private SyntaxNode __repetition(SyntaxNode atom)
318     throws MalformedPatternException {
319     int min, max, startPosition[];
320     StringBuffer   minBuffer, maxBuffer;
321     SyntaxNode root = null;
322     CatNode catNode;
323 
324     __match('{');
325 
326     min = __parseUnsignedInteger(10, 1, Integer.MAX_VALUE);
327     startPosition = new int[1];
328     startPosition[0] = __position;
329 
330     if(__lookahead == '}'){
331       // Match exactly min times.  Concatenate the atom min times.
332       __match('}');
333 
334       if(min == 0)
335     throw
336       new MalformedPatternException(
337               "Parse error: Superfluous interval specified at position " +
338               __bytesRead + ".  Number of occurences was set to zero.");
339 
340       if(min == 1)
341     return atom;
342 
343       root = catNode = new CatNode();
344       catNode._left = atom;
345 
346       while(--min > 1) {
347     atom = atom._clone(startPosition);
348 
349     catNode._right = new CatNode();
350     catNode       = (CatNode)catNode._right;
351     catNode._left  = atom;
352       }
353 
354       catNode._right = atom._clone(startPosition);
355     } else if(__lookahead == ','){
356       __match(',');
357 
358       if(__lookahead == '}') {
359     // match at least min times
360     __match('}');
361 
362     if(min == 0)
363       return new StarNode(atom);
364 
365     if(min == 1)
366       return new PlusNode(atom);
367 
368     root = catNode = new CatNode();
369     catNode._left = atom;
370 
371     while(--min > 0) {
372       atom = atom._clone(startPosition);
373 
374       catNode._right = new CatNode();
375       catNode       = (CatNode)catNode._right;
376       catNode._left  = atom;
377     }
378 
379     catNode._right = new StarNode(atom._clone(startPosition));
380       } else {
381     // match at least min times and at most max times
382     max = __parseUnsignedInteger(10, 1, Integer.MAX_VALUE);
383     __match('}');
384 
385     if(max < min)
386       throw
387         new MalformedPatternException("Parse error: invalid interval; "
388          +  max + " is less than " + min + " at position " + __bytesRead);
389     if(max == 0)
390       throw
391         new MalformedPatternException(
392         "Parse error: Superfluous interval specified at position " +
393         __bytesRead + ".  Number of occurences was set to zero.");
394 
395     if(min == 0) {
396       if(max == 1)
397         return new QuestionNode(atom);
398 
399       root = catNode = new CatNode();
400       atom = new QuestionNode(atom);
401       catNode._left = atom;
402 
403       while(--max > 1) {
404         atom =  atom._clone(startPosition);
405 
406         catNode._right = new CatNode();
407         catNode       = (CatNode)catNode._right;
408         catNode._left  = atom;
409       }
410 
411       catNode._right = atom._clone(startPosition);
412     } else if(min == max) {
413       if(min == 1)
414         return atom;
415 
416       root = catNode = new CatNode();
417       catNode._left = atom;
418 
419       while(--min > 1) {
420         atom = atom._clone(startPosition);
421 
422         catNode._right = new CatNode();
423         catNode       = (CatNode)catNode._right;
424         catNode._left  = atom;
425       }
426 
427       catNode._right = atom._clone(startPosition);
428     } else {
429       int count;
430 
431       root = catNode = new CatNode();
432       catNode._left = atom;
433 
434       for(count=1; count < min; count++) {
435         atom = atom._clone(startPosition);
436 
437         catNode._right = new CatNode();
438         catNode       = (CatNode)catNode._right;
439         catNode._left  = atom;
440       }
441 
442       atom = new QuestionNode(atom._clone(startPosition));
443 
444       count = max-min;
445 
446       if(count == 1)
447         catNode._right = atom;
448       else {
449         catNode._right = new CatNode();
450         catNode = (CatNode)catNode._right;
451         catNode._left = atom;
452 
453         while(--count > 1) {
454           atom = atom._clone(startPosition);
455 
456           catNode._right = new CatNode();
457           catNode       = (CatNode)catNode._right;
458           catNode._left  = atom;
459         }
460 
461         catNode._right = atom._clone(startPosition);
462       }
463     }
464       }
465     } else
466       throw
467     new MalformedPatternException("Parse error: unexpected character " +
468         __lookahead + " in interval at position "   + __bytesRead);
469     __position = startPosition[0];
470     return root;
471   }
472 
473 
474   private SyntaxNode __backslashToken() throws MalformedPatternException {
475     SyntaxNode current;
476     char token;
477     int number;
478 
479     __match('\\');
480 
481     if(__lookahead == 'x'){
482       __match('x');
483       // Parse a hexadecimal number
484       current = _newTokenNode((char)__parseUnsignedInteger(16, 2, 2),
485                  __position++);
486     } else if(__lookahead == 'c') {
487       __match('c');
488       // Create a control character
489       token = Character.toUpperCase(__lookahead);
490       token = (char)(token > 63 ? token - 64 : token + 64);
491       current = new TokenNode(token, __position++);
492       __match(__lookahead);
493     } else if(__lookahead >= '0' && __lookahead <= '9') {
494       __match(__lookahead);
495 
496       if(__lookahead >= '0' && __lookahead <= '9'){
497     // We have an octal character or a multi-digit backreference.
498     // Assume octal character for now.
499     __putback();
500     number = __parseUnsignedInteger(10, 2, 3);
501     number = Integer.parseInt(Integer.toString(number), 8);
502     current =  _newTokenNode((char)number, __position++);
503       } else {
504     // We have either \0, an escaped digit, or a backreference.
505     __putback();
506     if(__lookahead == '0'){
507       // \0 matches the null character
508       __match('0');
509       current = new TokenNode('\0', __position++);
510     } else {
511       // Either an escaped digit or backreference.
512       number = Character.digit(__lookahead, 10);
513       current =  _newTokenNode(__lookahead, __position++);
514       __match(__lookahead);
515     }
516       }
517     } else if(__lookahead == 'b') {
518       // Inside of a character class the \b means backspace, otherwise
519       // it means a word boundary
520       //if(__inCharacterClass)
521       // \b always means backspace
522       current = new TokenNode('\b', __position++);
523       /*
524       else 
525     current = new TokenNode((char)LeafNode._WORD_BOUNDARY_MARKER_TOKEN,
526                 position++);
527                 */
528       __match('b');
529     } /*else if(__lookahead == 'B' && !__inCharacterClass){
530       current = new TokenNode((char)LeafNode._NONWORD_BOUNDARY_MARKER_TOKEN,
531                   position++);
532       __match('B');
533     } */ else {
534       CharacterClassNode characterSet;
535       token = __lookahead;
536 
537       switch(__lookahead){
538       case 'n' : token = '\n'; break;
539       case 'r' : token = '\r'; break;
540       case 't' : token = '\t'; break;
541       case 'f' : token = '\f'; break;
542       }
543 
544       switch(token) {
545       case 'd' :
546     characterSet = new CharacterClassNode(__position++);
547     characterSet._addTokenRange('0', '9');
548     current = characterSet;
549     break;
550       case 'D' :
551     characterSet = new NegativeCharacterClassNode(__position++);
552     characterSet._addTokenRange('0', '9');
553     current = characterSet;
554     break;
555       case 'w' :
556     characterSet = new CharacterClassNode(__position++);
557     characterSet._addTokenRange('0', '9');
558     characterSet._addTokenRange('a', 'z');
559     characterSet._addTokenRange('A', 'Z');
560     characterSet._addToken('_');
561     current = characterSet;
562     break;
563       case 'W' :
564     characterSet = new NegativeCharacterClassNode(__position++);
565     characterSet._addTokenRange('0', '9');
566     characterSet._addTokenRange('a', 'z');
567     characterSet._addTokenRange('A', 'Z');
568     characterSet._addToken('_');
569     current = characterSet;
570     break;
571       case 's' :
572     characterSet = new CharacterClassNode(__position++);
573     characterSet._addToken(' ');
574     characterSet._addToken('\f');
575     characterSet._addToken('\n');
576     characterSet._addToken('\r');
577     characterSet._addToken('\t');
578     current = characterSet;
579     break;
580       case 'S' :
581     characterSet = new NegativeCharacterClassNode(__position++);
582     characterSet._addToken(' ');
583     characterSet._addToken('\f');
584     characterSet._addToken('\n');
585     characterSet._addToken('\r');
586     characterSet._addToken('\t');
587     current = characterSet;
588     break;
589     default  : current = _newTokenNode(token, __position++); break;
590       }
591 
592       __match(__lookahead);
593     }
594 
595     return current;
596   }
597 
598   private SyntaxNode __atom() throws MalformedPatternException {
599     SyntaxNode current;
600 
601     if(__lookahead == '(') {
602       __match('(');
603       ++__openParen;
604       current = __regex();
605       __match(')');
606       ++__closeParen;
607     } else if(__lookahead == '[')
608       current = __characterClass();
609     else if(__lookahead == '.') {
610       CharacterClassNode characterSet;
611 
612       __match('.');
613       characterSet = new NegativeCharacterClassNode(__position++);
614       characterSet._addToken('\n');
615       current = characterSet;
616     } else if(__lookahead == '\\') {
617       current = __backslashToken();
618     } /*else if(__lookahead == '^') {
619       current =
620     new TokenNode((char)LeafNode._BEGIN_LINE_MARKER_TOKEN, __position++);
621       __match('^');
622     } else if(__lookahead == '$') {
623       current =
624     new TokenNode((char)LeafNode._END_LINE_MARKER_TOKEN, __position++);
625       __match('$');
626     } */ else if(!__isMetachar(__lookahead)) {
627       current = _newTokenNode(__lookahead, __position++);
628       __match(__lookahead);
629     } else
630       throw
631     new MalformedPatternException("Parse error: unexpected character " +
632                 __lookahead + " at position " + __bytesRead);
633 
634     return current;
635   }
636 
637 
638   private SyntaxNode __characterClass() throws MalformedPatternException {
639     char lastToken, token;
640     SyntaxNode node;
641     CharacterClassNode current;
642 
643     __match('[');
644     __inCharacterClass = true;
645 
646     if(__lookahead == '^'){
647       __match('^');
648       current = new NegativeCharacterClassNode(__position++);
649     } else
650       current = new CharacterClassNode(__position++);
651 
652     while(__lookahead != ']' && __lookahead != _END_OF_INPUT) {
653 
654       if(__lookahead == '\\'){
655     node = __backslashToken();
656     --__position;
657 
658     // __backslashToken() (actually newTokenNode()) does not take care of
659         // case insensitivity when __inCharacterClass is true.
660     if(node instanceof TokenNode){
661       lastToken = ((TokenNode)node)._token;
662       current._addToken(lastToken);
663       if(!__caseSensitive)
664         current._addToken(_toggleCase(lastToken));
665     } else {
666       CharacterClassNode slash;
667       slash = (CharacterClassNode)node;
668       // This could be made more efficient by manipulating the
669       // characterSet elements of the CharacterClassNodes but
670       // for the moment, this is more clear.
671       for(token=0; token < LeafNode._NUM_TOKENS; token++){
672         if(slash._matches(token))
673           current._addToken(token);
674       }
675 
676       // A byproduct of this act is that when a '-' occurs after
677       // a \d, \w, etc. it is not interpreted as a range and no
678       // parse exception is thrown.
679       // This is considered a feature and not a bug for now.
680       continue;
681     }
682       } else {
683     lastToken = __lookahead;
684     current._addToken(__lookahead);
685     if(!__caseSensitive)
686       current._addToken(_toggleCase(__lookahead));
687     __match(__lookahead);
688       }
689 
690       // In Perl, a - is a token if it occurs at the beginning
691       // or end of the character class.  Anywhere else, it indicates
692       // a range.
693       // A byproduct of this implementation is that if a '-' occurs
694       // after the end of a range, it is interpreted as a '-' and no
695       // exception is thrown. e.g., the second dash in [a-z-x]
696       // This is considered a feature and not a bug for now.
697       if(__lookahead == '-'){
698     __match('-');
699     if(__lookahead == ']'){
700       current._addToken('-');
701       break;
702     } else if(__lookahead == '\\') {
703       node = __backslashToken();
704       --__position;
705       if(node instanceof TokenNode)
706         token = ((TokenNode)node)._token;
707       else
708         throw new MalformedPatternException(
709        "Parse error: invalid range specified at position " + __bytesRead);
710     } else {
711       token = __lookahead;
712       __match(__lookahead);
713     }
714 
715     if(token < lastToken)
716       throw new MalformedPatternException(
717      "Parse error: invalid range specified at position " + __bytesRead);
718     current._addTokenRange(lastToken + 1, token);
719     if(!__caseSensitive)
720       current._addTokenRange(_toggleCase((char)(lastToken + 1)),
721                 _toggleCase(token));
722       }
723     }
724 
725     __match(']');
726     __inCharacterClass = false;
727     return current;
728   }
729 
730 
731   SyntaxNode _newTokenNode(char token, int position){
732     if(!__inCharacterClass && !__caseSensitive &&
733        (_isUpperCase(token) || _isLowerCase(token))){
734       CharacterClassNode node = new CharacterClassNode(position);
735       node._addToken(token);
736       node._addToken(_toggleCase(token));
737       return node;
738     }
739 
740     return new TokenNode(token, position);
741   }
742 
743 
744   SyntaxTree _parse(char[] expression) throws MalformedPatternException {
745     SyntaxTree tree;
746 
747     __openParen = __closeParen = 0;
748     __regularExpression = expression;
749     __bytesRead = 0;
750     __expressionLength = expression.length;
751     __inCharacterClass = false;
752 
753     __position = 0;
754     __match(__lookahead); // Call match to read first input.
755 
756     if(__lookahead == '^') {
757       __beginAnchor = true;
758       __match(__lookahead);
759     }
760 
761     if(__expressionLength > 0 && expression[__expressionLength - 1] == '$') {
762       --__expressionLength;
763       __endAnchor = true;
764     }
765 
766     if(__expressionLength > 1 || (__expressionLength == 1 && !__beginAnchor)) {
767       CatNode root;
768       root = new CatNode();
769       root._left  = __regex();
770       // end marker
771       root._right =
772     new TokenNode((char)LeafNode._END_MARKER_TOKEN, __position++);
773       tree = new SyntaxTree(root, __position);
774     } else 
775       tree = new
776     SyntaxTree(new TokenNode((char)LeafNode._END_MARKER_TOKEN, 0), 1);
777 
778     tree._computeFollowPositions();
779 
780     return tree;
781   }
782 
783 
784   /**
785    * Compiles an Awk regular expression into an AwkPattern instance that
786    * can be used by an AwkMatcher object to perform pattern matching.
787    * <p>
788    * @param pattern  An Awk regular expression to compile.
789    * @param options  A set of flags giving the compiler instructions on
790    *                 how to treat the regular expression.  Currently the
791    *                 only meaningful flag is AwkCompiler.CASE_INSENSITIVE_MASK.
792    * @return A Pattern instance constituting the compiled regular expression.
793    *         This instance will always be an AwkPattern and can be reliably
794    *         be casted to an AwkPattern.
795    * @exception MalformedPatternException  If the compiled expression
796    *  is not a valid Awk regular expression.
797    */
798   public Pattern compile(char[] pattern, int options)
799        throws MalformedPatternException
800   {
801     SyntaxTree tree;
802     AwkPattern regexp;
803 
804     __beginAnchor = __endAnchor = false;
805     __caseSensitive = ((options & CASE_INSENSITIVE_MASK) == 0);
806     tree   = _parse(pattern);
807     regexp = new AwkPattern(new String  (pattern), tree);
808     regexp._options = options;
809     regexp._hasBeginAnchor = __beginAnchor;
810     regexp._hasEndAnchor   = __endAnchor;
811 
812     return regexp;
813   }
814 
815 
816   /**
817    * Compiles an Awk regular expression into an AwkPattern instance that
818    * can be used by an AwkMatcher object to perform pattern matching.
819    * <p>
820    * @param pattern  An Awk regular expression to compile.
821    * @param options  A set of flags giving the compiler instructions on
822    *                 how to treat the regular expression.  Currently the
823    *                 only meaningful flag is AwkCompiler.CASE_INSENSITIVE_MASK.
824    * @return A Pattern instance constituting the compiled regular expression.
825    *         This instance will always be an AwkPattern and can be reliably
826    *         be casted to an AwkPattern.
827    * @exception MalformedPatternException  If the compiled expression
828    *  is not a valid Awk regular expression.
829    */
830   public Pattern compile(String   pattern, int options)
831        throws MalformedPatternException
832   {
833     SyntaxTree tree;
834     AwkPattern regexp;
835 
836     __beginAnchor = __endAnchor = false;
837     __caseSensitive = ((options & CASE_INSENSITIVE_MASK) == 0);
838     tree   = _parse(pattern.toCharArray());
839     regexp = new AwkPattern(pattern, tree);
840     regexp._options = options;
841     regexp._hasBeginAnchor = __beginAnchor;
842     regexp._hasEndAnchor   = __endAnchor;
843 
844     return regexp;
845   }
846 
847   /**
848    * Same as calling <b>compile(pattern, AwkCompiler.DEFAULT_MASK);</b>
849    * <p>
850    * @param pattern  A regular expression to compile.
851    * @return A Pattern instance constituting the compiled regular expression.
852    *         This instance will always be an AwkPattern and can be reliably
853    *         be casted to an AwkPattern.
854    * @exception MalformedPatternException  If the compiled expression
855    *  is not a valid Awk regular expression.
856    */
857   public Pattern compile(char[] pattern) throws MalformedPatternException {
858     return compile(pattern, DEFAULT_MASK);
859   }
860 
861 
862   /**
863    * Same as calling <b>compile(pattern, AwkCompiler.DEFAULT_MASK);</b>
864    * <p>
865    * @param pattern  A regular expression to compile.
866    * @return A Pattern instance constituting the compiled regular expression.
867    *         This instance will always be an AwkPattern and can be reliably
868    *         be casted to an AwkPattern.
869    * @exception MalformedPatternException  If the compiled expression
870    *  is not a valid Awk regular expression.
871    */
872   public Pattern compile(String   pattern) throws MalformedPatternException {
873     return compile(pattern, DEFAULT_MASK);
874   }
875 
876 }
877
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags