KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > oro > text > awk > AwkCompiler


1 package org.apache.oro.text.awk;
2
3 /* ====================================================================
4  * The Apache Software License, Version 1.1
5  *
6  * Copyright (c) 2000 The Apache Software Foundation. All rights
7  * reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  * notice, this list of conditions and the following disclaimer.
15  *
16  * 2. Redistributions in binary form must reproduce the above copyright
17  * notice, this list of conditions and the following disclaimer in
18  * the documentation and/or other materials provided with the
19  * distribution.
20  *
21  * 3. The end-user documentation included with the redistribution,
22  * if any, must include the following acknowledgment:
23  * "This product includes software developed by the
24  * Apache Software Foundation (http://www.apache.org/)."
25  * Alternately, this acknowledgment may appear in the software itself,
26  * if and wherever such third-party acknowledgments normally appear.
27  *
28  * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro"
29  * must not be used to endorse or promote products derived from this
30  * software without prior written permission. For written
31  * permission, please contact apache@apache.org.
32  *
33  * 5. Products derived from this software may not be called "Apache"
34  * or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their
35  * name, without prior written permission of the Apache Software Foundation.
36  *
37  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
38  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
39  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
40  * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
41  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
44  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
46  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
47  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48  * SUCH DAMAGE.
49  * ====================================================================
50  *
51  * This software consists of voluntary contributions made by many
52  * individuals on behalf of the Apache Software Foundation. For more
53  * information on the Apache Software Foundation, please see
54  * <http://www.apache.org/>.
55  *
56  * Portions of this software are based upon software originally written
57  * by Daniel F. Savarese. We appreciate his contributions.
58  */

59
60 import org.apache.oro.text.regex.*;
61
62 /**
63  * The AwkCompiler class is used to create compiled regular expressions
64  * conforming to the Awk regular expression syntax. It generates
65  * AwkPattern instances upon compilation to be used in conjunction
66  * with an AwkMatcher instance. AwkMatcher finds true leftmost-longest
67  * matches, so you must take care with how you formulate your regular
68  * expression to avoid matching more than you really want.
69  * <p>
70  * The supported regular expression syntax is a superset of traditional AWK,
71  * but NOT to be confused with GNU AWK or other AWK variants. Additionally,
72  * this AWK implementation is DFA-based and only supports 8-bit ASCII.
73  * Consequently, these classes can perform very fast pattern matches in
74  * most cases.
75  * <p>
76  * This is the traditional Awk syntax that is supported:
77  * <ul>
78  * <li> Alternatives separated by |
79  * <li> Quantified atoms
80  * <dl compact>
81  * <dt> * <dd> Match 0 or more times.
82  * <dt> + <dd> Match 1 or more times.
83  * <dt> ? <dd> Match 0 or 1 times.
84  * </dl>
85  * <li> Atoms
86  * <ul>
87  * <li> regular expression within parentheses
88  * <li> a . matches any character except newline
89  * <li> a ^ is a null token matching the beginning of a string
90  * but has no relation to newlines (and is only valid at the
91  * beginning of a regex; this differs from traditional awk
92  * for the sake of efficiency in Java).
93  * <li> a $ is a null token matching the end of a string but has
94  * no relation to newlines (and is only valid at the
95  * end of a regex; this differs from traditional awk for the
96  * sake of efficiency in Java).
97  * <li> Character classes (e.g., [abcd]) and ranges (e.g. [a-z])
98  * <ul>
99  * <li> Special backslashed characters work within a character class
100  * </ul>
101  * <li> Special backslashed characters
102  * <dl compact>
103  * <dt> \b <dd> backspace
104  * <dt> \n <dd> newline
105  * <dt> \r <dd> carriage return
106  * <dt> \t <dd> tab
107  * <dt> \f <dd> formfeed
108  * <dt> \xnn <dd> hexadecimal representation of character
109  * <dt> \nn or \nnn <dd> octal representation of character
110  * <dt> Any other backslashed character matches itself
111  * </dl>
112  * </ul></ul>
113  * <p>
114  * This is the extended syntax that is supported:
115  * <ul>
116  * <li> Quantified atoms
117  * <dl compact>
118  * <dt> {n,m} <dd> Match at least n but not more than m times.
119  * <dt> {n,} <dd> Match at least n times.
120  * <dt> {n} <dd> Match exactly n times.
121  * </dl>
122  * <li> Atoms
123  * <ul>
124  * <li> Special backslashed characters
125  * <dl compact>
126  * <dt> \d <dd> digit [0-9]
127  * <dt> \D <dd> non-digit [^0-9]
128  * <dt> \w <dd> word character [0-9a-z_A-Z]
129  * <dt> \W <dd> a non-word character [^0-9a-z_A-Z]
130  * <dt> \s <dd> a whitespace character [ \t\n\r\f]
131  * <dt> \S <dd> a non-whitespace character [^ \t\n\r\f]
132  * <dt> \cD <dd> matches the corresponding control character
133  * <dt> \0 <dd> matches null character
134  * </dl>
135  * </ul></ul>
136
137  @author <a HREF="dfs@savarese.org">Daniel F. Savarese</a>
138  @version $Id: AwkCompiler.java,v 1.1.1.1 2000/07/23 23:08:49 jon Exp $
139
140  * @see org.apache.oro.text.regex.PatternCompiler
141  * @see org.apache.oro.text.regex.MalformedPatternException
142  * @see AwkPattern
143  * @see AwkMatcher
144  */

145 public final class AwkCompiler implements PatternCompiler {
146   public static final int DEFAULT_MASK = 0;
147   public static final int CASE_INSENSITIVE_MASK = 0x0001;
148
149   static final char _END_OF_INPUT = '\uFFFF';
150   
151   private boolean __inCharacterClass, __caseSensitive;
152   private boolean __beginAnchor, __endAnchor;
153   private char __lookahead;
154   private int __position, __bytesRead, __expressionLength;
155   private char[] __regularExpression;
156   private int __openParen, __closeParen;
157
158   public AwkCompiler() { }
159
160   private static boolean __isMetachar(char token) {
161     return (token == '*' || token == '?' || token == '+' ||
162         token == '[' || token == ']' || token == '(' ||
163         token == ')' || token == '|' || /* token == '^' ||
164         token == '$' || */
token == '.');
165   }
166
167   static boolean _isWordCharacter(char token) {
168     return ((token >= 'a' && token <= 'z') ||
169         (token >= 'A' && token <= 'Z') ||
170         (token >= '0' && token <= '9') ||
171         (token == '_'));
172   }
173
174   static boolean _isLowerCase(char token){
175     return (token >= 'a' && token <= 'z');
176   }
177
178   static boolean _isUpperCase(char token){
179     return (token >= 'A' && token <= 'Z');
180   }
181
182   static char _toggleCase(char token){
183     if(_isUpperCase(token))
184       return (char)(token + 32);
185     else if(_isLowerCase(token))
186       return (char)(token - 32);
187
188     return token;
189   }
190
191
192   private void __match(char token) throws MalformedPatternException {
193     if(token == __lookahead){
194       if(__bytesRead < __expressionLength)
195     __lookahead = __regularExpression[__bytesRead++];
196       else
197     __lookahead = _END_OF_INPUT;
198     }
199     else
200       throw new MalformedPatternException("token: " + token +
201                     " does not match lookahead: " +
202                     __lookahead + " at position: " +
203                          __bytesRead);
204   }
205
206   private void __putback() {
207     if(__lookahead != _END_OF_INPUT)
208       --__bytesRead;
209     __lookahead = __regularExpression[__bytesRead - 1];
210   }
211
212   private SyntaxNode __regex() throws MalformedPatternException {
213     SyntaxNode left;
214
215     left = __branch();
216
217     if(__lookahead == '|') {
218       __match('|');
219       return (new OrNode(left, __regex()));
220     }
221
222     return left;
223   }
224
225
226   private SyntaxNode __branch() throws MalformedPatternException {
227     CatNode current;
228     SyntaxNode left, root;
229
230     left = __piece();
231
232     if(__lookahead == ')'){
233       if(__openParen > __closeParen)
234     return left;
235       else
236       throw
237         new MalformedPatternException("Parse error: close parenthesis"
238          + " without matching open parenthesis at position " + __bytesRead);
239     } else if(__lookahead == '|' || __lookahead == _END_OF_INPUT)
240       return left;
241
242     root = current = new CatNode();
243     current._left = left;
244
245     while(true) {
246       left = __piece();
247
248       if(__lookahead == ')'){
249     if(__openParen > __closeParen){
250       current._right = left;
251       break;
252     }
253     else
254       throw
255         new MalformedPatternException("Parse error: close parenthesis"
256          + " without matching open parenthesis at position " + __bytesRead);
257       } else if(__lookahead == '|' || __lookahead == _END_OF_INPUT){
258     current._right = left;
259     break;
260       }
261
262       current._right = new CatNode();
263       current = (CatNode)current._right;
264       current._left = left;
265     }
266
267     return root;
268   }
269
270
271   private SyntaxNode __piece() throws MalformedPatternException {
272     SyntaxNode left;
273
274     left = __atom();
275
276     switch(__lookahead){
277     case '+' : __match('+'); return (new PlusNode(left));
278     case '?' : __match('?'); return (new QuestionNode(left));
279     case '*' : __match('*'); return (new StarNode(left));
280     case '{' : return __repetition(left);
281     }
282
283     return left;
284   }
285
286   // if numChars is 0, this means match as many as you want
287
private int __parseUnsignedInteger(int radix, int minDigits, int maxDigits)
288     throws MalformedPatternException {
289     int num, digits = 0;
290     StringBuffer JavaDoc buf;
291
292     // We don't expect huge numbers, so an initial buffer of 4 is fine.
293
buf = new StringBuffer JavaDoc(4);
294
295     while(Character.digit(__lookahead, radix) != -1 && digits < maxDigits){
296       buf.append((char)__lookahead);
297       __match(__lookahead);
298       ++digits;
299     }
300
301     if(digits < minDigits || digits > maxDigits)
302       throw
303     new MalformedPatternException(
304         "Parse error: unexpected number of digits at position " + __bytesRead);
305
306     try {
307       num = Integer.parseInt(buf.toString(), radix);
308     } catch(NumberFormatException JavaDoc e) {
309       throw
310     new MalformedPatternException("Parse error: numeric value at " +
311                 "position " + __bytesRead + " is invalid");
312     }
313
314     return num;
315   }
316
317   private SyntaxNode __repetition(SyntaxNode atom)
318     throws MalformedPatternException {
319     int min, max, startPosition[];
320     StringBuffer JavaDoc minBuffer, maxBuffer;
321     SyntaxNode root = null;
322     CatNode catNode;
323
324     __match('{');
325
326     min = __parseUnsignedInteger(10, 1, Integer.MAX_VALUE);
327     startPosition = new int[1];
328     startPosition[0] = __position;
329
330     if(__lookahead == '}'){
331       // Match exactly min times. Concatenate the atom min times.
332
__match('}');
333
334       if(min == 0)
335     throw
336       new MalformedPatternException(
337               "Parse error: Superfluous interval specified at position " +
338               __bytesRead + ". Number of occurences was set to zero.");
339
340       if(min == 1)
341     return atom;
342
343       root = catNode = new CatNode();
344       catNode._left = atom;
345
346       while(--min > 1) {
347     atom = atom._clone(startPosition);
348
349     catNode._right = new CatNode();
350     catNode = (CatNode)catNode._right;
351     catNode._left = atom;
352       }
353
354       catNode._right = atom._clone(startPosition);
355     } else if(__lookahead == ','){
356       __match(',');
357
358       if(__lookahead == '}') {
359     // match at least min times
360
__match('}');
361
362     if(min == 0)
363       return new StarNode(atom);
364
365     if(min == 1)
366       return new PlusNode(atom);
367
368     root = catNode = new CatNode();
369     catNode._left = atom;
370
371     while(--min > 0) {
372       atom = atom._clone(startPosition);
373
374       catNode._right = new CatNode();
375       catNode = (CatNode)catNode._right;
376       catNode._left = atom;
377     }
378
379     catNode._right = new StarNode(atom._clone(startPosition));
380       } else {
381     // match at least min times and at most max times
382
max = __parseUnsignedInteger(10, 1, Integer.MAX_VALUE);
383     __match('}');
384
385     if(max < min)
386       throw
387         new MalformedPatternException("Parse error: invalid interval; "
388          + max + " is less than " + min + " at position " + __bytesRead);
389     if(max == 0)
390       throw
391         new MalformedPatternException(
392         "Parse error: Superfluous interval specified at position " +
393         __bytesRead + ". Number of occurences was set to zero.");
394
395     if(min == 0) {
396       if(max == 1)
397         return new QuestionNode(atom);
398
399       root = catNode = new CatNode();
400       atom = new QuestionNode(atom);
401       catNode._left = atom;
402
403       while(--max > 1) {
404         atom = atom._clone(startPosition);
405
406         catNode._right = new CatNode();
407         catNode = (CatNode)catNode._right;
408         catNode._left = atom;
409       }
410
411       catNode._right = atom._clone(startPosition);
412     } else if(min == max) {
413       if(min == 1)
414         return atom;
415
416       root = catNode = new CatNode();
417       catNode._left = atom;
418
419       while(--min > 1) {
420         atom = atom._clone(startPosition);
421
422         catNode._right = new CatNode();
423         catNode = (CatNode)catNode._right;
424         catNode._left = atom;
425       }
426
427       catNode._right = atom._clone(startPosition);
428     } else {
429       int count;
430
431       root = catNode = new CatNode();
432       catNode._left = atom;
433
434       for(count=1; count < min; count++) {
435         atom = atom._clone(startPosition);
436
437         catNode._right = new CatNode();
438         catNode = (CatNode)catNode._right;
439         catNode._left = atom;
440       }
441
442       atom = new QuestionNode(atom._clone(startPosition));
443
444       count = max-min;
445
446       if(count == 1)
447         catNode._right = atom;
448       else {
449         catNode._right = new CatNode();
450         catNode = (CatNode)catNode._right;
451         catNode._left = atom;
452
453         while(--count > 1) {
454           atom = atom._clone(startPosition);
455
456           catNode._right = new CatNode();
457           catNode = (CatNode)catNode._right;
458           catNode._left = atom;
459         }
460
461         catNode._right = atom._clone(startPosition);
462       }
463     }
464       }
465     } else
466       throw
467     new MalformedPatternException("Parse error: unexpected character " +
468         __lookahead + " in interval at position " + __bytesRead);
469     __position = startPosition[0];
470     return root;
471   }
472
473
474   private SyntaxNode __backslashToken() throws MalformedPatternException {
475     SyntaxNode current;
476     char token;
477     int number;
478
479     __match('\\');
480
481     if(__lookahead == 'x'){
482       __match('x');
483       // Parse a hexadecimal number
484
current = _newTokenNode((char)__parseUnsignedInteger(16, 2, 2),
485                  __position++);
486     } else if(__lookahead == 'c') {
487       __match('c');
488       // Create a control character
489
token = Character.toUpperCase(__lookahead);
490       token = (char)(token > 63 ? token - 64 : token + 64);
491       current = new TokenNode(token, __position++);
492       __match(__lookahead);
493     } else if(__lookahead >= '0' && __lookahead <= '9') {
494       __match(__lookahead);
495
496       if(__lookahead >= '0' && __lookahead <= '9'){
497     // We have an octal character or a multi-digit backreference.
498
// Assume octal character for now.
499
__putback();
500     number = __parseUnsignedInteger(10, 2, 3);
501     number = Integer.parseInt(Integer.toString(number), 8);
502     current = _newTokenNode((char)number, __position++);
503       } else {
504     // We have either \0, an escaped digit, or a backreference.
505
__putback();
506     if(__lookahead == '0'){
507       // \0 matches the null character
508
__match('0');
509       current = new TokenNode('\0', __position++);
510     } else {
511       // Either an escaped digit or backreference.
512
number = Character.digit(__lookahead, 10);
513       current = _newTokenNode(__lookahead, __position++);
514       __match(__lookahead);
515     }
516       }
517     } else if(__lookahead == 'b') {
518       // Inside of a character class the \b means backspace, otherwise
519
// it means a word boundary
520
//if(__inCharacterClass)
521
// \b always means backspace
522
current = new TokenNode('\b', __position++);
523       /*
524       else
525     current = new TokenNode((char)LeafNode._WORD_BOUNDARY_MARKER_TOKEN,
526                 position++);
527                 */

528       __match('b');
529     } /*else if(__lookahead == 'B' && !__inCharacterClass){
530       current = new TokenNode((char)LeafNode._NONWORD_BOUNDARY_MARKER_TOKEN,
531                   position++);
532       __match('B');
533     } */
else {
534       CharacterClassNode characterSet;
535       token = __lookahead;
536
537       switch(__lookahead){
538       case 'n' : token = '\n'; break;
539       case 'r' : token = '\r'; break;
540       case 't' : token = '\t'; break;
541       case 'f' : token = '\f'; break;
542       }
543
544       switch(token) {
545       case 'd' :
546     characterSet = new CharacterClassNode(__position++);
547     characterSet._addTokenRange('0', '9');
548     current = characterSet;
549     break;
550       case 'D' :
551     characterSet = new NegativeCharacterClassNode(__position++);
552     characterSet._addTokenRange('0', '9');
553     current = characterSet;
554     break;
555       case 'w' :
556     characterSet = new CharacterClassNode(__position++);
557     characterSet._addTokenRange('0', '9');
558     characterSet._addTokenRange('a', 'z');
559     characterSet._addTokenRange('A', 'Z');
560     characterSet._addToken('_');
561     current = characterSet;
562     break;
563       case 'W' :
564     characterSet = new NegativeCharacterClassNode(__position++);
565     characterSet._addTokenRange('0', '9');
566     characterSet._addTokenRange('a', 'z');
567     characterSet._addTokenRange('A', 'Z');
568     characterSet._addToken('_');
569     current = characterSet;
570     break;
571       case 's' :
572     characterSet = new CharacterClassNode(__position++);
573     characterSet._addToken(' ');
574     characterSet._addToken('\f');
575     characterSet._addToken('\n');
576     characterSet._addToken('\r');
577     characterSet._addToken('\t');
578     current = characterSet;
579     break;
580       case 'S' :
581     characterSet = new NegativeCharacterClassNode(__position++);
582     characterSet._addToken(' ');
583     characterSet._addToken('\f');
584     characterSet._addToken('\n');
585     characterSet._addToken('\r');
586     characterSet._addToken('\t');
587     current = characterSet;
588     break;
589     default : current = _newTokenNode(token, __position++); break;
590       }
591
592       __match(__lookahead);
593     }
594
595     return current;
596   }
597
598   private SyntaxNode __atom() throws MalformedPatternException {
599     SyntaxNode current;
600
601     if(__lookahead == '(') {
602       __match('(');
603       ++__openParen;
604       current = __regex();
605       __match(')');
606       ++__closeParen;
607     } else if(__lookahead == '[')
608       current = __characterClass();
609     else if(__lookahead == '.') {
610       CharacterClassNode characterSet;
611
612       __match('.');
613       characterSet = new NegativeCharacterClassNode(__position++);
614       characterSet._addToken('\n');
615       current = characterSet;
616     } else if(__lookahead == '\\') {
617       current = __backslashToken();
618     } /*else if(__lookahead == '^') {
619       current =
620     new TokenNode((char)LeafNode._BEGIN_LINE_MARKER_TOKEN, __position++);
621       __match('^');
622     } else if(__lookahead == '$') {
623       current =
624     new TokenNode((char)LeafNode._END_LINE_MARKER_TOKEN, __position++);
625       __match('$');
626     } */
else if(!__isMetachar(__lookahead)) {
627       current = _newTokenNode(__lookahead, __position++);
628       __match(__lookahead);
629     } else
630       throw
631     new MalformedPatternException("Parse error: unexpected character " +
632                 __lookahead + " at position " + __bytesRead);
633
634     return current;
635   }
636
637
638   private SyntaxNode __characterClass() throws MalformedPatternException {
639     char lastToken, token;
640     SyntaxNode node;
641     CharacterClassNode current;
642
643     __match('[');
644     __inCharacterClass = true;
645
646     if(__lookahead == '^'){
647       __match('^');
648       current = new NegativeCharacterClassNode(__position++);
649     } else
650       current = new CharacterClassNode(__position++);
651
652     while(__lookahead != ']' && __lookahead != _END_OF_INPUT) {
653
654       if(__lookahead == '\\'){
655     node = __backslashToken();
656     --__position;
657
658     // __backslashToken() (actually newTokenNode()) does not take care of
659
// case insensitivity when __inCharacterClass is true.
660
if(node instanceof TokenNode){
661       lastToken = ((TokenNode)node)._token;
662       current._addToken(lastToken);
663       if(!__caseSensitive)
664         current._addToken(_toggleCase(lastToken));
665     } else {
666       CharacterClassNode slash;
667       slash = (CharacterClassNode)node;
668       // This could be made more efficient by manipulating the
669
// characterSet elements of the CharacterClassNodes but
670
// for the moment, this is more clear.
671
for(token=0; token < LeafNode._NUM_TOKENS; token++){
672         if(slash._matches(token))
673           current._addToken(token);
674       }
675
676       // A byproduct of this act is that when a '-' occurs after
677
// a \d, \w, etc. it is not interpreted as a range and no
678
// parse exception is thrown.
679
// This is considered a feature and not a bug for now.
680
continue;
681     }
682       } else {
683     lastToken = __lookahead;
684     current._addToken(__lookahead);
685     if(!__caseSensitive)
686       current._addToken(_toggleCase(__lookahead));
687     __match(__lookahead);
688       }
689
690       // In Perl, a - is a token if it occurs at the beginning
691
// or end of the character class. Anywhere else, it indicates
692
// a range.
693
// A byproduct of this implementation is that if a '-' occurs
694
// after the end of a range, it is interpreted as a '-' and no
695
// exception is thrown. e.g., the second dash in [a-z-x]
696
// This is considered a feature and not a bug for now.
697
if(__lookahead == '-'){
698     __match('-');
699     if(__lookahead == ']'){
700       current._addToken('-');
701       break;
702     } else if(__lookahead == '\\') {
703       node = __backslashToken();
704       --__position;
705       if(node instanceof TokenNode)
706         token = ((TokenNode)node)._token;
707       else
708         throw new MalformedPatternException(
709        "Parse error: invalid range specified at position " + __bytesRead);
710     } else {
711       token = __lookahead;
712       __match(__lookahead);
713     }
714
715     if(token < lastToken)
716       throw new MalformedPatternException(
717      "Parse error: invalid range specified at position " + __bytesRead);
718     current._addTokenRange(lastToken + 1, token);
719     if(!__caseSensitive)
720       current._addTokenRange(_toggleCase((char)(lastToken + 1)),
721                 _toggleCase(token));
722       }
723     }
724
725     __match(']');
726     __inCharacterClass = false;
727     return current;
728   }
729
730
731   SyntaxNode _newTokenNode(char token, int position){
732     if(!__inCharacterClass && !__caseSensitive &&
733        (_isUpperCase(token) || _isLowerCase(token))){
734       CharacterClassNode node = new CharacterClassNode(position);
735       node._addToken(token);
736       node._addToken(_toggleCase(token));
737       return node;
738     }
739
740     return new TokenNode(token, position);
741   }
742
743
744   SyntaxTree _parse(char[] expression) throws MalformedPatternException {
745     SyntaxTree tree;
746
747     __openParen = __closeParen = 0;
748     __regularExpression = expression;
749     __bytesRead = 0;
750     __expressionLength = expression.length;
751     __inCharacterClass = false;
752
753     __position = 0;
754     __match(__lookahead); // Call match to read first input.
755

756     if(__lookahead == '^') {
757       __beginAnchor = true;
758       __match(__lookahead);
759     }
760
761     if(__expressionLength > 0 && expression[__expressionLength - 1] == '$') {
762       --__expressionLength;
763       __endAnchor = true;
764     }
765
766     if(__expressionLength > 1 || (__expressionLength == 1 && !__beginAnchor)) {
767       CatNode root;
768       root = new CatNode();
769       root._left = __regex();
770       // end marker
771
root._right =
772     new TokenNode((char)LeafNode._END_MARKER_TOKEN, __position++);
773       tree = new SyntaxTree(root, __position);
774     } else
775       tree = new
776     SyntaxTree(new TokenNode((char)LeafNode._END_MARKER_TOKEN, 0), 1);
777
778     tree._computeFollowPositions();
779
780     return tree;
781   }
782
783
784   /**
785    * Compiles an Awk regular expression into an AwkPattern instance that
786    * can be used by an AwkMatcher object to perform pattern matching.
787    * <p>
788    * @param pattern An Awk regular expression to compile.
789    * @param options A set of flags giving the compiler instructions on
790    * how to treat the regular expression. Currently the
791    * only meaningful flag is AwkCompiler.CASE_INSENSITIVE_MASK.
792    * @return A Pattern instance constituting the compiled regular expression.
793    * This instance will always be an AwkPattern and can be reliably
794    * be casted to an AwkPattern.
795    * @exception MalformedPatternException If the compiled expression
796    * is not a valid Awk regular expression.
797    */

798   public Pattern compile(char[] pattern, int options)
799        throws MalformedPatternException
800   {
801     SyntaxTree tree;
802     AwkPattern regexp;
803
804     __beginAnchor = __endAnchor = false;
805     __caseSensitive = ((options & CASE_INSENSITIVE_MASK) == 0);
806     tree = _parse(pattern);
807     regexp = new AwkPattern(new String JavaDoc(pattern), tree);
808     regexp._options = options;
809     regexp._hasBeginAnchor = __beginAnchor;
810     regexp._hasEndAnchor = __endAnchor;
811
812     return regexp;
813   }
814
815
816   /**
817    * Compiles an Awk regular expression into an AwkPattern instance that
818    * can be used by an AwkMatcher object to perform pattern matching.
819    * <p>
820    * @param pattern An Awk regular expression to compile.
821    * @param options A set of flags giving the compiler instructions on
822    * how to treat the regular expression. Currently the
823    * only meaningful flag is AwkCompiler.CASE_INSENSITIVE_MASK.
824    * @return A Pattern instance constituting the compiled regular expression.
825    * This instance will always be an AwkPattern and can be reliably
826    * be casted to an AwkPattern.
827    * @exception MalformedPatternException If the compiled expression
828    * is not a valid Awk regular expression.
829    */

830   public Pattern compile(String JavaDoc pattern, int options)
831        throws MalformedPatternException
832   {
833     SyntaxTree tree;
834     AwkPattern regexp;
835
836     __beginAnchor = __endAnchor = false;
837     __caseSensitive = ((options & CASE_INSENSITIVE_MASK) == 0);
838     tree = _parse(pattern.toCharArray());
839     regexp = new AwkPattern(pattern, tree);
840     regexp._options = options;
841     regexp._hasBeginAnchor = __beginAnchor;
842     regexp._hasEndAnchor = __endAnchor;
843
844     return regexp;
845   }
846
847   /**
848    * Same as calling <b>compile(pattern, AwkCompiler.DEFAULT_MASK);</b>
849    * <p>
850    * @param pattern A regular expression to compile.
851    * @return A Pattern instance constituting the compiled regular expression.
852    * This instance will always be an AwkPattern and can be reliably
853    * be casted to an AwkPattern.
854    * @exception MalformedPatternException If the compiled expression
855    * is not a valid Awk regular expression.
856    */

857   public Pattern compile(char[] pattern) throws MalformedPatternException {
858     return compile(pattern, DEFAULT_MASK);
859   }
860
861
862   /**
863    * Same as calling <b>compile(pattern, AwkCompiler.DEFAULT_MASK);</b>
864    * <p>
865    * @param pattern A regular expression to compile.
866    * @return A Pattern instance constituting the compiled regular expression.
867    * This instance will always be an AwkPattern and can be reliably
868    * be casted to an AwkPattern.
869    * @exception MalformedPatternException If the compiled expression
870    * is not a valid Awk regular expression.
871    */

872   public Pattern compile(String JavaDoc pattern) throws MalformedPatternException {
873     return compile(pattern, DEFAULT_MASK);
874   }
875
876 }
877
Popular Tags