KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > percederberg > grammatica > parser > re > RegExp


1 /*
2  * RegExp.java
3  *
4  * This library is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU Lesser General Public License
6  * as published by the Free Software Foundation; either version 2.1
7  * of the License, or (at your option) any later version.
8  *
9  * This library is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12  * Lesser General Public License for more details.
13  *
14  * You should have received a copy of the GNU Lesser General Public
15  * License along with this library; if not, write to the Free
16  * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
17  * MA 02111-1307, USA.
18  *
19  * Copyright (c) 2003-2005 Per Cederberg. All rights reserved.
20  */

21
22 package net.percederberg.grammatica.parser.re;
23
24 import java.io.PrintWriter JavaDoc;
25 import java.io.Reader JavaDoc;
26 import java.io.StringReader JavaDoc;
27 import java.io.StringWriter JavaDoc;
28 import java.util.ArrayList JavaDoc;
29
30 import net.percederberg.grammatica.parser.LookAheadReader;
31
32 /**
33  * A regular expression. This class creates and holds an internal
34  * data structure representing a regular expression. It also allows
35  * creating matchers. This class is thread-safe. Multiple matchers may
36  * operate simultanously on the same regular expression.
37  *
38  * @author Per Cederberg, <per at percederberg dot net>
39  * @version 1.5
40  */

41 public class RegExp {
42
43     /**
44      * The base regular expression element.
45      */

46     private Element element;
47
48     /**
49      * The regular expression pattern.
50      */

51     private String JavaDoc pattern;
52
53     /**
54      * The character case ignore flag.
55      */

56     private boolean ignoreCase;
57
58     /**
59      * The current position in the pattern. This variable is used by
60      * the parsing methods.
61      */

62     private int pos;
63
64     /**
65      * Creates a new case-sensitive regular expression.
66      *
67      * @param pattern the regular expression pattern
68      *
69      * @throws RegExpException if the regular expression couldn't be
70      * parsed correctly
71      */

72     public RegExp(String JavaDoc pattern) throws RegExpException {
73         this(pattern, false);
74     }
75
76     /**
77      * Creates a new regular expression. The regular expression can be
78      * either case-sensitive or case-insensitive.
79      *
80      * @param pattern the regular expression pattern
81      * @param ignoreCase the character case ignore flag
82      *
83      * @throws RegExpException if the regular expression couldn't be
84      * parsed correctly
85      *
86      * @since 1.5
87      */

88     public RegExp(String JavaDoc pattern, boolean ignoreCase)
89         throws RegExpException {
90
91         this.pattern = pattern;
92         this.ignoreCase = ignoreCase;
93         this.pos = 0;
94         this.element = parseExpr();
95         if (pos < pattern.length()) {
96             throw new RegExpException(
97                 RegExpException.UNEXPECTED_CHARACTER,
98                 pos,
99                 pattern);
100         }
101     }
102
103     /**
104      * Creates a new matcher for the specified string.
105      *
106      * @param str the string to work with
107      *
108      * @return the regular expresion matcher
109      *
110      * @deprecated The CharBuffer class has been deprecated in favor
111      * of LookAheadReader as of version 1.5. Create a LookAheadReader
112      * and use the matcher(Reader) method instead of this one.
113      */

114     public Matcher matcher(CharBuffer str) {
115         return matcher(str.toString());
116     }
117
118     /**
119      * Creates a new matcher for the specified string.
120      *
121      * @param str the string to work with
122      *
123      * @return the regular expresion matcher
124      */

125     public Matcher matcher(String JavaDoc str) {
126         return matcher(new StringReader JavaDoc(str));
127     }
128
129     /**
130      * Creates a new matcher for the specified string.
131      *
132      * @param str the string to work with
133      *
134      * @return the regular expresion matcher
135      */

136     public Matcher matcher(StringBuffer JavaDoc str) {
137         return matcher(new StringReader JavaDoc(str.toString()));
138     }
139
140     /**
141      * Creates a new matcher for the specified character input stream.
142      *
143      * @param input the character input stream
144      *
145      * @return the regular expresion matcher
146      *
147      * @since 1.5
148      */

149     public Matcher matcher(Reader JavaDoc input) {
150         if (input instanceof LookAheadReader) {
151             return matcher((LookAheadReader) input);
152         } else {
153             return matcher(new LookAheadReader(input));
154         }
155     }
156
157     /**
158      * Creates a new matcher for the specified look-ahead character
159      * input stream.
160      *
161      * @param input the character input stream
162      *
163      * @return the regular expresion matcher
164      *
165      * @since 1.5
166      */

167     private Matcher matcher(LookAheadReader input) {
168         return new Matcher((Element) element.clone(), input, ignoreCase);
169     }
170
171     /**
172      * Returns a string representation of the regular expression.
173      *
174      * @return a string representation of the regular expression
175      */

176     public String JavaDoc toString() {
177         StringWriter JavaDoc str;
178
179         str = new StringWriter JavaDoc();
180         str.write("Regular Expression\n");
181         str.write(" Pattern: " + pattern + "\n");
182         str.write(" Flags:");
183         if (ignoreCase) {
184             str.write(" caseignore");
185         }
186         str.write("\n");
187         str.write(" Compiled:\n");
188         element.printTo(new PrintWriter JavaDoc(str), " ");
189         return str.toString();
190     }
191
192     /**
193      * Parses a regular expression. This method handles the Expr
194      * production in the grammar (see regexp.grammar).
195      *
196      * @return the element representing this expression
197      *
198      * @throws RegExpException if an error was encountered in the
199      * pattern string
200      */

201     private Element parseExpr() throws RegExpException {
202         Element first;
203         Element second;
204
205         first = parseTerm();
206         if (peekChar(0) != '|') {
207             return first;
208         } else {
209             readChar('|');
210             second = parseExpr();
211             return new AlternativeElement(first, second);
212         }
213     }
214
215     /**
216      * Parses a regular expression term. This method handles the
217      * Term production in the grammar (see regexp.grammar).
218      *
219      * @return the element representing this term
220      *
221      * @throws RegExpException if an error was encountered in the
222      * pattern string
223      */

224     private Element parseTerm() throws RegExpException {
225         ArrayList JavaDoc list = new ArrayList JavaDoc();
226
227         list.add(parseFact());
228         while (true) {
229             switch (peekChar(0)) {
230             case -1:
231             case ')':
232             case ']':
233             case '{':
234             case '}':
235             case '?':
236             case '+':
237             case '|':
238                 return combineElements(list);
239             default:
240                 list.add(parseFact());
241             }
242         }
243     }
244
245     /**
246      * Parses a regular expression factor. This method handles the
247      * Fact production in the grammar (see regexp.grammar).
248      *
249      * @return the element representing this factor
250      *
251      * @throws RegExpException if an error was encountered in the
252      * pattern string
253      */

254     private Element parseFact() throws RegExpException {
255         Element elem;
256
257         elem = parseAtom();
258         switch (peekChar(0)) {
259         case '?':
260         case '*':
261         case '+':
262         case '{':
263             return parseAtomModifier(elem);
264         default:
265             return elem;
266         }
267     }
268
269     /**
270      * Parses a regular expression atom. This method handles the
271      * Atom production in the grammar (see regexp.grammar).
272      *
273      * @return the element representing this atom
274      *
275      * @throws RegExpException if an error was encountered in the
276      * pattern string
277      */

278     private Element parseAtom() throws RegExpException {
279         Element elem;
280
281         switch (peekChar(0)) {
282         case '.':
283             readChar('.');
284             return CharacterSetElement.DOT;
285         case '(':
286             readChar('(');
287             elem = parseExpr();
288             readChar(')');
289             return elem;
290         case '[':
291             readChar('[');
292             elem = parseCharSet();
293             readChar(']');
294             return elem;
295         case -1:
296         case ')':
297         case ']':
298         case '{':
299         case '}':
300         case '?':
301         case '*':
302         case '+':
303         case '|':
304             throw new RegExpException(
305                 RegExpException.UNEXPECTED_CHARACTER,
306                 pos,
307                 pattern);
308         default:
309             return parseChar();
310         }
311     }
312
313     /**
314      * Parses a regular expression atom modifier. This method handles
315      * the AtomModifier production in the grammar (see regexp.grammar).
316      *
317      * @param elem the element to modify
318      *
319      * @return the modified element
320      *
321      * @throws RegExpException if an error was encountered in the
322      * pattern string
323      */

324     private Element parseAtomModifier(Element elem) throws RegExpException {
325         int min = 0;
326         int max = -1;
327         int type = RepeatElement.GREEDY;
328         int firstPos;
329
330         // Read min and max
331
switch (readChar()) {
332         case '?':
333             min = 0;
334             max = 1;
335             break;
336         case '*':
337             min = 0;
338             max = -1;
339             break;
340         case '+':
341             min = 1;
342             max = -1;
343             break;
344         case '{':
345             firstPos = pos - 1;
346             min = readNumber();
347             max = min;
348             if (peekChar(0) == ',') {
349                 readChar(',');
350                 max = -1;
351                 if (peekChar(0) != '}') {
352                     max = readNumber();
353                 }
354             }
355             readChar('}');
356             if (max == 0 || (max > 0 && min > max)) {
357                 throw new RegExpException(
358                     RegExpException.INVALID_REPEAT_COUNT,
359                     firstPos,
360                     pattern);
361             }
362             break;
363         default:
364             throw new RegExpException(
365                 RegExpException.UNEXPECTED_CHARACTER,
366                 pos - 1,
367                 pattern);
368         }
369
370         // Read operator mode
371
if (peekChar(0) == '?') {
372             readChar('?');
373             type = RepeatElement.RELUCTANT;
374         } else if (peekChar(0) == '+') {
375             readChar('+');
376             type = RepeatElement.POSSESSIVE;
377         }
378
379         return new RepeatElement(elem, min, max, type);
380     }
381
382     /**
383      * Parses a regular expression character set. This method handles
384      * the contents of the '[...]' construct in a regular expression.
385      *
386      * @return the element representing this character set
387      *
388      * @throws RegExpException if an error was encountered in the
389      * pattern string
390      */

391     private Element parseCharSet() throws RegExpException {
392         CharacterSetElement charset;
393         Element elem;
394         boolean repeat = true;
395         char start;
396         char end;
397
398         if (peekChar(0) == '^') {
399             readChar('^');
400             charset = new CharacterSetElement(true);
401         } else {
402             charset = new CharacterSetElement(false);
403         }
404
405         while (peekChar(0) > 0 && repeat) {
406             start = (char) peekChar(0);
407             switch (start) {
408             case ']':
409                 repeat = false;
410                 break;
411             case '\\':
412                 elem = parseEscapeChar();
413                 if (elem instanceof StringElement) {
414                     charset.addCharacters((StringElement) elem);
415                 } else {
416                     charset.addCharacterSet((CharacterSetElement) elem);
417                 }
418                 break;
419             default:
420                 readChar(start);
421                 if (peekChar(0) == '-'
422                  && peekChar(1) > 0
423                  && peekChar(1) != ']') {
424
425                     readChar('-');
426                     end = readChar();
427                     charset.addRange(fixChar(start), fixChar(end));
428                 } else {
429                     charset.addCharacter(fixChar(start));
430                 }
431             }
432         }
433
434         return charset;
435     }
436
437     /**
438      * Parses a regular expression character. This method handles
439      * a single normal character in a regular expression.
440      *
441      * @return the element representing this character
442      *
443      * @throws RegExpException if an error was encountered in the
444      * pattern string
445      */

446     private Element parseChar() throws RegExpException {
447         switch (peekChar(0)) {
448         case '\\':
449             return parseEscapeChar();
450         case '^':
451         case '$':
452             throw new RegExpException(
453                 RegExpException.UNSUPPORTED_SPECIAL_CHARACTER,
454                 pos,
455                 pattern);
456         default:
457             return new StringElement(fixChar(readChar()));
458         }
459     }
460
461     /**
462      * Parses a regular expression character escape. This method
463      * handles a single character escape in a regular expression.
464      *
465      * @return the element representing this character escape
466      *
467      * @throws RegExpException if an error was encountered in the
468      * pattern string
469      */

470     private Element parseEscapeChar() throws RegExpException {
471         char c;
472         String JavaDoc str;
473
474         readChar('\\');
475         c = readChar();
476         switch (c) {
477         case '0':
478             c = readChar();
479             if (c < '0' || c > '3') {
480                 throw new RegExpException(
481                     RegExpException.UNSUPPORTED_ESCAPE_CHARACTER,
482                     pos - 3,
483                     pattern);
484             }
485             str = String.valueOf(c);
486             c = (char) peekChar(0);
487             if ('0' <= c && c <= '7') {
488                 str += String.valueOf(readChar());
489                 c = (char) peekChar(0);
490                 if ('0' <= c && c <= '7') {
491                     str += String.valueOf(readChar());
492                 }
493             }
494             try {
495                 c = (char) Integer.parseInt(str, 8);
496                 return new StringElement(fixChar(c));
497             } catch (NumberFormatException JavaDoc e) {
498                 throw new RegExpException(
499                     RegExpException.UNSUPPORTED_ESCAPE_CHARACTER,
500                     pos - str.length() - 2,
501                     pattern);
502             }
503         case 'x':
504             str = String.valueOf(readChar()) +
505                   String.valueOf(readChar());
506             try {
507                 c = (char) Integer.parseInt(str, 16);
508                 return new StringElement(fixChar(c));
509             } catch (NumberFormatException JavaDoc e) {
510                 throw new RegExpException(
511                     RegExpException.UNSUPPORTED_ESCAPE_CHARACTER,
512                     pos - str.length() - 2,
513                     pattern);
514             }
515         case 'u':
516             str = String.valueOf(readChar()) +
517                   String.valueOf(readChar()) +
518                   String.valueOf(readChar()) +
519                   String.valueOf(readChar());
520             try {
521                 c = (char) Integer.parseInt(str, 16);
522                 return new StringElement(fixChar(c));
523             } catch (NumberFormatException JavaDoc e) {
524                 throw new RegExpException(
525                     RegExpException.UNSUPPORTED_ESCAPE_CHARACTER,
526                     pos - str.length() - 2,
527                     pattern);
528             }
529         case 't':
530             return new StringElement('\t');
531         case 'n':
532             return new StringElement('\n');
533         case 'r':
534             return new StringElement('\r');
535         case 'f':
536             return new StringElement('\f');
537         case 'a':
538             return new StringElement('\u0007');
539         case 'e':
540             return new StringElement('\u001B');
541         case 'd':
542             return CharacterSetElement.DIGIT;
543         case 'D':
544             return CharacterSetElement.NON_DIGIT;
545         case 's':
546             return CharacterSetElement.WHITESPACE;
547         case 'S':
548             return CharacterSetElement.NON_WHITESPACE;
549         case 'w':
550             return CharacterSetElement.WORD;
551         case 'W':
552             return CharacterSetElement.NON_WORD;
553         default:
554             if (('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z')) {
555                 throw new RegExpException(
556                     RegExpException.UNSUPPORTED_ESCAPE_CHARACTER,
557                     pos - 2,
558                     pattern);
559             }
560             return new StringElement(fixChar(c));
561         }
562     }
563
564     /**
565      * Adjusts a character for inclusion in a string or character set
566      * element. For case-insensitive regular expressions, this
567      * transforms the character to lower-case.
568      *
569      * @param c the input character
570      *
571      * @return the adjusted character
572      */

573     private char fixChar(char c) {
574         return ignoreCase ? Character.toLowerCase(c) : c;
575     }
576
577     /**
578      * Reads a number from the pattern. If the next character isn't a
579      * numeric character, an exception is thrown. This method reads
580      * several consecutive numeric characters.
581      *
582      * @return the numeric value read
583      *
584      * @throws RegExpException if an error was encountered in the
585      * pattern string
586      */

587     private int readNumber() throws RegExpException {
588         StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
589         int c;
590
591         c = peekChar(0);
592         while ('0' <= c && c <= '9') {
593             buf.append(readChar());
594             c = peekChar(0);
595         }
596         if (buf.length() <= 0) {
597             throw new RegExpException(
598                 RegExpException.UNEXPECTED_CHARACTER,
599                 pos,
600                 pattern);
601         }
602         return Integer.parseInt(buf.toString());
603     }
604
605     /**
606      * Reads the next character in the pattern. If no next character
607      * exists, an exception is thrown.
608      *
609      * @return the character read
610      *
611      * @throws RegExpException if no next character was available in
612      * the pattern string
613      */

614     private char readChar() throws RegExpException {
615         int c = peekChar(0);
616
617         if (c < 0) {
618             throw new RegExpException(
619                 RegExpException.UNTERMINATED_PATTERN,
620                 pos,
621                 pattern);
622         } else {
623             pos++;
624             return (char) c;
625         }
626     }
627
628     /**
629      * Reads the next character in the pattern. If the character
630      * wasn't the specified one, an exception is thrown.
631      *
632      * @param c the character to read
633      *
634      * @return the character read
635      *
636      * @throws RegExpException if the character read didn't match the
637      * specified one, or if no next character was
638      * available in the pattern string
639      */

640     private char readChar(char c) throws RegExpException {
641         if (c != readChar()) {
642             throw new RegExpException(
643                 RegExpException.UNEXPECTED_CHARACTER,
644                 pos - 1,
645                 pattern);
646         }
647         return c;
648     }
649
650     /**
651      * Returns a character that has not yet been read from the
652      * pattern. If the requested position is beyond the end of the
653      * pattern string, -1 is returned.
654      *
655      * @param count the preview position, from zero (0)
656      *
657      * @return the character found, or
658      * -1 if beyond the end of the pattern string
659      */

660     private int peekChar(int count) {
661         if (pos + count < pattern.length()) {
662             return pattern.charAt(pos + count);
663         } else {
664             return -1;
665         }
666     }
667
668     /**
669      * Combines a list of elements. This method takes care to always
670      * concatenate adjacent string elements into a single string
671      * element.
672      *
673      * @param list the list with elements
674      *
675      * @return the combined element
676      */

677     private Element combineElements(ArrayList JavaDoc list) {
678         Element prev;
679         Element elem;
680         String JavaDoc str;
681         int i;
682
683         // Concatenate string elements
684
prev = (Element) list.get(0);
685         for (i = 1; i < list.size(); i++) {
686             elem = (Element) list.get(i);
687             if (prev instanceof StringElement
688              && elem instanceof StringElement) {
689
690                 str = ((StringElement) prev).getString() +
691                       ((StringElement) elem).getString();
692                 elem = new StringElement(str);
693                 list.remove(i);
694                 list.set(i - 1, elem);
695                 i--;
696             }
697             prev = elem;
698         }
699
700         // Combine all remaining elements
701
elem = (Element) list.get(list.size() - 1);
702         for (i = list.size() - 2; i >= 0; i--) {
703             prev = (Element) list.get(i);
704             elem = new CombineElement(prev, elem);
705         }
706         return elem;
707     }
708 }
709
Popular Tags