KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > jcorporate > expresso > ext > regexp > RECompiler


1 /* ====================================================================
2  * The Jcorporate Apache Style Software License, Version 1.2 05-07-2002
3  *
4  * Copyright (c) 1995-2002 Jcorporate Ltd. All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * 3. The end-user documentation included with the redistribution,
19  * if any, must include the following acknowledgment:
20  * "This product includes software developed by Jcorporate Ltd.
21  * (http://www.jcorporate.com/)."
22  * Alternately, this acknowledgment may appear in the software itself,
23  * if and wherever such third-party acknowledgments normally appear.
24  *
25  * 4. "Jcorporate" and product names such as "Expresso" must
26  * not be used to endorse or promote products derived from this
27  * software without prior written permission. For written permission,
28  * please contact info@jcorporate.com.
29  *
30  * 5. Products derived from this software may not be called "Expresso",
31  * or other Jcorporate product names; nor may "Expresso" or other
32  * Jcorporate product names appear in their name, without prior
33  * written permission of Jcorporate Ltd.
34  *
35  * 6. No product derived from this software may compete in the same
36  * market space, i.e. framework, without prior written permission
37  * of Jcorporate Ltd. For written permission, please contact
38  * partners@jcorporate.com.
39  *
40  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
41  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
42  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
43  * DISCLAIMED. IN NO EVENT SHALL JCORPORATE LTD OR ITS CONTRIBUTORS
44  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
45  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
46  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
47  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
48  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
49  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
50  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51  * SUCH DAMAGE.
52  * ====================================================================
53  *
54  * This software consists of voluntary contributions made by many
55  * individuals on behalf of the Jcorporate Ltd. Contributions back
56  * to the project(s) are encouraged when you make modifications.
57  * Please send them to support@jcorporate.com. For more information
58  * on Jcorporate Ltd. and its products, please see
59  * <http://www.jcorporate.com/>.
60  *
61  * Portions of this software are based upon other open source
62  * products and are subject to their respective licenses.
63  */

64
65 package com.jcorporate.expresso.ext.regexp;
66
67 /*
68  * ====================================================================
69  *
70  * The Apache Software License, Version 1.1
71  *
72  * Copyright (c) 1999 The Apache Software Foundation. All rights
73  * reserved.
74  *
75  * Redistribution and use in source and binary forms, with or without
76  * modification, are permitted provided that the following conditions
77  * are met:
78  *
79  * 1. Redistributions of source code must retain the above copyright
80  * notice, this list of conditions and the following disclaimer.
81  *
82  * 2. Redistributions in binary form must reproduce the above copyright
83  * notice, this list of conditions and the following disclaimer in
84  * the documentation and/or other materials provided with the
85  * distribution.
86  *
87  * 3. The end-user documentation included with the redistribution, if
88  * any, must include the following acknowlegement:
89  * "This product includes software developed by the
90  * Apache Software Foundation (http://www.apache.org/)."
91  * Alternately, this acknowlegement may appear in the software itself,
92  * if and wherever such third-party acknowlegements normally appear.
93  *
94  * 4. The names "The Jakarta Project", "Jakarta-Regexp", and "Apache Software
95  * Foundation" must not be used to endorse or promote products derived
96  * from this software without prior written permission. For written
97  * permission, please contact apache@apache.org.
98  *
99  * 5. Products derived from this software may not be called "Apache"
100  * nor may "Apache" appear in their names without prior written
101  * permission of the Apache Group.
102  *
103  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
104  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
105  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
106  * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
107  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
108  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
109  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
110  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
111  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
112  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
113  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
114  * SUCH DAMAGE.
115  * ====================================================================
116  *
117  * This software consists of voluntary contributions made by many
118  * individuals on behalf of the Apache Software Foundation. For more
119  * information on the Apache Software Foundation, please see
120  * <http://www.apache.org/>.
121  *
122  */

123
124 import java.util.Hashtable JavaDoc;
125
126
127 /**
128  * A regular expression compiler class. This class compiles a pattern string into a
129  * regular expression program interpretable by the RE evaluator class. The 'recompile'
130  * command line tool uses this compiler to pre-compile regular expressions for use
131  * with RE. For a description of the syntax accepted by RECompiler and what you can
132  * do with regular expressions, see the documentation for the RE matcher class.
133  *
134  * @author <a HREF="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
135  * @version $Id: RECompiler.java,v 1.7 2004/11/17 20:48:15 lhamel Exp $
136  * @see RE
137  * @deprecated since v5.6, use jakarta oro
138  */

139 public class RECompiler {
140
141     // The compiled program
142
char[] instruction; // The compiled RE 'program' instruction buffer
143
int lenInstruction; // The amount of the program buffer currently in use
144

145     // Input state for compiling regular expression
146
String JavaDoc pattern; // Input string
147
int len; // Length of the pattern string
148
int idx; // Current input index into ac
149
int parens; // Total number of paren pairs
150

151     // Node flags
152
static final int NODE_NORMAL = 0; // No flags (nothing special)
153
static final int NODE_NULLABLE = 1; // True if node is potentially null
154
static final int NODE_TOPLEVEL = 2; // True if top level expr
155

156     // Special types of 'escapes'
157
static final char ESC_MASK = 0xfff0; // Escape complexity mask
158
static final char ESC_BACKREF = 0xffff; // Escape is really a backreference
159
static final char ESC_COMPLEX = 0xfffe; // Escape isn't really a true character
160
static final char ESC_CLASS = 0xfffd; // Escape represents a whole class of characters
161

162     // {m,n} stacks
163
static final int maxBrackets = 10; // Maximum number of bracket pairs
164
static int brackets = 0; // Number of bracket sets
165
static int[] bracketStart = null; // Starting point
166
static int[] bracketEnd = null; // Ending point
167
static int[] bracketMin = null; // Minimum number of matches
168
static int[] bracketOpt = null; // Additional optional matches
169
static final int bracketUnbounded = -1; // Unbounded value
170
static final int bracketFinished = -2; // Unbounded value
171

172     // Lookup table for POSIX character class names
173
static Hashtable JavaDoc hashPOSIX = new Hashtable JavaDoc();
174
175     static {
176         hashPOSIX.put("alnum", new Character JavaDoc(RE.POSIX_CLASS_ALNUM));
177         hashPOSIX.put("alpha", new Character JavaDoc(RE.POSIX_CLASS_ALPHA));
178         hashPOSIX.put("blank", new Character JavaDoc(RE.POSIX_CLASS_BLANK));
179         hashPOSIX.put("cntrl", new Character JavaDoc(RE.POSIX_CLASS_CNTRL));
180         hashPOSIX.put("digit", new Character JavaDoc(RE.POSIX_CLASS_DIGIT));
181         hashPOSIX.put("graph", new Character JavaDoc(RE.POSIX_CLASS_GRAPH));
182         hashPOSIX.put("lower", new Character JavaDoc(RE.POSIX_CLASS_LOWER));
183         hashPOSIX.put("print", new Character JavaDoc(RE.POSIX_CLASS_PRINT));
184         hashPOSIX.put("punct", new Character JavaDoc(RE.POSIX_CLASS_PUNCT));
185         hashPOSIX.put("space", new Character JavaDoc(RE.POSIX_CLASS_SPACE));
186         hashPOSIX.put("upper", new Character JavaDoc(RE.POSIX_CLASS_UPPER));
187         hashPOSIX.put("xdigit", new Character JavaDoc(RE.POSIX_CLASS_XDIGIT));
188         hashPOSIX.put("javastart", new Character JavaDoc(RE.POSIX_CLASS_JSTART));
189         hashPOSIX.put("javapart", new Character JavaDoc(RE.POSIX_CLASS_JPART));
190     }
191
192     /**
193      * Local, nested class for maintaining character ranges for character classes.
194      */

195     class RERange {
196         int size = 16; // Capacity of current range arrays
197
int[] minRange = new int[size]; // Range minima
198
int[] maxRange = new int[size]; // Range maxima
199
int num = 0; // Number of range array elements in use
200

201         /**
202          * Deletes the range at a given index from the range lists
203          *
204          * @param index Index of range to delete from minRange and maxRange arrays.
205          */

206         void delete(int index) {
207
208             // Return if no elements left or index is out of range
209
if (num == 0 || index >= num) {
210                 return;
211             }
212             // Move elements down
213
while (index++ < num) {
214                 if (index - 1 >= 0) {
215                     minRange[index - 1] = minRange[index];
216                     maxRange[index - 1] = maxRange[index];
217                 }
218             }
219
220             // One less element now
221
num--;
222         }
223
224         /**
225          * Merges a range into the range list, coalescing ranges if possible.
226          *
227          * @param min Minimum end of range
228          * @param max Maximum end of range
229          */

230         void merge(int min, int max) {
231
232             // Loop through ranges
233
for (int i = 0; i < num; i++) {
234
235                 // Min-max is subsumed by minRange[i]-maxRange[i]
236
if (min >= minRange[i] && max <= maxRange[i]) {
237                     return;
238                 }
239
240                 // Min-max subsumes minRange[i]-maxRange[i]
241
else if (min <= minRange[i] && max >= maxRange[i]) {
242                     delete(i);
243                     merge(min, max);
244
245                     return;
246                 }
247
248                 // Min is in the range, but max is outside
249
else if (min >= minRange[i] && min <= maxRange[i]) {
250                     delete(i);
251                     min = minRange[i];
252                     merge(min, max);
253
254                     return;
255                 }
256
257                 // Max is in the range, but min is outside
258
else if (max >= minRange[i] && max <= maxRange[i]) {
259                     delete(i);
260                     max = maxRange[i];
261                     merge(min, max);
262
263                     return;
264                 }
265             }
266             // Must not overlap any other ranges
267
if (num >= size) {
268                 size *= 2;
269
270                 int[] newMin = new int[size];
271                 int[] newMax = new int[size];
272                 System.arraycopy(minRange, 0, newMin, 0, num);
273                 System.arraycopy(maxRange, 0, newMax, 0, num);
274                 minRange = newMin;
275                 maxRange = newMax;
276             }
277
278             minRange[num] = min;
279             maxRange[num] = max;
280             num++;
281         }
282
283         /**
284          * Removes a range by deleting or shrinking all other ranges
285          *
286          * @param min Minimum end of range
287          * @param max Maximum end of range
288          */

289         void remove(int min, int max) {
290
291             // Loop through ranges
292
for (int i = 0; i < num; i++) {
293
294                 // minRange[i]-maxRange[i] is subsumed by min-max
295
if (minRange[i] >= min && maxRange[i] <= max) {
296                     delete(i);
297                     i--;
298
299                     return;
300                 }
301
302                 // min-max is subsumed by minRange[i]-maxRange[i]
303
else if (min >= minRange[i] && max <= maxRange[i]) {
304                     int minr = minRange[i];
305                     int maxr = maxRange[i];
306                     delete(i);
307
308                     if (minr < min - 1) {
309                         merge(minr, min - 1);
310                     }
311                     if (max + 1 < maxr) {
312                         merge(max + 1, maxr);
313                     }
314
315                     return;
316                 }
317
318                 // minRange is in the range, but maxRange is outside
319
else if (minRange[i] >= min && minRange[i] <= max) {
320                     minRange[i] = max + 1;
321
322                     return;
323                 }
324
325                 // maxRange is in the range, but minRange is outside
326
else if (maxRange[i] >= min && maxRange[i] <= max) {
327                     maxRange[i] = min - 1;
328
329                     return;
330                 }
331             }
332         }
333
334         /**
335          * Includes (or excludes) the range from min to max, inclusive.
336          *
337          * @param min Minimum end of range
338          * @param max Maximum end of range
339          * @param include True if range should be included. False otherwise.
340          */

341         void include(int min, int max, boolean include) {
342             if (include) {
343                 merge(min, max);
344             } else {
345                 remove(min, max);
346             }
347         }
348
349         /**
350          * Includes a range with the same min and max
351          *
352          * @param minmax Minimum and maximum end of range (inclusive)
353          * @param include True if range should be included. False otherwise.
354          */

355         void include(char minmax, boolean include) {
356             include(minmax, minmax, include);
357         }
358     }
359
360     /**
361      * Constructor. Creates (initially empty) storage for a regular expression program.
362      */

363     public RECompiler() {
364
365         // Start off with a generous, yet reasonable, initial size
366
instruction = new char[128];
367         lenInstruction = 0;
368     }
369
370     /**
371      * Allocate storage for brackets only as needed
372      */

373     void allocBrackets() {
374
375         // Allocate bracket stacks if not already done
376
if (bracketStart == null) {
377
378             // Allocate storage
379
bracketStart = new int[maxBrackets];
380             bracketEnd = new int[maxBrackets];
381             bracketMin = new int[maxBrackets];
382             bracketOpt = new int[maxBrackets];
383
384             // Initialize to invalid values
385
for (int i = 0; i < maxBrackets; i++) {
386                 bracketStart[i] = bracketEnd[i] = bracketMin[i] = bracketOpt[i] = -1;
387             }
388         }
389     }
390
391     /**
392      * Absorb an atomic character string. This method is a little tricky because
393      * it can un-include the last character of string if a closure operator follows.
394      * This is correct because *+? have higher precedence than concatentation (thus
395      * ABC* means AB(C*) and NOT (ABC)*).
396      *
397      * @return Index of new atom node
398      * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
399      */

400     int atom()
401             throws RESyntaxException {
402
403         // Create a string node
404
int ret = node(RE.OP_ATOM, 0);
405
406         // Length of atom
407
int lenAtom = 0;
408
409 // Loop while we've got input
410
atomLoop:
411
412                 while (idx < len) {
413
414                     // Is there a next char?
415
if ((idx + 1) < len) {
416                         char c = pattern.charAt(idx + 1);
417
418                         // If the next 'char' is an escape, look past the whole escape
419
if (pattern.charAt(idx) == '\\') {
420                             int idxEscape = idx;
421                             escape();
422
423                             if (idx < len) {
424                                 c = pattern.charAt(idx);
425                             }
426
427                             idx = idxEscape;
428                         }
429                         // Switch on next char
430
switch (c) {
431                             case '{':
432                             case '?':
433                             case '*':
434                             case '+':
435
436                                 // If the next character is a closure operator and our atom is non-empty, the
437
// current character should bind to the closure operator rather than the atom
438
if (lenAtom != 0) {
439                                     break atomLoop;
440                                 }
441                         }
442                     }
443                     // Switch on current char
444
switch (pattern.charAt(idx)) {
445                         case ']':
446                         case '^':
447                         case '$':
448                         case '.':
449                         case '[':
450                         case '(':
451                         case ')':
452                         case '|':
453                             break atomLoop;
454
455                         case '{':
456                         case '?':
457                         case '*':
458                         case '+':
459
460                             // We should have an atom by now
461
if (lenAtom == 0) {
462
463                                 // No atom before closure
464
syntaxError("Missing operand to closure");
465                             }
466
467                             break atomLoop;
468
469                         case '\\':
470                             {
471
472                                 // Get the escaped character (advances input automatically)
473
int idxBeforeEscape = idx;
474                                 char c = escape();
475
476                                 // Check if it's a simple escape (as opposed to, say, a backreference)
477
if ((c & ESC_MASK) == ESC_MASK) {
478
479                                     // Not a simple escape, so backup to where we were before the escape.
480
idx = idxBeforeEscape;
481                                     break atomLoop;
482                                 }
483
484                                 // Add escaped char to atom
485
emit(c);
486                                 lenAtom++;
487                             }
488
489                             break;
490
491                         default:
492
493                             // Add normal character to atom
494
emit(pattern.charAt(idx++));
495                             lenAtom++;
496                             break;
497                     }
498                 }
499         // This "shouldn't" happen
500
if (lenAtom == 0) {
501             internalError();
502         }
503
504         // Emit the atom length into the program
505
instruction[ret + RE.offsetOpdata] = (char) lenAtom;
506
507         return ret;
508     }
509
510     /**
511      * Match bracket {m,n} expression put results in bracket member variables
512      *
513      * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
514      */

515     void bracket()
516             throws RESyntaxException {
517
518         // Current character must be a '{'
519
if (idx >= len || pattern.charAt(idx++) != '{') {
520             internalError();
521         }
522         // Next char must be a digit
523
if (idx >= len || !Character.isDigit(pattern.charAt(idx))) {
524             syntaxError("Expected digit");
525         }
526
527         // Get min ('m' of {m,n}) number
528
StringBuffer JavaDoc number = new StringBuffer JavaDoc();
529
530         while (idx < len && Character.isDigit(pattern.charAt(idx))) {
531             number.append(pattern.charAt(idx++));
532         }
533         try {
534             bracketMin[brackets] = Integer.parseInt(number.toString());
535         } catch (NumberFormatException JavaDoc e) {
536             syntaxError("Expected valid number");
537         }
538         // If out of input, fail
539
if (idx >= len) {
540             syntaxError("Expected comma or right bracket");
541         }
542         // If end of expr, optional limit is 0
543
if (pattern.charAt(idx) == '}') {
544             idx++;
545             bracketOpt[brackets] = 0;
546
547             return;
548         }
549         // Must have at least {m,} and maybe {m,n}.
550
if (idx >= len || pattern.charAt(idx++) != ',') {
551             syntaxError("Expected comma");
552         }
553         // If out of input, fail
554
if (idx >= len) {
555             syntaxError("Expected comma or right bracket");
556         }
557         // If {m,} max is unlimited
558
if (pattern.charAt(idx) == '}') {
559             idx++;
560             bracketOpt[brackets] = bracketUnbounded;
561
562             return;
563         }
564         // Next char must be a digit
565
if (idx >= len || !Character.isDigit(pattern.charAt(idx))) {
566             syntaxError("Expected digit");
567         }
568
569         // Get max number
570
number.setLength(0);
571
572         while (idx < len && Character.isDigit(pattern.charAt(idx))) {
573             number.append(pattern.charAt(idx++));
574         }
575         try {
576             bracketOpt[brackets] = Integer.parseInt(number.toString()) -
577                     bracketMin[brackets];
578         } catch (NumberFormatException JavaDoc e) {
579             syntaxError("Expected valid number");
580         }
581         // Optional repetitions must be > 0
582
if (bracketOpt[brackets] <= 0) {
583             syntaxError("Bad range");
584         }
585         // Must have close brace
586
if (idx >= len || pattern.charAt(idx++) != '}') {
587             syntaxError("Missing close brace");
588         }
589     }
590
591     /**
592      * Compile one branch of an or operator (implements concatenation)
593      *
594      * @param flags Flags passed by reference
595      * @return Pointer to branch node
596      * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
597      */

598     int branch(int[] flags)
599             throws RESyntaxException {
600
601         // Get each possibly closured piece and concat
602
int node;
603         int ret = node(RE.OP_BRANCH, 0);
604         int chain = -1;
605         int[] closureFlags = new int[1];
606         boolean nullable = true;
607
608         while (idx < len && pattern.charAt(idx) != '|' && pattern.charAt(idx) != ')') {
609
610             // Get new node
611
closureFlags[0] = NODE_NORMAL;
612             node = closure(closureFlags);
613
614             if (closureFlags[0] == NODE_NORMAL) {
615                 nullable = false;
616             }
617             // If there's a chain, append to the end
618
if (chain != -1) {
619                 setNextOfEnd(chain, node);
620             }
621
622             // Chain starts at current
623
chain = node;
624         }
625         // If we don't run loop, make a nothing node
626
if (chain == -1) {
627             node(RE.OP_NOTHING, 0);
628         }
629         // Set nullable flag for this branch
630
if (nullable) {
631             flags[0] |= NODE_NULLABLE;
632         }
633
634         return ret;
635     }
636
637     /**
638      * Compile a character class
639      *
640      * @return Index of class node
641      * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
642      */

643     int characterClass()
644             throws RESyntaxException {
645
646         // Check for bad calling or empty class
647
if (pattern.charAt(idx) != '[') {
648             internalError();
649         }
650         // Check for unterminated or empty class
651
if ((idx + 1) >= len || pattern.charAt(++idx) == ']') {
652             syntaxError("Empty or unterminated class");
653         }
654         // Check for POSIX character class
655
if (idx < len && pattern.charAt(idx) == ':') {
656
657             // Skip colon
658
idx++;
659
660             // POSIX character classes are denoted with lowercase ASCII strings
661
int idxStart = idx;
662
663             while (idx < len && pattern.charAt(idx) >= 'a' && pattern.charAt(idx) <= 'z') {
664                 idx++;
665             }
666             // Should be a ":]" to terminate the POSIX character class
667
if ((idx + 1) < len && pattern.charAt(idx) == ':' &&
668                     pattern.charAt(idx + 1) == ']') {
669
670                 // Get character class
671
String JavaDoc charClass = pattern.substring(idxStart, idx);
672
673                 // Select the POSIX class id
674
Character JavaDoc i = (Character JavaDoc) hashPOSIX.get(charClass);
675
676                 if (i != null) {
677
678                     // Move past colon and right bracket
679
idx += 2;
680
681                     // Return new POSIX character class node
682
return node(RE.OP_POSIXCLASS, i.charValue());
683                 }
684
685                 syntaxError("Invalid POSIX character class '" + charClass +
686                         "'");
687             }
688
689             syntaxError("Invalid POSIX character class syntax");
690         }
691
692         // Try to build a class. Create OP_ANYOF node
693
int ret = node(RE.OP_ANYOF, 0);
694
695         // Parse class declaration
696
char CHAR_INVALID = Character.MAX_VALUE;
697         char last = CHAR_INVALID;
698         char simpleChar = 0;
699         boolean include = true;
700         boolean definingRange = false;
701         int idxFirst = idx;
702         char rangeStart = Character.MIN_VALUE;
703         char rangeEnd;
704         RERange range = new RERange();
705
706         while (idx < len && pattern.charAt(idx) != ']') {
707             switchOnCharacter:
708
709                         // Switch on character
710
switch (pattern.charAt(idx)) {
711                             case '^':
712                                 include = !include;
713
714                                 if (idx == idxFirst) {
715                                     range.include(Character.MIN_VALUE, Character.MAX_VALUE,
716                                             true);
717                                 }
718
719                                 idx++;
720                                 continue;
721
722                             case '\\':
723                                 {
724
725                                     // Escape always advances the stream
726
char c;
727
728                                     switch (c = escape()) {
729                                         case ESC_COMPLEX:
730                                         case ESC_BACKREF:
731
732                                             // Word boundaries and backrefs not allowed in a character class!
733
syntaxError("Bad character class");
734
735                                         case ESC_CLASS:
736
737                                             // Classes can't be an endpoint of a range
738
if (definingRange) {
739                                                 syntaxError("Bad character class");
740                                             }
741                                             // Handle specific type of class (some are ok)
742
switch (pattern.charAt(idx - 1)) {
743                                                 case RE.E_NSPACE:
744                                                 case RE.E_NDIGIT:
745                                                 case RE.E_NALNUM:
746                                                     syntaxError("Bad character class");
747
748                                                 case RE.E_SPACE:
749                                                     range.include('\t', include);
750                                                     range.include('\r', include);
751                                                     range.include('\f', include);
752                                                     range.include('\n', include);
753                                                     range.include('\b', include);
754                                                     range.include(' ', include);
755                                                     break;
756
757                                                 case RE.E_ALNUM:
758                                                     range.include('a', 'z', include);
759                                                     range.include('A', 'Z', include);
760                                                     range.include('_', include);
761
762                                                     // Fall through!
763
case RE.E_DIGIT:
764                                                     range.include('0', '9', include);
765                                                     break;
766                                             }
767
768                                             // Make last char invalid (can't be a range start)
769
last = CHAR_INVALID;
770                                             break;
771
772                                         default:
773
774                                             // Escape is simple so treat as a simple char
775
simpleChar = c;
776                                             break switchOnCharacter;
777                                     }
778                                 }
779
780                                 continue;
781
782                             case '-':
783
784                                 // Start a range if one isn't already started
785
if (definingRange) {
786                                     syntaxError("Bad class range");
787                                 }
788
789                                 definingRange = true;
790
791                                 // If no last character, start of range is 0
792
rangeStart = (last == CHAR_INVALID ? 0 : last);
793
794                                 // Premature end of range. define up to Character.MAX_VALUE
795
if ((idx + 1) < len && pattern.charAt(++idx) == ']') {
796                                     simpleChar = Character.MAX_VALUE;
797                                     break;
798                                 }
799
800                                 continue;
801
802                             default:
803                                 simpleChar = pattern.charAt(idx++);
804                                 break;
805                         }
806             // Handle simple character simpleChar
807
if (definingRange) {
808
809                 // if we are defining a range make it now
810
rangeEnd = simpleChar;
811
812                 // Actually create a range if the range is ok
813
if (rangeStart >= rangeEnd) {
814                     syntaxError("Bad character class");
815                 }
816
817                 range.include(rangeStart, rangeEnd, include);
818
819                 // We are done defining the range
820
last = CHAR_INVALID;
821                 definingRange = false;
822             } else {
823
824                 // If simple character and not start of range, include it
825
if ((idx + 1) >= len || pattern.charAt(idx + 1) != '-') {
826                     range.include(simpleChar, include);
827                 }
828
829                 last = simpleChar;
830             }
831         }
832         // Shouldn't be out of input
833
if (idx == len) {
834             syntaxError("Unterminated character class");
835         }
836
837         // Absorb the ']' end of class marker
838
idx++;
839
840         // Emit character class definition
841
instruction[ret + RE.offsetOpdata] = (char) range.num;
842
843         for (int i = 0; i < range.num; i++) {
844             emit((char) range.minRange[i]);
845             emit((char) range.maxRange[i]);
846         }
847
848         return ret;
849     }
850
851     /**
852      * Compile a possibly closured terminal
853      *
854      * @param flags Flags passed by reference
855      * @return Index of closured node
856      * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
857      */

858     int closure(int[] flags)
859             throws RESyntaxException {
860
861         // Before terminal
862
int idxBeforeTerminal = idx;
863
864         // Values to pass by reference to terminal()
865
int[] terminalFlags = {NODE_NORMAL};
866
867         // Get terminal symbol
868
int ret = terminal(terminalFlags);
869
870         // Or in flags from terminal symbol
871
flags[0] |= terminalFlags[0];
872
873         // Advance input, set NODE_NULLABLE flag and do sanity checks
874
if (idx >= len) {
875             return ret;
876         }
877
878         boolean greedy = true;
879         char closureType = pattern.charAt(idx);
880
881         switch (closureType) {
882             case '?':
883             case '*':
884
885                 // The current node can be null
886
flags[0] |= NODE_NULLABLE;
887
888             case '+':
889
890                 // Eat closure character
891
idx++;
892
893             case '{':
894
895                 // Don't allow blantant stupidity
896
int opcode = instruction[ret + RE.offsetOpcode];
897
898                 if (opcode == RE.OP_BOL || opcode == RE.OP_EOL) {
899                     syntaxError("Bad closure operand");
900                 }
901                 if ((terminalFlags[0] & NODE_NULLABLE) != 0) {
902                     syntaxError("Closure operand can't be nullable");
903                 }
904
905                 break;
906         }
907         // If the next character is a '?', make the closure non-greedy (reluctant)
908
if (idx < len && pattern.charAt(idx) == '?') {
909             idx++;
910             greedy = false;
911         }
912         if (greedy) {
913
914             // Actually do the closure now
915
switch (closureType) {
916                 case '{':
917                     {
918
919                         // We look for our bracket in the list
920
boolean found = false;
921                         int i;
922                         allocBrackets();
923
924                         for (i = 0; i < brackets; i++) {
925                             if (bracketStart[i] == idx) {
926                                 found = true;
927                                 break;
928                             }
929                         }
930                         // If its not in the list we parse the {m,n}
931
if (!found) {
932                             if (brackets >= maxBrackets) {
933                                 syntaxError("Too many bracketed closures (limit is 10)");
934                             }
935
936                             bracketStart[brackets] = idx;
937                             bracket();
938                             bracketEnd[brackets] = idx;
939                             i = brackets++;
940                         }
941                         // If there's a min, rewind stream and reparse
942
if (--bracketMin[i] > 0) {
943
944                             // Rewind stream and run it through again
945
idx = idxBeforeTerminal;
946                             break;
947                         }
948                         // Do the right thing for maximum ({m,})
949
if (bracketOpt[i] == bracketFinished) {
950
951                             // Drop through now and closure expression.
952
// We are done with the {m,} expr, so skip rest
953
closureType = '*';
954                             bracketOpt[i] = 0;
955                             idx = bracketEnd[i];
956                         } else if (bracketOpt[i] == bracketUnbounded) {
957                             idx = idxBeforeTerminal;
958                             bracketOpt[i] = bracketFinished;
959                             break;
960                         } else if (bracketOpt[i]-- > 0) {
961
962                             // Drop through to optionally close and then 'play it again sam!'
963
idx = idxBeforeTerminal;
964                             closureType = '?';
965                         } else {
966
967                             // We are done. skip the rest of {m,n} expr
968
idx = bracketEnd[i];
969                             break;
970                         }
971                     }
972                     // Fall through!
973
case '?':
974                 case '*':
975                     if (!greedy) {
976                         break;
977                     }
978                     if (closureType == '?') {
979
980                         // X? is compiled as (X|)
981
nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X
982
setNextOfEnd(ret, node(RE.OP_BRANCH, 0)); // inserted branch to option
983

984                         int nothing = node(RE.OP_NOTHING, 0); // which is OP_NOTHING
985
setNextOfEnd(ret, nothing); // point (second) branch to OP_NOTHING
986
setNextOfEnd(ret + RE.nodeSize, nothing); // point the end of X to OP_NOTHING node
987
}
988                     if (closureType == '*') {
989
990                         // X* is compiled as (X{gotoX}|)
991
nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X
992
setNextOfEnd(ret + RE.nodeSize, node(RE.OP_BRANCH, 0)); // end of X points to an option
993
setNextOfEnd(ret + RE.nodeSize, node(RE.OP_GOTO, 0)); // to goto
994
setNextOfEnd(ret + RE.nodeSize, ret); // the start again
995
setNextOfEnd(ret, node(RE.OP_BRANCH, 0)); // the other option is
996
setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // OP_NOTHING
997
}
998
999                     break;
1000
1001                case '+':
1002                    {
1003
1004                        // X+ is compiled as X({gotoX}|)
1005
int branch;
1006                        branch = node(RE.OP_BRANCH, 0); // a new branch
1007
setNextOfEnd(ret, branch); // is added to the end of X
1008
setNextOfEnd(node(RE.OP_GOTO, 0), ret); // one option is to go back to the start
1009
setNextOfEnd(branch, node(RE.OP_BRANCH, 0)); // the other option
1010
setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // is OP_NOTHING
1011
}
1012
1013                    break;
1014            }
1015        } else {
1016
1017            // Add end after closured subexpr
1018
setNextOfEnd(ret, node(RE.OP_END, 0));
1019
1020            // Actually do the closure now
1021
switch (closureType) {
1022                case '?':
1023                    nodeInsert(RE.OP_RELUCTANTMAYBE, 0, ret);
1024                    break;
1025
1026                case '*':
1027                    nodeInsert(RE.OP_RELUCTANTSTAR, 0, ret);
1028                    break;
1029
1030                case '+':
1031                    nodeInsert(RE.OP_RELUCTANTPLUS, 0, ret);
1032                    break;
1033            }
1034
1035            // Point to the expr after the closure
1036
setNextOfEnd(ret, lenInstruction);
1037        }
1038
1039        return ret;
1040    }
1041
1042    /**
1043     * Compiles a regular expression pattern into a program runnable by the pattern
1044     * matcher class 'RE'.
1045     *
1046     * @param pattern Regular expression pattern to compile (see RECompiler class
1047     * for details).
1048     * @return A compiled regular expression program.
1049     * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
1050     * @see RECompiler
1051     * @see RE
1052     */

1053    public REProgram compile(String JavaDoc pattern)
1054            throws RESyntaxException {
1055
1056        // Initialize variables for compilation
1057
this.pattern = pattern; // Save pattern in instance variable
1058
len = pattern.length(); // Precompute pattern length for speed
1059
idx = 0; // Set parsing index to the first character
1060
lenInstruction = 0; // Set emitted instruction count to zero
1061
parens = 1; // Set paren level to 1 (the implicit outer parens)
1062
brackets = 0; // No bracketed closures yet
1063

1064        // Initialize pass by reference flags value
1065
int[] flags = {NODE_TOPLEVEL};
1066
1067        // Parse expression
1068
expr(flags);
1069
1070        // Should be at end of input
1071
if (idx != len) {
1072            if (pattern.charAt(idx) == ')') {
1073                syntaxError("Unmatched close paren");
1074            }
1075
1076            syntaxError("Unexpected input remains");
1077        }
1078
1079        // Return the result
1080
char[] ins = new char[lenInstruction];
1081        System.arraycopy(instruction, 0, ins, 0, lenInstruction);
1082
1083        return new REProgram(ins);
1084    }
1085
1086    /**
1087     * Emit a single character into the program stream.
1088     *
1089     * @param c Character to add
1090     */

1091    void emit(char c) {
1092
1093        // Make room for character
1094
ensure(1);
1095
1096        // Add character
1097
instruction[lenInstruction++] = c;
1098    }
1099
1100    /**
1101     * Ensures that n more characters can fit in the program buffer.
1102     * If n more can't fit, then the size is doubled until it can.
1103     *
1104     * @param n Number of additional characters to ensure will fit.
1105     */

1106    void ensure(int n) {
1107
1108        // Get current program length
1109
int curlen = instruction.length;
1110
1111        // If the current length + n more is too much
1112
if (lenInstruction + n >= curlen) {
1113
1114            // Double the size of the program array until n more will fit
1115
while (lenInstruction + n >= curlen) {
1116                curlen *= 2;
1117            }
1118
1119            // Allocate new program array and move data into it
1120
char[] newInstruction = new char[curlen];
1121            System.arraycopy(instruction, 0, newInstruction, 0, lenInstruction);
1122            instruction = newInstruction;
1123        }
1124    }
1125
1126    /**
1127     * Match an escape sequence. Handles quoted chars and octal escapes as well
1128     * as normal escape characters. Always advances the input stream by the
1129     * right amount. This code "understands" the subtle difference between an
1130     * octal escape and a backref. You can access the type of ESC_CLASS or
1131     * ESC_COMPLEX or ESC_BACKREF by looking at pattern[idx - 1].
1132     *
1133     * @return ESC_* code or character if simple escape
1134     * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
1135     */

1136    char escape()
1137            throws RESyntaxException {
1138
1139        // "Shouldn't" happen
1140
if (pattern.charAt(idx) != '\\') {
1141            internalError();
1142        }
1143        // Escape shouldn't occur as last character in string!
1144
if (idx + 1 == len) {
1145            syntaxError("Escape terminates string");
1146        }
1147
1148        // Switch on character after backslash
1149
idx += 2;
1150
1151        char escapeChar = pattern.charAt(idx - 1);
1152
1153        switch (escapeChar) {
1154            case RE.E_BOUND:
1155            case RE.E_NBOUND:
1156                return ESC_COMPLEX;
1157
1158            case RE.E_ALNUM:
1159            case RE.E_NALNUM:
1160            case RE.E_SPACE:
1161            case RE.E_NSPACE:
1162            case RE.E_DIGIT:
1163            case RE.E_NDIGIT:
1164                return ESC_CLASS;
1165
1166            case 'u':
1167            case 'x':
1168                {
1169
1170                    // Exact required hex digits for escape type
1171
int hexDigits = (escapeChar == 'u' ? 4 : 2);
1172
1173                    // Parse up to hexDigits characters from input
1174
int val = 0;
1175
1176                    for (; idx < len && hexDigits-- > 0; idx++) {
1177
1178                        // Get char
1179
char c = pattern.charAt(idx);
1180
1181                        // If it's a hexadecimal digit (0-9)
1182
if (c >= '0' && c <= '9') {
1183
1184                            // Compute new value
1185
val = (val << 4) + c - '0';
1186                        } else {
1187
1188                            // If it's a hexadecimal letter (a-f)
1189
c = Character.toLowerCase(c);
1190
1191                            if (c >= 'a' && c <= 'f') {
1192
1193                                // Compute new value
1194
val = (val << 4) + (c - 'a') + 10;
1195                            } else {
1196
1197                                // If it's not a valid digit or hex letter, the escape must be invalid
1198
// because hexDigits of input have not been absorbed yet.
1199
syntaxError("Expected " + hexDigits +
1200                                        " hexadecimal digits after \\" +
1201                                        escapeChar);
1202                            }
1203                        }
1204                    }
1205
1206                    return (char) val;
1207                }
1208            case 't':
1209                return '\t';
1210
1211            case 'n':
1212                return '\n';
1213
1214            case 'r':
1215                return '\r';
1216
1217            case 'f':
1218                return '\f';
1219
1220            case '0':
1221            case '1':
1222            case '2':
1223            case '3':
1224            case '4':
1225            case '5':
1226            case '6':
1227            case '7':
1228            case '8':
1229            case '9':
1230
1231                // An octal escape starts with a 0 or has two digits in a row
1232
if ((idx < len && Character.isDigit(pattern.charAt(idx))) ||
1233                        escapeChar == '0') {
1234
1235                    // Handle \nnn octal escapes
1236
int val = escapeChar - '0';
1237
1238                    if (idx < len && Character.isDigit(pattern.charAt(idx))) {
1239                        val = ((val << 3) + (pattern.charAt(idx++) - '0'));
1240
1241                        if (idx < len &&
1242                                Character.isDigit(pattern.charAt(idx))) {
1243                            val = ((val << 3) + (pattern.charAt(idx++) - '0'));
1244                        }
1245                    }
1246
1247                    return (char) val;
1248                }
1249
1250                // It's actually a backreference (\[1-9]), not an escape
1251
return ESC_BACKREF;
1252
1253            default:
1254
1255                // Simple quoting of a character
1256
return escapeChar;
1257        }
1258    }
1259
1260    /**
1261     * Compile an expression with possible parens around it. Paren matching
1262     * is done at this level so we can tie the branch tails together.
1263     *
1264     * @param flags Flag value passed by reference
1265     * @return Node index of expression in instruction array
1266     * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
1267     */

1268    int expr(int[] flags)
1269            throws RESyntaxException {
1270
1271        // Create open paren node unless we were called from the top level (which has no parens)
1272
boolean paren = false;
1273        int ret = -1;
1274        int closeParens = parens;
1275
1276        if ((flags[0] & NODE_TOPLEVEL) == 0 && pattern.charAt(idx) == '(') {
1277            idx++;
1278            paren = true;
1279            ret = node(RE.OP_OPEN, parens++);
1280        }
1281
1282        flags[0] &= ~NODE_TOPLEVEL;
1283
1284        // Create a branch node
1285
int branch = branch(flags);
1286
1287        if (ret == -1) {
1288            ret = branch;
1289        } else {
1290            setNextOfEnd(ret, branch);
1291        }
1292        // Loop through branches
1293
while (idx < len && pattern.charAt(idx) == '|') {
1294            idx++;
1295            branch = branch(flags);
1296            setNextOfEnd(ret, branch);
1297        }
1298
1299        // Create an ending node (either a close paren or an OP_END)
1300
int end;
1301
1302        if (paren) {
1303            if (idx < len && pattern.charAt(idx) == ')') {
1304                idx++;
1305            } else {
1306                syntaxError("Missing close paren");
1307            }
1308
1309            end = node(RE.OP_CLOSE, closeParens);
1310        } else {
1311            end = node(RE.OP_END, 0);
1312        }
1313
1314        // Append the ending node to the ret nodelist
1315
setNextOfEnd(ret, end);
1316
1317        // Hook the ends of each branch to the end node
1318
for (int next = -1, i = ret;
1319             next != 0;
1320             next = instruction[i + RE.offsetNext], i += next) {
1321
1322            // If branch, make the end of the branch's operand chain point to the end node.
1323
if (instruction[i + RE.offsetOpcode] == RE.OP_BRANCH) {
1324                setNextOfEnd(i + RE.nodeSize, end);
1325            }
1326        }
1327
1328        // Return the node list
1329
return ret;
1330    }
1331
1332    /**
1333     * Throws a new internal error exception
1334     *
1335     * @throws Error Thrown in the event of an internal error.
1336     */

1337    void internalError()
1338            throws Error JavaDoc {
1339        throw new Error JavaDoc("Internal error!");
1340    }
1341
1342    /**
1343     * Adds a new node
1344     *
1345     * @param opcode Opcode for node
1346     * @param opdata Opdata for node (only the low 16 bits are currently used)
1347     * @return Index of new node in program
1348     */

1349    int node(char opcode, int opdata) {
1350
1351        // Make room for a new node
1352
ensure(RE.nodeSize);
1353
1354        // Add new node at end
1355
instruction[lenInstruction + RE.offsetOpcode] = opcode;
1356        instruction[lenInstruction + RE.offsetOpdata] = (char) opdata;
1357        instruction[lenInstruction + RE.offsetNext] = 0;
1358        lenInstruction += RE.nodeSize;
1359
1360        // Return index of new node
1361
return lenInstruction - RE.nodeSize;
1362    }
1363
1364    /**
1365     * Inserts a node with a given opcode and opdata at insertAt. The node relative next
1366     * pointer is initialized to 0.
1367     *
1368     * @param opcode Opcode for new node
1369     * @param opdata Opdata for new node (only the low 16 bits are currently used)
1370     * @param insertAt Index at which to insert the new node in the program
1371     */

1372    void nodeInsert(char opcode, int opdata, int insertAt) {
1373
1374        // Make room for a new node
1375
ensure(RE.nodeSize);
1376
1377        // Move everything from insertAt to the end down nodeSize elements
1378
System.arraycopy(instruction, insertAt, instruction,
1379                insertAt + RE.nodeSize, lenInstruction - insertAt);
1380        instruction[insertAt + RE.offsetOpcode] = opcode;
1381        instruction[insertAt + RE.offsetOpdata] = (char) opdata;
1382        instruction[insertAt + RE.offsetNext] = 0;
1383        lenInstruction += RE.nodeSize;
1384    }
1385
1386    /**
1387     * Appends a node to the end of a node chain
1388     *
1389     * @param node Start of node chain to traverse
1390     * @param pointTo Node to have the tail of the chain point to
1391     */

1392    void setNextOfEnd(int node, int pointTo) {
1393
1394        // Traverse the chain until the next offset is 0
1395
int next;
1396
1397        while ((next = instruction[node + RE.offsetNext]) != 0) {
1398            node += next;
1399        }
1400
1401        // Point the last node in the chain to pointTo.
1402
instruction[node + RE.offsetNext] = (char) (short) (pointTo - node);
1403    }
1404
1405    /**
1406     * Throws a new syntax error exception
1407     *
1408     * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
1409     */

1410    void syntaxError(String JavaDoc s)
1411            throws RESyntaxException {
1412        throw new RESyntaxException(s);
1413    }
1414
1415    /**
1416     * Match a terminal node.
1417     *
1418     * @param flags Flags
1419     * @return Index of terminal node (closeable)
1420     * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
1421     */

1422    int terminal(int[] flags)
1423            throws RESyntaxException {
1424        switch (pattern.charAt(idx)) {
1425            case RE.OP_EOL:
1426            case RE.OP_BOL:
1427            case RE.OP_ANY:
1428                return node(pattern.charAt(idx++), 0);
1429
1430            case '[':
1431                return characterClass();
1432
1433            case '(':
1434                return expr(flags);
1435
1436            case ')':
1437                syntaxError("Unexpected close paren");
1438
1439            case '|':
1440                internalError();
1441
1442            case ']':
1443                syntaxError("Mismatched class");
1444
1445            case 0:
1446                syntaxError("Unexpected end of input");
1447
1448            case '?':
1449            case '+':
1450            case '{':
1451            case '*':
1452                syntaxError("Missing operand to closure");
1453
1454            case '\\':
1455                {
1456
1457                    // Don't forget, escape() advances the input stream!
1458
int idxBeforeEscape = idx;
1459
1460                    // Switch on escaped character
1461
switch (escape()) {
1462                        case ESC_CLASS:
1463                        case ESC_COMPLEX:
1464                            flags[0] &= ~NODE_NULLABLE;
1465
1466                            return node(RE.OP_ESCAPE, pattern.charAt(idx - 1));
1467
1468                        case ESC_BACKREF:
1469                            {
1470                                char backreference = (char) (pattern.charAt(idx - 1) -
1471                                        '0');
1472
1473                                if (parens <= backreference) {
1474                                    syntaxError("Bad backreference");
1475                                }
1476
1477                                flags[0] |= NODE_NULLABLE;
1478
1479                                return node(RE.OP_BACKREF, backreference);
1480                            }
1481                        default:
1482
1483                            // We had a simple escape and we want to have it end up in
1484
// an atom, so we back up and fall though to the default handling
1485
idx = idxBeforeEscape;
1486                            flags[0] &= ~NODE_NULLABLE;
1487                            break;
1488                    }
1489                }
1490        }
1491
1492        // Everything above either fails or returns.
1493
// If it wasn't one of the above, it must be the start of an atom.
1494
flags[0] &= ~NODE_NULLABLE;
1495
1496        return atom();
1497    }
1498}
Popular Tags