KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > sun > org > apache > regexp > internal > RECompiler


1 package com.sun.org.apache.regexp.internal;
2
3 /*
4  * ====================================================================
5  *
6  * The Apache Software License, Version 1.1
7  *
8  * Copyright (c) 1999 The Apache Software Foundation. All rights
9  * reserved.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  *
15  * 1. Redistributions of source code must retain the above copyright
16  * notice, this list of conditions and the following disclaimer.
17  *
18  * 2. Redistributions in binary form must reproduce the above copyright
19  * notice, this list of conditions and the following disclaimer in
20  * the documentation and/or other materials provided with the
21  * distribution.
22  *
23  * 3. The end-user documentation included with the redistribution, if
24  * any, must include the following acknowlegement:
25  * "This product includes software developed by the
26  * Apache Software Foundation (http://www.apache.org/)."
27  * Alternately, this acknowlegement may appear in the software itself,
28  * if and wherever such third-party acknowlegements normally appear.
29  *
30  * 4. The names "The Jakarta Project", "Jakarta-Regexp", and "Apache Software
31  * Foundation" must not be used to endorse or promote products derived
32  * from this software without prior written permission. For written
33  * permission, please contact apache@apache.org.
34  *
35  * 5. Products derived from this software may not be called "Apache"
36  * nor may "Apache" appear in their names without prior written
37  * permission of the Apache Group.
38  *
39  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
40  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
41  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
42  * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
43  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
45  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
46  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
47  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
48  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
49  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50  * SUCH DAMAGE.
51  * ====================================================================
52  *
53  * This software consists of voluntary contributions made by many
54  * individuals on behalf of the Apache Software Foundation. For more
55  * information on the Apache Software Foundation, please see
56  * <http://www.apache.org/>.
57  *
58  */

59
60 import com.sun.org.apache.regexp.internal.RE;
61 import java.util.Hashtable JavaDoc;
62
63 /**
64  * A regular expression compiler class. This class compiles a pattern string into a
65  * regular expression program interpretable by the RE evaluator class. The 'recompile'
66  * command line tool uses this compiler to pre-compile regular expressions for use
67  * with RE. For a description of the syntax accepted by RECompiler and what you can
68  * do with regular expressions, see the documentation for the RE matcher class.
69  *
70  * @see RE
71  * @see recompile
72  *
73  * @author <a HREF="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
74  * @version $Id: RECompiler.java,v 1.2 2000/05/14 21:04:17 jon Exp $
75  */

76 public class RECompiler
77 {
78     // The compiled program
79
char[] instruction; // The compiled RE 'program' instruction buffer
80
int lenInstruction; // The amount of the program buffer currently in use
81

82     // Input state for compiling regular expression
83
String JavaDoc pattern; // Input string
84
int len; // Length of the pattern string
85
int idx; // Current input index into ac
86
int parens; // Total number of paren pairs
87

88     // Node flags
89
static final int NODE_NORMAL = 0; // No flags (nothing special)
90
static final int NODE_NULLABLE = 1; // True if node is potentially null
91
static final int NODE_TOPLEVEL = 2; // True if top level expr
92

93     // Special types of 'escapes'
94
static final char ESC_MASK = 0xfff0; // Escape complexity mask
95
static final char ESC_BACKREF = 0xffff; // Escape is really a backreference
96
static final char ESC_COMPLEX = 0xfffe; // Escape isn't really a true character
97
static final char ESC_CLASS = 0xfffd; // Escape represents a whole class of characters
98

99     // {m,n} stacks
100
static final int maxBrackets = 10; // Maximum number of bracket pairs
101
static int brackets = 0; // Number of bracket sets
102
static int[] bracketStart = null; // Starting point
103
static int[] bracketEnd = null; // Ending point
104
static int[] bracketMin = null; // Minimum number of matches
105
static int[] bracketOpt = null; // Additional optional matches
106
static final int bracketUnbounded = -1; // Unbounded value
107
static final int bracketFinished = -2; // Unbounded value
108

109     // Lookup table for POSIX character class names
110
static Hashtable JavaDoc hashPOSIX = new Hashtable JavaDoc();
111     static
112     {
113         hashPOSIX.put("alnum", new Character JavaDoc(RE.POSIX_CLASS_ALNUM));
114         hashPOSIX.put("alpha", new Character JavaDoc(RE.POSIX_CLASS_ALPHA));
115         hashPOSIX.put("blank", new Character JavaDoc(RE.POSIX_CLASS_BLANK));
116         hashPOSIX.put("cntrl", new Character JavaDoc(RE.POSIX_CLASS_CNTRL));
117         hashPOSIX.put("digit", new Character JavaDoc(RE.POSIX_CLASS_DIGIT));
118         hashPOSIX.put("graph", new Character JavaDoc(RE.POSIX_CLASS_GRAPH));
119         hashPOSIX.put("lower", new Character JavaDoc(RE.POSIX_CLASS_LOWER));
120         hashPOSIX.put("print", new Character JavaDoc(RE.POSIX_CLASS_PRINT));
121         hashPOSIX.put("punct", new Character JavaDoc(RE.POSIX_CLASS_PUNCT));
122         hashPOSIX.put("space", new Character JavaDoc(RE.POSIX_CLASS_SPACE));
123         hashPOSIX.put("upper", new Character JavaDoc(RE.POSIX_CLASS_UPPER));
124         hashPOSIX.put("xdigit", new Character JavaDoc(RE.POSIX_CLASS_XDIGIT));
125         hashPOSIX.put("javastart", new Character JavaDoc(RE.POSIX_CLASS_JSTART));
126         hashPOSIX.put("javapart", new Character JavaDoc(RE.POSIX_CLASS_JPART));
127     }
128
129     /**
130      * Constructor. Creates (initially empty) storage for a regular expression program.
131      */

132     public RECompiler()
133     {
134         // Start off with a generous, yet reasonable, initial size
135
instruction = new char[128];
136         lenInstruction = 0;
137     }
138
139     /**
140      * Ensures that n more characters can fit in the program buffer.
141      * If n more can't fit, then the size is doubled until it can.
142      * @param n Number of additional characters to ensure will fit.
143      */

144     void ensure(int n)
145     {
146         // Get current program length
147
int curlen = instruction.length;
148
149         // If the current length + n more is too much
150
if (lenInstruction + n >= curlen)
151         {
152             // Double the size of the program array until n more will fit
153
while (lenInstruction + n >= curlen)
154             {
155                 curlen *= 2;
156             }
157
158             // Allocate new program array and move data into it
159
char[] newInstruction = new char[curlen];
160             System.arraycopy(instruction, 0, newInstruction, 0, lenInstruction);
161             instruction = newInstruction;
162         }
163     }
164
165     /**
166      * Emit a single character into the program stream.
167      * @param c Character to add
168      */

169     void emit(char c)
170     {
171         // Make room for character
172
ensure(1);
173
174         // Add character
175
instruction[lenInstruction++] = c;
176     }
177
178     /**
179      * Inserts a node with a given opcode and opdata at insertAt. The node relative next
180      * pointer is initialized to 0.
181      * @param opcode Opcode for new node
182      * @param opdata Opdata for new node (only the low 16 bits are currently used)
183      * @param insertAt Index at which to insert the new node in the program
184      */

185     void nodeInsert(char opcode, int opdata, int insertAt)
186     {
187         // Make room for a new node
188
ensure(RE.nodeSize);
189
190         // Move everything from insertAt to the end down nodeSize elements
191
System.arraycopy(instruction, insertAt, instruction, insertAt + RE.nodeSize, lenInstruction - insertAt);
192         instruction[insertAt + RE.offsetOpcode] = opcode;
193         instruction[insertAt + RE.offsetOpdata] = (char)opdata;
194         instruction[insertAt + RE.offsetNext] = 0;
195         lenInstruction += RE.nodeSize;
196     }
197
198     /**
199      * Appends a node to the end of a node chain
200      * @param node Start of node chain to traverse
201      * @param pointTo Node to have the tail of the chain point to
202      */

203     void setNextOfEnd(int node, int pointTo)
204     {
205         // Traverse the chain until the next offset is 0
206
int next;
207         while ((next = instruction[node + RE.offsetNext]) != 0)
208         {
209             node += next;
210         }
211
212         // Point the last node in the chain to pointTo.
213
instruction[node + RE.offsetNext] = (char)(short)(pointTo - node);
214     }
215
216     /**
217      * Adds a new node
218      * @param opcode Opcode for node
219      * @param opdata Opdata for node (only the low 16 bits are currently used)
220      * @return Index of new node in program
221      */

222     int node(char opcode, int opdata)
223     {
224         // Make room for a new node
225
ensure(RE.nodeSize);
226
227         // Add new node at end
228
instruction[lenInstruction + RE.offsetOpcode] = opcode;
229         instruction[lenInstruction + RE.offsetOpdata] = (char)opdata;
230         instruction[lenInstruction + RE.offsetNext] = 0;
231         lenInstruction += RE.nodeSize;
232
233         // Return index of new node
234
return lenInstruction - RE.nodeSize;
235     }
236
237
238     /**
239      * Throws a new internal error exception
240      * @exception Error Thrown in the event of an internal error.
241      */

242     void internalError() throws Error JavaDoc
243     {
244         throw new Error JavaDoc("Internal error!");
245     }
246
247     /**
248      * Throws a new syntax error exception
249      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
250      */

251     void syntaxError(String JavaDoc s) throws RESyntaxException
252     {
253         throw new RESyntaxException(s);
254     }
255
256     /**
257      * Allocate storage for brackets only as needed
258      */

259     void allocBrackets()
260     {
261         // Allocate bracket stacks if not already done
262
if (bracketStart == null)
263         {
264             // Allocate storage
265
bracketStart = new int[maxBrackets];
266             bracketEnd = new int[maxBrackets];
267             bracketMin = new int[maxBrackets];
268             bracketOpt = new int[maxBrackets];
269
270             // Initialize to invalid values
271
for (int i = 0; i < maxBrackets; i++)
272             {
273                 bracketStart[i] = bracketEnd[i] = bracketMin[i] = bracketOpt[i] = -1;
274             }
275         }
276     }
277
278     /**
279      * Match bracket {m,n} expression put results in bracket member variables
280      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
281      */

282     void bracket() throws RESyntaxException
283     {
284         // Current character must be a '{'
285
if (idx >= len || pattern.charAt(idx++) != '{')
286         {
287             internalError();
288         }
289
290         // Next char must be a digit
291
if (idx >= len || !Character.isDigit(pattern.charAt(idx)))
292         {
293             syntaxError("Expected digit");
294         }
295
296         // Get min ('m' of {m,n}) number
297
StringBuffer JavaDoc number = new StringBuffer JavaDoc();
298         while (idx < len && Character.isDigit(pattern.charAt(idx)))
299         {
300             number.append(pattern.charAt(idx++));
301         }
302         try
303         {
304             bracketMin[brackets] = Integer.parseInt(number.toString());
305         }
306         catch (NumberFormatException JavaDoc e)
307         {
308             syntaxError("Expected valid number");
309         }
310
311         // If out of input, fail
312
if (idx >= len)
313         {
314             syntaxError("Expected comma or right bracket");
315         }
316
317         // If end of expr, optional limit is 0
318
if (pattern.charAt(idx) == '}')
319         {
320             idx++;
321             bracketOpt[brackets] = 0;
322             return;
323         }
324
325         // Must have at least {m,} and maybe {m,n}.
326
if (idx >= len || pattern.charAt(idx++) != ',')
327         {
328             syntaxError("Expected comma");
329         }
330
331         // If out of input, fail
332
if (idx >= len)
333         {
334             syntaxError("Expected comma or right bracket");
335         }
336
337         // If {m,} max is unlimited
338
if (pattern.charAt(idx) == '}')
339         {
340             idx++;
341             bracketOpt[brackets] = bracketUnbounded;
342             return;
343         }
344
345         // Next char must be a digit
346
if (idx >= len || !Character.isDigit(pattern.charAt(idx)))
347         {
348             syntaxError("Expected digit");
349         }
350
351         // Get max number
352
number.setLength(0);
353         while (idx < len && Character.isDigit(pattern.charAt(idx)))
354         {
355             number.append(pattern.charAt(idx++));
356         }
357         try
358         {
359             bracketOpt[brackets] = Integer.parseInt(number.toString()) - bracketMin[brackets];
360         }
361         catch (NumberFormatException JavaDoc e)
362         {
363             syntaxError("Expected valid number");
364         }
365
366         // Optional repetitions must be > 0
367
if (bracketOpt[brackets] <= 0)
368         {
369             syntaxError("Bad range");
370         }
371
372         // Must have close brace
373
if (idx >= len || pattern.charAt(idx++) != '}')
374         {
375             syntaxError("Missing close brace");
376         }
377     }
378
379     /**
380      * Match an escape sequence. Handles quoted chars and octal escapes as well
381      * as normal escape characters. Always advances the input stream by the
382      * right amount. This code "understands" the subtle difference between an
383      * octal escape and a backref. You can access the type of ESC_CLASS or
384      * ESC_COMPLEX or ESC_BACKREF by looking at pattern[idx - 1].
385      * @return ESC_* code or character if simple escape
386      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
387      */

388     char escape() throws RESyntaxException
389     {
390         // "Shouldn't" happen
391
if (pattern.charAt(idx) != '\\')
392         {
393             internalError();
394         }
395
396         // Escape shouldn't occur as last character in string!
397
if (idx + 1 == len)
398         {
399             syntaxError("Escape terminates string");
400         }
401
402         // Switch on character after backslash
403
idx += 2;
404         char escapeChar = pattern.charAt(idx - 1);
405         switch (escapeChar)
406         {
407             case RE.E_BOUND:
408             case RE.E_NBOUND:
409                 return ESC_COMPLEX;
410
411             case RE.E_ALNUM:
412             case RE.E_NALNUM:
413             case RE.E_SPACE:
414             case RE.E_NSPACE:
415             case RE.E_DIGIT:
416             case RE.E_NDIGIT:
417                 return ESC_CLASS;
418
419             case 'u':
420             case 'x':
421                 {
422                     // Exact required hex digits for escape type
423
int hexDigits = (escapeChar == 'u' ? 4 : 2);
424
425                     // Parse up to hexDigits characters from input
426
int val = 0;
427                     for ( ; idx < len && hexDigits-- > 0; idx++)
428                     {
429                         // Get char
430
char c = pattern.charAt(idx);
431
432                         // If it's a hexadecimal digit (0-9)
433
if (c >= '0' && c <= '9')
434                         {
435                             // Compute new value
436
val = (val << 4) + c - '0';
437                         }
438                         else
439                         {
440                             // If it's a hexadecimal letter (a-f)
441
c = Character.toLowerCase(c);
442                             if (c >= 'a' && c <= 'f')
443                             {
444                                 // Compute new value
445
val = (val << 4) + (c - 'a') + 10;
446                             }
447                             else
448                             {
449                                 // If it's not a valid digit or hex letter, the escape must be invalid
450
// because hexDigits of input have not been absorbed yet.
451
syntaxError("Expected " + hexDigits + " hexadecimal digits after \\" + escapeChar);
452                             }
453                         }
454                     }
455                     return (char)val;
456                 }
457
458             case 't':
459                 return '\t';
460
461             case 'n':
462                 return '\n';
463
464             case 'r':
465                 return '\r';
466
467             case 'f':
468                 return '\f';
469
470             case '0':
471             case '1':
472             case '2':
473             case '3':
474             case '4':
475             case '5':
476             case '6':
477             case '7':
478             case '8':
479             case '9':
480
481                 // An octal escape starts with a 0 or has two digits in a row
482
if ((idx < len && Character.isDigit(pattern.charAt(idx))) || escapeChar == '0')
483                 {
484                     // Handle \nnn octal escapes
485
int val = escapeChar - '0';
486                     if (idx < len && Character.isDigit(pattern.charAt(idx)))
487                     {
488                         val = ((val << 3) + (pattern.charAt(idx++) - '0'));
489                         if (idx < len && Character.isDigit(pattern.charAt(idx)))
490                         {
491                             val = ((val << 3) + (pattern.charAt(idx++) - '0'));
492                         }
493                     }
494                     return (char)val;
495                 }
496
497                 // It's actually a backreference (\[1-9]), not an escape
498
return ESC_BACKREF;
499
500             default:
501
502                 // Simple quoting of a character
503
return escapeChar;
504         }
505     }
506
507     /**
508      * Compile a character class
509      * @return Index of class node
510      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
511      */

512     int characterClass() throws RESyntaxException
513     {
514         // Check for bad calling or empty class
515
if (pattern.charAt(idx) != '[')
516         {
517             internalError();
518         }
519
520         // Check for unterminated or empty class
521
if ((idx + 1) >= len || pattern.charAt(++idx) == ']')
522         {
523             syntaxError("Empty or unterminated class");
524         }
525
526         // Check for POSIX character class
527
if (idx < len && pattern.charAt(idx) == ':')
528         {
529             // Skip colon
530
idx++;
531
532             // POSIX character classes are denoted with lowercase ASCII strings
533
int idxStart = idx;
534             while (idx < len && pattern.charAt(idx) >= 'a' && pattern.charAt(idx) <= 'z')
535             {
536                 idx++;
537             }
538             
539             // Should be a ":]" to terminate the POSIX character class
540
if ((idx + 1) < len && pattern.charAt(idx) == ':' && pattern.charAt(idx + 1) == ']')
541             {
542                 // Get character class
543
String JavaDoc charClass = pattern.substring(idxStart, idx);
544
545                 // Select the POSIX class id
546
Character JavaDoc i = (Character JavaDoc)hashPOSIX.get(charClass);
547                 if (i != null)
548                 {
549                     // Move past colon and right bracket
550
idx += 2;
551
552                     // Return new POSIX character class node
553
return node(RE.OP_POSIXCLASS, i.charValue());
554                 }
555                 syntaxError("Invalid POSIX character class '" + charClass + "'");
556             }
557             syntaxError("Invalid POSIX character class syntax");
558         }
559
560         // Try to build a class. Create OP_ANYOF node
561
int ret = node(RE.OP_ANYOF, 0);
562
563         // Parse class declaration
564
char CHAR_INVALID = Character.MAX_VALUE;
565         char last = CHAR_INVALID;
566         char simpleChar = 0;
567         boolean include = true;
568         boolean definingRange = false;
569         int idxFirst = idx;
570         char rangeStart = Character.MIN_VALUE;
571         char rangeEnd;
572         RERange range = new RERange();
573         while (idx < len && pattern.charAt(idx) != ']')
574         {
575
576             switchOnCharacter:
577
578             // Switch on character
579
switch (pattern.charAt(idx))
580             {
581                 case '^':
582                     include = !include;
583                     if (idx == idxFirst)
584                     {
585                         range.include(Character.MIN_VALUE, Character.MAX_VALUE, true);
586                     }
587                     idx++;
588                     continue;
589
590                 case '\\':
591                 {
592                     // Escape always advances the stream
593
char c;
594                     switch (c = escape ())
595                     {
596                         case ESC_COMPLEX:
597                         case ESC_BACKREF:
598
599                             // Word boundaries and backrefs not allowed in a character class!
600
syntaxError("Bad character class");
601
602                         case ESC_CLASS:
603
604                             // Classes can't be an endpoint of a range
605
if (definingRange)
606                             {
607                                 syntaxError("Bad character class");
608                             }
609
610                             // Handle specific type of class (some are ok)
611
switch (pattern.charAt(idx - 1))
612                             {
613                                 case RE.E_NSPACE:
614                                 case RE.E_NDIGIT:
615                                 case RE.E_NALNUM:
616                                     syntaxError("Bad character class");
617
618                                 case RE.E_SPACE:
619                                     range.include('\t', include);
620                                     range.include('\r', include);
621                                     range.include('\f', include);
622                                     range.include('\n', include);
623                                     range.include('\b', include);
624                                     range.include(' ', include);
625                                     break;
626
627                                 case RE.E_ALNUM:
628                                     range.include('a', 'z', include);
629                                     range.include('A', 'Z', include);
630                                     range.include('_', include);
631
632                                     // Fall through!
633

634                                 case RE.E_DIGIT:
635                                     range.include('0', '9', include);
636                                     break;
637                             }
638
639                             // Make last char invalid (can't be a range start)
640
last = CHAR_INVALID;
641                             break;
642
643                         default:
644
645                             // Escape is simple so treat as a simple char
646
simpleChar = c;
647                             break switchOnCharacter;
648                     }
649                 }
650                 continue;
651
652                 case '-':
653
654                     // Start a range if one isn't already started
655
if (definingRange)
656                     {
657                         syntaxError("Bad class range");
658                     }
659                     definingRange = true;
660
661                     // If no last character, start of range is 0
662
rangeStart = (last == CHAR_INVALID ? 0 : last);
663
664                     // Premature end of range. define up to Character.MAX_VALUE
665
if ((idx + 1) < len && pattern.charAt(++idx) == ']')
666                     {
667                         simpleChar = Character.MAX_VALUE;
668                         break;
669                     }
670                     continue;
671
672                 default:
673                     simpleChar = pattern.charAt(idx++);
674                     break;
675             }
676
677             // Handle simple character simpleChar
678
if (definingRange)
679             {
680                 // if we are defining a range make it now
681
rangeEnd = simpleChar;
682
683                 // Actually create a range if the range is ok
684
if (rangeStart >= rangeEnd)
685                 {
686                     syntaxError("Bad character class");
687                 }
688                 range.include(rangeStart, rangeEnd, include);
689
690                 // We are done defining the range
691
last = CHAR_INVALID;
692                 definingRange = false;
693             }
694             else
695             {
696                 // If simple character and not start of range, include it
697
if ((idx + 1) >= len || pattern.charAt(idx + 1) != '-')
698                 {
699                     range.include(simpleChar, include);
700                 }
701                 last = simpleChar;
702             }
703         }
704
705         // Shouldn't be out of input
706
if (idx == len)
707         {
708             syntaxError("Unterminated character class");
709         }
710
711         // Absorb the ']' end of class marker
712
idx++;
713
714         // Emit character class definition
715
instruction[ret + RE.offsetOpdata] = (char)range.num;
716         for (int i = 0; i < range.num; i++)
717         {
718             emit((char)range.minRange[i]);
719             emit((char)range.maxRange[i]);
720         }
721         return ret;
722     }
723
724     /**
725      * Absorb an atomic character string. This method is a little tricky because
726      * it can un-include the last character of string if a closure operator follows.
727      * This is correct because *+? have higher precedence than concatentation (thus
728      * ABC* means AB(C*) and NOT (ABC)*).
729      * @return Index of new atom node
730      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
731      */

732     int atom() throws RESyntaxException
733     {
734         // Create a string node
735
int ret = node(RE.OP_ATOM, 0);
736
737         // Length of atom
738
int lenAtom = 0;
739
740         // Loop while we've got input
741

742         atomLoop:
743
744         while (idx < len)
745         {
746             // Is there a next char?
747
if ((idx + 1) < len)
748             {
749                 char c = pattern.charAt(idx + 1);
750
751                 // If the next 'char' is an escape, look past the whole escape
752
if (pattern.charAt(idx) == '\\')
753                 {
754                     int idxEscape = idx;
755                     escape();
756                     if (idx < len)
757                     {
758                         c = pattern.charAt(idx);
759                     }
760                     idx = idxEscape;
761                 }
762
763                 // Switch on next char
764
switch (c)
765                 {
766                     case '{':
767                     case '?':
768                     case '*':
769                     case '+':
770
771                         // If the next character is a closure operator and our atom is non-empty, the
772
// current character should bind to the closure operator rather than the atom
773
if (lenAtom != 0)
774                         {
775                             break atomLoop;
776                         }
777                 }
778             }
779
780             // Switch on current char
781
switch (pattern.charAt(idx))
782             {
783                 case ']':
784                 case '^':
785                 case '$':
786                 case '.':
787                 case '[':
788                 case '(':
789                 case ')':
790                 case '|':
791                     break atomLoop;
792
793                 case '{':
794                 case '?':
795                 case '*':
796                 case '+':
797
798                     // We should have an atom by now
799
if (lenAtom == 0)
800                     {
801                         // No atom before closure
802
syntaxError("Missing operand to closure");
803                     }
804                     break atomLoop;
805
806                 case '\\':
807
808                     {
809                         // Get the escaped character (advances input automatically)
810
int idxBeforeEscape = idx;
811                         char c = escape();
812
813                         // Check if it's a simple escape (as opposed to, say, a backreference)
814
if ((c & ESC_MASK) == ESC_MASK)
815                         {
816                             // Not a simple escape, so backup to where we were before the escape.
817
idx = idxBeforeEscape;
818                             break atomLoop;
819                         }
820
821                         // Add escaped char to atom
822
emit(c);
823                         lenAtom++;
824                     }
825                     break;
826
827                 default:
828
829                     // Add normal character to atom
830
emit(pattern.charAt(idx++));
831                     lenAtom++;
832                     break;
833             }
834         }
835
836         // This "shouldn't" happen
837
if (lenAtom == 0)
838         {
839             internalError();
840         }
841
842         // Emit the atom length into the program
843
instruction[ret + RE.offsetOpdata] = (char)lenAtom;
844         return ret;
845     }
846
847     /**
848      * Match a terminal node.
849      * @param flags Flags
850      * @return Index of terminal node (closeable)
851      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
852      */

853     int terminal(int[] flags) throws RESyntaxException
854     {
855         switch (pattern.charAt(idx))
856         {
857         case RE.OP_EOL:
858         case RE.OP_BOL:
859         case RE.OP_ANY:
860             return node(pattern.charAt(idx++), 0);
861
862         case '[':
863             return characterClass();
864
865         case '(':
866             return expr(flags);
867
868         case ')':
869             syntaxError("Unexpected close paren");
870
871         case '|':
872             internalError();
873
874         case ']':
875             syntaxError("Mismatched class");
876
877         case 0:
878             syntaxError("Unexpected end of input");
879
880         case '?':
881         case '+':
882         case '{':
883         case '*':
884             syntaxError("Missing operand to closure");
885
886         case '\\':
887             {
888                 // Don't forget, escape() advances the input stream!
889
int idxBeforeEscape = idx;
890
891                 // Switch on escaped character
892
switch (escape())
893                 {
894                     case ESC_CLASS:
895                     case ESC_COMPLEX:
896                         flags[0] &= ~NODE_NULLABLE;
897                         return node(RE.OP_ESCAPE, pattern.charAt(idx - 1));
898
899                     case ESC_BACKREF:
900                         {
901                             char backreference = (char)(pattern.charAt(idx - 1) - '0');
902                             if (parens <= backreference)
903                             {
904                                 syntaxError("Bad backreference");
905                             }
906                             flags[0] |= NODE_NULLABLE;
907                             return node(RE.OP_BACKREF, backreference);
908                         }
909
910                     default:
911
912                         // We had a simple escape and we want to have it end up in
913
// an atom, so we back up and fall though to the default handling
914
idx = idxBeforeEscape;
915                         flags[0] &= ~NODE_NULLABLE;
916                         break;
917                 }
918             }
919         }
920
921         // Everything above either fails or returns.
922
// If it wasn't one of the above, it must be the start of an atom.
923
flags[0] &= ~NODE_NULLABLE;
924         return atom();
925     }
926
927     /**
928      * Compile a possibly closured terminal
929      * @param flags Flags passed by reference
930      * @return Index of closured node
931      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
932      */

933     int closure(int[] flags) throws RESyntaxException
934     {
935         // Before terminal
936
int idxBeforeTerminal = idx;
937
938         // Values to pass by reference to terminal()
939
int[] terminalFlags = { NODE_NORMAL };
940
941         // Get terminal symbol
942
int ret = terminal(terminalFlags);
943
944         // Or in flags from terminal symbol
945
flags[0] |= terminalFlags[0];
946
947         // Advance input, set NODE_NULLABLE flag and do sanity checks
948
if (idx >= len)
949         {
950             return ret;
951         }
952         boolean greedy = true;
953         char closureType = pattern.charAt(idx);
954         switch (closureType)
955         {
956             case '?':
957             case '*':
958
959                 // The current node can be null
960
flags[0] |= NODE_NULLABLE;
961
962             case '+':
963
964                 // Eat closure character
965
idx++;
966
967             case '{':
968
969                 // Don't allow blantant stupidity
970
int opcode = instruction[ret + RE.offsetOpcode];
971                 if (opcode == RE.OP_BOL || opcode == RE.OP_EOL)
972                 {
973                     syntaxError("Bad closure operand");
974                 }
975                 if ((terminalFlags[0] & NODE_NULLABLE) != 0)
976                 {
977                     syntaxError("Closure operand can't be nullable");
978                 }
979                 break;
980         }
981
982         // If the next character is a '?', make the closure non-greedy (reluctant)
983
if (idx < len && pattern.charAt(idx) == '?')
984         {
985             idx++;
986             greedy = false;
987         }
988
989         if (greedy)
990         {
991             // Actually do the closure now
992
switch (closureType)
993             {
994                 case '{':
995                 {
996                     // We look for our bracket in the list
997
boolean found = false;
998                     int i;
999                     allocBrackets();
1000                    for (i = 0; i < brackets; i++)
1001                    {
1002                        if (bracketStart[i] == idx)
1003                        {
1004                            found = true;
1005                            break;
1006                        }
1007                    }
1008                    
1009                    // If its not in the list we parse the {m,n}
1010
if (!found)
1011                    {
1012                        if (brackets >= maxBrackets)
1013                        {
1014                            syntaxError("Too many bracketed closures (limit is 10)");
1015                        }
1016                        bracketStart[brackets] = idx;
1017                        bracket();
1018                        bracketEnd[brackets] = idx;
1019                        i = brackets++;
1020                    }
1021                    
1022                    // If there's a min, rewind stream and reparse
1023
if (--bracketMin[i] > 0)
1024                    {
1025                        // Rewind stream and run it through again
1026
idx = idxBeforeTerminal;
1027                        break;
1028                    }
1029                    
1030                    // Do the right thing for maximum ({m,})
1031
if (bracketOpt[i] == bracketFinished)
1032                    {
1033                        // Drop through now and closure expression.
1034
// We are done with the {m,} expr, so skip rest
1035
closureType = '*';
1036                        bracketOpt[i] = 0;
1037                        idx = bracketEnd[i];
1038                    }
1039                    else
1040                        if (bracketOpt[i] == bracketUnbounded)
1041                        {
1042                            idx = idxBeforeTerminal;
1043                            bracketOpt[i] = bracketFinished;
1044                            break;
1045                        }
1046                        else
1047                            if (bracketOpt[i]-- > 0)
1048                            {
1049                                // Drop through to optionally close and then 'play it again sam!'
1050
idx = idxBeforeTerminal;
1051                                closureType = '?';
1052                            }
1053                            else
1054                            {
1055                                // We are done. skip the rest of {m,n} expr
1056
idx = bracketEnd[i];
1057                                break;
1058                            }
1059                }
1060                
1061                // Fall through!
1062

1063                case '?':
1064                case '*':
1065                    
1066                    if (!greedy)
1067                    {
1068                        break;
1069                    }
1070                    
1071                    if (closureType == '?')
1072                    {
1073                        // X? is compiled as (X|)
1074
nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X
1075
setNextOfEnd(ret, node (RE.OP_BRANCH, 0)); // inserted branch to option
1076
int nothing = node (RE.OP_NOTHING, 0); // which is OP_NOTHING
1077
setNextOfEnd(ret, nothing); // point (second) branch to OP_NOTHING
1078
setNextOfEnd(ret + RE.nodeSize, nothing); // point the end of X to OP_NOTHING node
1079
}
1080                    
1081                    if (closureType == '*')
1082                    {
1083                        // X* is compiled as (X{gotoX}|)
1084
nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X
1085
setNextOfEnd(ret + RE.nodeSize, node(RE.OP_BRANCH, 0)); // end of X points to an option
1086
setNextOfEnd(ret + RE.nodeSize, node(RE.OP_GOTO, 0)); // to goto
1087
setNextOfEnd(ret + RE.nodeSize, ret); // the start again
1088
setNextOfEnd(ret, node(RE.OP_BRANCH, 0)); // the other option is
1089
setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // OP_NOTHING
1090
}
1091                    break;
1092                    
1093                case '+':
1094                {
1095                    // X+ is compiled as X({gotoX}|)
1096
int branch;
1097                    branch = node(RE.OP_BRANCH, 0); // a new branch
1098
setNextOfEnd(ret, branch); // is added to the end of X
1099
setNextOfEnd(node(RE.OP_GOTO, 0), ret); // one option is to go back to the start
1100
setNextOfEnd(branch, node(RE.OP_BRANCH, 0)); // the other option
1101
setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // is OP_NOTHING
1102
}
1103                break;
1104            }
1105        }
1106        else
1107        {
1108            // Add end after closured subexpr
1109
setNextOfEnd(ret, node(RE.OP_END, 0));
1110
1111            // Actually do the closure now
1112
switch (closureType)
1113            {
1114                case '?':
1115                    nodeInsert(RE.OP_RELUCTANTMAYBE, 0, ret);
1116                    break;
1117                    
1118                case '*':
1119                    nodeInsert(RE.OP_RELUCTANTSTAR, 0, ret);
1120                    break;
1121
1122                case '+':
1123                    nodeInsert(RE.OP_RELUCTANTPLUS, 0, ret);
1124                    break;
1125            }
1126
1127            // Point to the expr after the closure
1128
setNextOfEnd(ret, lenInstruction);
1129        }
1130        return ret;
1131    }
1132
1133    /**
1134     * Compile one branch of an or operator (implements concatenation)
1135     * @param flags Flags passed by reference
1136     * @return Pointer to branch node
1137     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
1138     */

1139    int branch(int[] flags) throws RESyntaxException
1140    {
1141        // Get each possibly closured piece and concat
1142
int node;
1143        int ret = node(RE.OP_BRANCH, 0);
1144        int chain = -1;
1145        int[] closureFlags = new int[1];
1146        boolean nullable = true;
1147        while (idx < len && pattern.charAt(idx) != '|' && pattern.charAt(idx) != ')')
1148        {
1149            // Get new node
1150
closureFlags[0] = NODE_NORMAL;
1151            node = closure(closureFlags);
1152            if (closureFlags[0] == NODE_NORMAL)
1153            {
1154                nullable = false;
1155            }
1156
1157            // If there's a chain, append to the end
1158
if (chain != -1)
1159            {
1160                setNextOfEnd(chain, node);
1161            }
1162
1163            // Chain starts at current
1164
chain = node;
1165        }
1166
1167        // If we don't run loop, make a nothing node
1168
if (chain == -1)
1169        {
1170            node(RE.OP_NOTHING, 0);
1171        }
1172
1173        // Set nullable flag for this branch
1174
if (nullable)
1175        {
1176            flags[0] |= NODE_NULLABLE;
1177        }
1178        return ret;
1179    }
1180
1181    /**
1182     * Compile an expression with possible parens around it. Paren matching
1183     * is done at this level so we can tie the branch tails together.
1184     * @param flags Flag value passed by reference
1185     * @return Node index of expression in instruction array
1186     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
1187     */

1188    int expr(int[] flags) throws RESyntaxException
1189    {
1190        // Create open paren node unless we were called from the top level (which has no parens)
1191
boolean paren = false;
1192        int ret = -1;
1193        int closeParens = parens;
1194        if ((flags[0] & NODE_TOPLEVEL) == 0 && pattern.charAt(idx) == '(')
1195        {
1196            idx++;
1197            paren = true;
1198            ret = node(RE.OP_OPEN, parens++);
1199        }
1200        flags[0] &= ~NODE_TOPLEVEL;
1201
1202        // Create a branch node
1203
int branch = branch(flags);
1204        if (ret == -1)
1205        {
1206            ret = branch;
1207        }
1208        else
1209        {
1210            setNextOfEnd(ret, branch);
1211        }
1212
1213        // Loop through branches
1214
while (idx < len && pattern.charAt(idx) == '|')
1215        {
1216            idx++;
1217            branch = branch(flags);
1218            setNextOfEnd(ret, branch);
1219        }
1220
1221        // Create an ending node (either a close paren or an OP_END)
1222
int end;
1223        if (paren)
1224        {
1225            if (idx < len && pattern.charAt(idx) == ')')
1226            {
1227                idx++;
1228            }
1229            else
1230            {
1231                syntaxError("Missing close paren");
1232            }
1233            end = node(RE.OP_CLOSE, closeParens);
1234        }
1235        else
1236        {
1237            end = node(RE.OP_END, 0);
1238        }
1239
1240        // Append the ending node to the ret nodelist
1241
setNextOfEnd(ret, end);
1242
1243        // Hook the ends of each branch to the end node
1244
for (int next = -1, i = ret; next != 0; next = instruction[i + RE.offsetNext], i += next)
1245        {
1246            // If branch, make the end of the branch's operand chain point to the end node.
1247
if (instruction[i + RE.offsetOpcode] == RE.OP_BRANCH)
1248            {
1249                setNextOfEnd(i + RE.nodeSize, end);
1250            }
1251        }
1252
1253        // Return the node list
1254
return ret;
1255    }
1256
1257    /**
1258     * Compiles a regular expression pattern into a program runnable by the pattern
1259     * matcher class 'RE'.
1260     * @param pattern Regular expression pattern to compile (see RECompiler class
1261     * for details).
1262     * @return A compiled regular expression program.
1263     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
1264     * @see RECompiler
1265     * @see RE
1266     */

1267    public REProgram compile(String JavaDoc pattern) throws RESyntaxException
1268    {
1269        // Initialize variables for compilation
1270
this.pattern = pattern; // Save pattern in instance variable
1271
len = pattern.length(); // Precompute pattern length for speed
1272
idx = 0; // Set parsing index to the first character
1273
lenInstruction = 0; // Set emitted instruction count to zero
1274
parens = 1; // Set paren level to 1 (the implicit outer parens)
1275
brackets = 0; // No bracketed closures yet
1276

1277        // Initialize pass by reference flags value
1278
int[] flags = { NODE_TOPLEVEL };
1279
1280        // Parse expression
1281
expr(flags);
1282
1283        // Should be at end of input
1284
if (idx != len)
1285        {
1286            if (pattern.charAt(idx) == ')')
1287            {
1288                syntaxError("Unmatched close paren");
1289            }
1290            syntaxError("Unexpected input remains");
1291        }
1292
1293        // Return the result
1294
char[] ins = new char[lenInstruction];
1295        System.arraycopy(instruction, 0, ins, 0, lenInstruction);
1296        return new REProgram(ins);
1297    }
1298
1299    /**
1300     * Local, nested class for maintaining character ranges for character classes.
1301     */

1302    class RERange
1303    {
1304        int size = 16; // Capacity of current range arrays
1305
int[] minRange = new int[size]; // Range minima
1306
int[] maxRange = new int[size]; // Range maxima
1307
int num = 0; // Number of range array elements in use
1308

1309        /**
1310         * Deletes the range at a given index from the range lists
1311         * @param index Index of range to delete from minRange and maxRange arrays.
1312         */

1313        void delete(int index)
1314        {
1315            // Return if no elements left or index is out of range
1316
if (num == 0 || index >= num)
1317            {
1318                return;
1319            }
1320            
1321            // Move elements down
1322
while (index++ < num)
1323            {
1324                if (index - 1 >= 0)
1325                {
1326                    minRange[index-1] = minRange[index];
1327                    maxRange[index-1] = maxRange[index];
1328                }
1329            }
1330            
1331            // One less element now
1332
num--;
1333        }
1334        
1335        /**
1336         * Merges a range into the range list, coalescing ranges if possible.
1337         * @param min Minimum end of range
1338         * @param max Maximum end of range
1339         */

1340        void merge(int min, int max)
1341        {
1342            // Loop through ranges
1343
for (int i = 0; i < num; i++)
1344            {
1345                // Min-max is subsumed by minRange[i]-maxRange[i]
1346
if (min >= minRange[i] && max <= maxRange[i])
1347                {
1348                    return;
1349                }
1350                
1351                // Min-max subsumes minRange[i]-maxRange[i]
1352
else if (min <= minRange[i] && max >= maxRange[i])
1353                {
1354                    delete(i);
1355                    merge(min, max);
1356                    return;
1357                }
1358                
1359                // Min is in the range, but max is outside
1360
else if (min >= minRange[i] && min <= maxRange[i])
1361                {
1362                    delete(i);
1363                    min = minRange[i];
1364                    merge(min, max);
1365                    return;
1366                }
1367                
1368                // Max is in the range, but min is outside
1369
else if (max >= minRange[i] && max <= maxRange[i])
1370                {
1371                    delete(i);
1372                    max = maxRange[i];
1373                    merge(min, max);
1374                    return;
1375                }
1376            }
1377            
1378            // Must not overlap any other ranges
1379
if (num >= size)
1380            {
1381                size *= 2;
1382                int[] newMin = new int[size];
1383                int[] newMax = new int[size];
1384                System.arraycopy(minRange, 0, newMin, 0, num);
1385                System.arraycopy(maxRange, 0, newMax, 0, num);
1386                minRange = newMin;
1387                maxRange = newMax;
1388            }
1389            minRange[num] = min;
1390            maxRange[num] = max;
1391            num++;
1392        }
1393        
1394        /**
1395         * Removes a range by deleting or shrinking all other ranges
1396         * @param min Minimum end of range
1397         * @param max Maximum end of range
1398         */

1399        void remove(int min, int max)
1400        {
1401            // Loop through ranges
1402
for (int i = 0; i < num; i++)
1403            {
1404                // minRange[i]-maxRange[i] is subsumed by min-max
1405
if (minRange[i] >= min && maxRange[i] <= max)
1406                {
1407                    delete(i);
1408                    i--;
1409                    return;
1410                }
1411                
1412                // min-max is subsumed by minRange[i]-maxRange[i]
1413
else if (min >= minRange[i] && max <= maxRange[i])
1414                {
1415                    int minr = minRange[i];
1416                    int maxr = maxRange[i];
1417                    delete(i);
1418                    if (minr < min - 1)
1419                    {
1420                        merge(minr, min - 1);
1421                    }
1422                    if (max + 1 < maxr)
1423                    {
1424                        merge(max + 1, maxr);
1425                    }
1426                    return;
1427                }
1428                
1429                // minRange is in the range, but maxRange is outside
1430
else if (minRange[i] >= min && minRange[i] <= max)
1431                {
1432                    minRange[i] = max + 1;
1433                    return;
1434                }
1435                
1436                // maxRange is in the range, but minRange is outside
1437
else if (maxRange[i] >= min && maxRange[i] <= max)
1438                {
1439                    maxRange[i] = min - 1;
1440                    return;
1441                }
1442            }
1443        }
1444        
1445        /**
1446         * Includes (or excludes) the range from min to max, inclusive.
1447         * @param min Minimum end of range
1448         * @param max Maximum end of range
1449         * @param include True if range should be included. False otherwise.
1450         */

1451        void include(int min, int max, boolean include)
1452        {
1453            if (include)
1454            {
1455                merge(min, max);
1456            }
1457            else
1458            {
1459                remove(min, max);
1460            }
1461        }
1462        
1463        /**
1464         * Includes a range with the same min and max
1465         * @param minmax Minimum and maximum end of range (inclusive)
1466         * @param include True if range should be included. False otherwise.
1467         */

1468        void include(char minmax, boolean include)
1469        {
1470            include(minmax, minmax, include);
1471        }
1472    }
1473}
1474
Popular Tags