KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > regexp > RECompiler


1 package org.apache.regexp;
2
3 /*
4  * ====================================================================
5  *
6  * The Apache Software License, Version 1.1
7  *
8  * Copyright (c) 1999-2003 The Apache Software Foundation. All rights
9  * reserved.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  *
15  * 1. Redistributions of source code must retain the above copyright
16  * notice, this list of conditions and the following disclaimer.
17  *
18  * 2. Redistributions in binary form must reproduce the above copyright
19  * notice, this list of conditions and the following disclaimer in
20  * the documentation and/or other materials provided with the
21  * distribution.
22  *
23  * 3. The end-user documentation included with the redistribution, if
24  * any, must include the following acknowlegement:
25  * "This product includes software developed by the
26  * Apache Software Foundation (http://www.apache.org/)."
27  * Alternately, this acknowlegement may appear in the software itself,
28  * if and wherever such third-party acknowlegements normally appear.
29  *
30  * 4. The names "The Jakarta Project", "Jakarta-Regexp", and "Apache Software
31  * Foundation" must not be used to endorse or promote products derived
32  * from this software without prior written permission. For written
33  * permission, please contact apache@apache.org.
34  *
35  * 5. Products derived from this software may not be called "Apache"
36  * nor may "Apache" appear in their names without prior written
37  * permission of the Apache Group.
38  *
39  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
40  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
41  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
42  * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
43  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
45  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
46  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
47  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
48  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
49  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50  * SUCH DAMAGE.
51  * ====================================================================
52  *
53  * This software consists of voluntary contributions made by many
54  * individuals on behalf of the Apache Software Foundation. For more
55  * information on the Apache Software Foundation, please see
56  * <http://www.apache.org/>.
57  *
58  */

59
60 import org.apache.regexp.RE;
61 import java.util.Hashtable JavaDoc;
62
63 /**
64  * A regular expression compiler class. This class compiles a pattern string into a
65  * regular expression program interpretable by the RE evaluator class. The 'recompile'
66  * command line tool uses this compiler to pre-compile regular expressions for use
67  * with RE. For a description of the syntax accepted by RECompiler and what you can
68  * do with regular expressions, see the documentation for the RE matcher class.
69  *
70  * @see RE
71  * @see recompile
72  *
73  * @author <a HREF="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
74  * @author <a HREF="mailto:gholam@xtra.co.nz">Michael McCallum</a>
75  * @version $Id: RECompiler.java,v 1.11 2003/09/01 18:31:53 vgritsenko Exp $
76  */

77 public class RECompiler
78 {
79     // The compiled program
80
char[] instruction; // The compiled RE 'program' instruction buffer
81
int lenInstruction; // The amount of the program buffer currently in use
82

83     // Input state for compiling regular expression
84
String JavaDoc pattern; // Input string
85
int len; // Length of the pattern string
86
int idx; // Current input index into ac
87
int parens; // Total number of paren pairs
88

89     // Node flags
90
static final int NODE_NORMAL = 0; // No flags (nothing special)
91
static final int NODE_NULLABLE = 1; // True if node is potentially null
92
static final int NODE_TOPLEVEL = 2; // True if top level expr
93

94     // Special types of 'escapes'
95
static final char ESC_MASK = 0xfff0; // Escape complexity mask
96
static final char ESC_BACKREF = 0xffff; // Escape is really a backreference
97
static final char ESC_COMPLEX = 0xfffe; // Escape isn't really a true character
98
static final char ESC_CLASS = 0xfffd; // Escape represents a whole class of characters
99

100     // {m,n} stacks
101
int maxBrackets = 10; // Maximum number of bracket pairs
102
static final int bracketUnbounded = -1; // Unbounded value
103
int brackets = 0; // Number of bracket sets
104
int[] bracketStart = null; // Starting point
105
int[] bracketEnd = null; // Ending point
106
int[] bracketMin = null; // Minimum number of matches
107
int[] bracketOpt = null; // Additional optional matches
108

109     // Lookup table for POSIX character class names
110
static Hashtable JavaDoc hashPOSIX = new Hashtable JavaDoc();
111     static
112     {
113         hashPOSIX.put("alnum", new Character JavaDoc(RE.POSIX_CLASS_ALNUM));
114         hashPOSIX.put("alpha", new Character JavaDoc(RE.POSIX_CLASS_ALPHA));
115         hashPOSIX.put("blank", new Character JavaDoc(RE.POSIX_CLASS_BLANK));
116         hashPOSIX.put("cntrl", new Character JavaDoc(RE.POSIX_CLASS_CNTRL));
117         hashPOSIX.put("digit", new Character JavaDoc(RE.POSIX_CLASS_DIGIT));
118         hashPOSIX.put("graph", new Character JavaDoc(RE.POSIX_CLASS_GRAPH));
119         hashPOSIX.put("lower", new Character JavaDoc(RE.POSIX_CLASS_LOWER));
120         hashPOSIX.put("print", new Character JavaDoc(RE.POSIX_CLASS_PRINT));
121         hashPOSIX.put("punct", new Character JavaDoc(RE.POSIX_CLASS_PUNCT));
122         hashPOSIX.put("space", new Character JavaDoc(RE.POSIX_CLASS_SPACE));
123         hashPOSIX.put("upper", new Character JavaDoc(RE.POSIX_CLASS_UPPER));
124         hashPOSIX.put("xdigit", new Character JavaDoc(RE.POSIX_CLASS_XDIGIT));
125         hashPOSIX.put("javastart", new Character JavaDoc(RE.POSIX_CLASS_JSTART));
126         hashPOSIX.put("javapart", new Character JavaDoc(RE.POSIX_CLASS_JPART));
127     }
128
129     /**
130      * Constructor. Creates (initially empty) storage for a regular expression program.
131      */

132     public RECompiler()
133     {
134         // Start off with a generous, yet reasonable, initial size
135
instruction = new char[128];
136         lenInstruction = 0;
137     }
138
139     /**
140      * Ensures that n more characters can fit in the program buffer.
141      * If n more can't fit, then the size is doubled until it can.
142      * @param n Number of additional characters to ensure will fit.
143      */

144     void ensure(int n)
145     {
146         // Get current program length
147
int curlen = instruction.length;
148
149         // If the current length + n more is too much
150
if (lenInstruction + n >= curlen)
151         {
152             // Double the size of the program array until n more will fit
153
while (lenInstruction + n >= curlen)
154             {
155                 curlen *= 2;
156             }
157
158             // Allocate new program array and move data into it
159
char[] newInstruction = new char[curlen];
160             System.arraycopy(instruction, 0, newInstruction, 0, lenInstruction);
161             instruction = newInstruction;
162         }
163     }
164
165     /**
166      * Emit a single character into the program stream.
167      * @param c Character to add
168      */

169     void emit(char c)
170     {
171         // Make room for character
172
ensure(1);
173
174         // Add character
175
instruction[lenInstruction++] = c;
176     }
177
178     /**
179      * Inserts a node with a given opcode and opdata at insertAt. The node relative next
180      * pointer is initialized to 0.
181      * @param opcode Opcode for new node
182      * @param opdata Opdata for new node (only the low 16 bits are currently used)
183      * @param insertAt Index at which to insert the new node in the program
184      */

185     void nodeInsert(char opcode, int opdata, int insertAt)
186     {
187         // Make room for a new node
188
ensure(RE.nodeSize);
189
190         // Move everything from insertAt to the end down nodeSize elements
191
System.arraycopy(instruction, insertAt, instruction, insertAt + RE.nodeSize, lenInstruction - insertAt);
192         instruction[insertAt + RE.offsetOpcode] = opcode;
193         instruction[insertAt + RE.offsetOpdata] = (char)opdata;
194         instruction[insertAt + RE.offsetNext] = 0;
195         lenInstruction += RE.nodeSize;
196     }
197
198     /**
199      * Appends a node to the end of a node chain
200      * @param node Start of node chain to traverse
201      * @param pointTo Node to have the tail of the chain point to
202      */

203     void setNextOfEnd(int node, int pointTo)
204     {
205         // Traverse the chain until the next offset is 0
206
int next = instruction[node + RE.offsetNext];
207         // while the 'node' is not the last in the chain
208
// and the 'node' is not the last in the program.
209
while ( next != 0 && node < lenInstruction )
210         {
211             // if the node we are supposed to point to is in the chain then
212
// point to the end of the program instead.
213
// Michael McCallum <gholam@xtra.co.nz>
214
// FIXME: // This is a _hack_ to stop infinite programs.
215
// I believe that the implementation of the reluctant matches is wrong but
216
// have not worked out a better way yet.
217
if ( node == pointTo ) {
218               pointTo = lenInstruction;
219             }
220             node += next;
221             next = instruction[node + RE.offsetNext];
222         }
223         // if we have reached the end of the program then dont set the pointTo.
224
// im not sure if this will break any thing but passes all the tests.
225
if ( node < lenInstruction ) {
226             // Point the last node in the chain to pointTo.
227
instruction[node + RE.offsetNext] = (char)(short)(pointTo - node);
228         }
229     }
230
231     /**
232      * Adds a new node
233      * @param opcode Opcode for node
234      * @param opdata Opdata for node (only the low 16 bits are currently used)
235      * @return Index of new node in program
236      */

237     int node(char opcode, int opdata)
238     {
239         // Make room for a new node
240
ensure(RE.nodeSize);
241
242         // Add new node at end
243
instruction[lenInstruction + RE.offsetOpcode] = opcode;
244         instruction[lenInstruction + RE.offsetOpdata] = (char)opdata;
245         instruction[lenInstruction + RE.offsetNext] = 0;
246         lenInstruction += RE.nodeSize;
247
248         // Return index of new node
249
return lenInstruction - RE.nodeSize;
250     }
251
252
253     /**
254      * Throws a new internal error exception
255      * @exception Error Thrown in the event of an internal error.
256      */

257     void internalError() throws Error JavaDoc
258     {
259         throw new Error JavaDoc("Internal error!");
260     }
261
262     /**
263      * Throws a new syntax error exception
264      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
265      */

266     void syntaxError(String JavaDoc s) throws RESyntaxException
267     {
268         throw new RESyntaxException(s);
269     }
270
271     /**
272      * Allocate storage for brackets only as needed
273      */

274     void allocBrackets()
275     {
276         // Allocate bracket stacks if not already done
277
if (bracketStart == null)
278         {
279             // Allocate storage
280
bracketStart = new int[maxBrackets];
281             bracketEnd = new int[maxBrackets];
282             bracketMin = new int[maxBrackets];
283             bracketOpt = new int[maxBrackets];
284
285             // Initialize to invalid values
286
for (int i = 0; i < maxBrackets; i++)
287             {
288                 bracketStart[i] = bracketEnd[i] = bracketMin[i] = bracketOpt[i] = -1;
289             }
290         }
291     }
292
293     /** Enlarge storage for brackets only as needed. */
294     synchronized void reallocBrackets() {
295         // trick the tricky
296
if (bracketStart == null) {
297             allocBrackets();
298         }
299
300         int new_size = maxBrackets * 2;
301         int[] new_bS = new int[new_size];
302         int[] new_bE = new int[new_size];
303         int[] new_bM = new int[new_size];
304         int[] new_bO = new int[new_size];
305         // Initialize to invalid values
306
for (int i=brackets; i<new_size; i++) {
307             new_bS[i] = new_bE[i] = new_bM[i] = new_bO[i] = -1;
308         }
309         System.arraycopy(bracketStart,0, new_bS,0, brackets);
310         System.arraycopy(bracketEnd,0, new_bE,0, brackets);
311         System.arraycopy(bracketMin,0, new_bM,0, brackets);
312         System.arraycopy(bracketOpt,0, new_bO,0, brackets);
313         bracketStart = new_bS;
314         bracketEnd = new_bE;
315         bracketMin = new_bM;
316         bracketOpt = new_bO;
317         maxBrackets = new_size;
318     }
319
320     /**
321      * Match bracket {m,n} expression put results in bracket member variables
322      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
323      */

324     void bracket() throws RESyntaxException
325     {
326         // Current character must be a '{'
327
if (idx >= len || pattern.charAt(idx++) != '{')
328         {
329             internalError();
330         }
331
332         // Next char must be a digit
333
if (idx >= len || !Character.isDigit(pattern.charAt(idx)))
334         {
335             syntaxError("Expected digit");
336         }
337
338         // Get min ('m' of {m,n}) number
339
StringBuffer JavaDoc number = new StringBuffer JavaDoc();
340         while (idx < len && Character.isDigit(pattern.charAt(idx)))
341         {
342             number.append(pattern.charAt(idx++));
343         }
344         try
345         {
346             bracketMin[brackets] = Integer.parseInt(number.toString());
347         }
348         catch (NumberFormatException JavaDoc e)
349         {
350             syntaxError("Expected valid number");
351         }
352
353         // If out of input, fail
354
if (idx >= len)
355         {
356             syntaxError("Expected comma or right bracket");
357         }
358
359         // If end of expr, optional limit is 0
360
if (pattern.charAt(idx) == '}')
361         {
362             idx++;
363             bracketOpt[brackets] = 0;
364             return;
365         }
366
367         // Must have at least {m,} and maybe {m,n}.
368
if (idx >= len || pattern.charAt(idx++) != ',')
369         {
370             syntaxError("Expected comma");
371         }
372
373         // If out of input, fail
374
if (idx >= len)
375         {
376             syntaxError("Expected comma or right bracket");
377         }
378
379         // If {m,} max is unlimited
380
if (pattern.charAt(idx) == '}')
381         {
382             idx++;
383             bracketOpt[brackets] = bracketUnbounded;
384             return;
385         }
386
387         // Next char must be a digit
388
if (idx >= len || !Character.isDigit(pattern.charAt(idx)))
389         {
390             syntaxError("Expected digit");
391         }
392
393         // Get max number
394
number.setLength(0);
395         while (idx < len && Character.isDigit(pattern.charAt(idx)))
396         {
397             number.append(pattern.charAt(idx++));
398         }
399         try
400         {
401             bracketOpt[brackets] = Integer.parseInt(number.toString()) - bracketMin[brackets];
402         }
403         catch (NumberFormatException JavaDoc e)
404         {
405             syntaxError("Expected valid number");
406         }
407
408         // Optional repetitions must be >= 0
409
if (bracketOpt[brackets] < 0)
410         {
411             syntaxError("Bad range");
412         }
413
414         // Must have close brace
415
if (idx >= len || pattern.charAt(idx++) != '}')
416         {
417             syntaxError("Missing close brace");
418         }
419     }
420
421     /**
422      * Match an escape sequence. Handles quoted chars and octal escapes as well
423      * as normal escape characters. Always advances the input stream by the
424      * right amount. This code "understands" the subtle difference between an
425      * octal escape and a backref. You can access the type of ESC_CLASS or
426      * ESC_COMPLEX or ESC_BACKREF by looking at pattern[idx - 1].
427      * @return ESC_* code or character if simple escape
428      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
429      */

430     char escape() throws RESyntaxException
431     {
432         // "Shouldn't" happen
433
if (pattern.charAt(idx) != '\\')
434         {
435             internalError();
436         }
437
438         // Escape shouldn't occur as last character in string!
439
if (idx + 1 == len)
440         {
441             syntaxError("Escape terminates string");
442         }
443
444         // Switch on character after backslash
445
idx += 2;
446         char escapeChar = pattern.charAt(idx - 1);
447         switch (escapeChar)
448         {
449             case RE.E_BOUND:
450             case RE.E_NBOUND:
451                 return ESC_COMPLEX;
452
453             case RE.E_ALNUM:
454             case RE.E_NALNUM:
455             case RE.E_SPACE:
456             case RE.E_NSPACE:
457             case RE.E_DIGIT:
458             case RE.E_NDIGIT:
459                 return ESC_CLASS;
460
461             case 'u':
462             case 'x':
463                 {
464                     // Exact required hex digits for escape type
465
int hexDigits = (escapeChar == 'u' ? 4 : 2);
466
467                     // Parse up to hexDigits characters from input
468
int val = 0;
469                     for ( ; idx < len && hexDigits-- > 0; idx++)
470                     {
471                         // Get char
472
char c = pattern.charAt(idx);
473
474                         // If it's a hexadecimal digit (0-9)
475
if (c >= '0' && c <= '9')
476                         {
477                             // Compute new value
478
val = (val << 4) + c - '0';
479                         }
480                         else
481                         {
482                             // If it's a hexadecimal letter (a-f)
483
c = Character.toLowerCase(c);
484                             if (c >= 'a' && c <= 'f')
485                             {
486                                 // Compute new value
487
val = (val << 4) + (c - 'a') + 10;
488                             }
489                             else
490                             {
491                                 // If it's not a valid digit or hex letter, the escape must be invalid
492
// because hexDigits of input have not been absorbed yet.
493
syntaxError("Expected " + hexDigits + " hexadecimal digits after \\" + escapeChar);
494                             }
495                         }
496                     }
497                     return (char)val;
498                 }
499
500             case 't':
501                 return '\t';
502
503             case 'n':
504                 return '\n';
505
506             case 'r':
507                 return '\r';
508
509             case 'f':
510                 return '\f';
511
512             case '0':
513             case '1':
514             case '2':
515             case '3':
516             case '4':
517             case '5':
518             case '6':
519             case '7':
520             case '8':
521             case '9':
522
523                 // An octal escape starts with a 0 or has two digits in a row
524
if ((idx < len && Character.isDigit(pattern.charAt(idx))) || escapeChar == '0')
525                 {
526                     // Handle \nnn octal escapes
527
int val = escapeChar - '0';
528                     if (idx < len && Character.isDigit(pattern.charAt(idx)))
529                     {
530                         val = ((val << 3) + (pattern.charAt(idx++) - '0'));
531                         if (idx < len && Character.isDigit(pattern.charAt(idx)))
532                         {
533                             val = ((val << 3) + (pattern.charAt(idx++) - '0'));
534                         }
535                     }
536                     return (char)val;
537                 }
538
539                 // It's actually a backreference (\[1-9]), not an escape
540
return ESC_BACKREF;
541
542             default:
543
544                 // Simple quoting of a character
545
return escapeChar;
546         }
547     }
548
549     /**
550      * Compile a character class
551      * @return Index of class node
552      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
553      */

554     int characterClass() throws RESyntaxException
555     {
556         // Check for bad calling or empty class
557
if (pattern.charAt(idx) != '[')
558         {
559             internalError();
560         }
561
562         // Check for unterminated or empty class
563
if ((idx + 1) >= len || pattern.charAt(++idx) == ']')
564         {
565             syntaxError("Empty or unterminated class");
566         }
567
568         // Check for POSIX character class
569
if (idx < len && pattern.charAt(idx) == ':')
570         {
571             // Skip colon
572
idx++;
573
574             // POSIX character classes are denoted with lowercase ASCII strings
575
int idxStart = idx;
576             while (idx < len && pattern.charAt(idx) >= 'a' && pattern.charAt(idx) <= 'z')
577             {
578                 idx++;
579             }
580
581             // Should be a ":]" to terminate the POSIX character class
582
if ((idx + 1) < len && pattern.charAt(idx) == ':' && pattern.charAt(idx + 1) == ']')
583             {
584                 // Get character class
585
String JavaDoc charClass = pattern.substring(idxStart, idx);
586
587                 // Select the POSIX class id
588
Character JavaDoc i = (Character JavaDoc)hashPOSIX.get(charClass);
589                 if (i != null)
590                 {
591                     // Move past colon and right bracket
592
idx += 2;
593
594                     // Return new POSIX character class node
595
return node(RE.OP_POSIXCLASS, i.charValue());
596                 }
597                 syntaxError("Invalid POSIX character class '" + charClass + "'");
598             }
599             syntaxError("Invalid POSIX character class syntax");
600         }
601
602         // Try to build a class. Create OP_ANYOF node
603
int ret = node(RE.OP_ANYOF, 0);
604
605         // Parse class declaration
606
char CHAR_INVALID = Character.MAX_VALUE;
607         char last = CHAR_INVALID;
608         char simpleChar = 0;
609         boolean include = true;
610         boolean definingRange = false;
611         int idxFirst = idx;
612         char rangeStart = Character.MIN_VALUE;
613         char rangeEnd;
614         RERange range = new RERange();
615         while (idx < len && pattern.charAt(idx) != ']')
616         {
617
618             switchOnCharacter:
619
620             // Switch on character
621
switch (pattern.charAt(idx))
622             {
623                 case '^':
624                     include = !include;
625                     if (idx == idxFirst)
626                     {
627                         range.include(Character.MIN_VALUE, Character.MAX_VALUE, true);
628                     }
629                     idx++;
630                     continue;
631
632                 case '\\':
633                 {
634                     // Escape always advances the stream
635
char c;
636                     switch (c = escape ())
637                     {
638                         case ESC_COMPLEX:
639                         case ESC_BACKREF:
640
641                             // Word boundaries and backrefs not allowed in a character class!
642
syntaxError("Bad character class");
643
644                         case ESC_CLASS:
645
646                             // Classes can't be an endpoint of a range
647
if (definingRange)
648                             {
649                                 syntaxError("Bad character class");
650                             }
651
652                             // Handle specific type of class (some are ok)
653
switch (pattern.charAt(idx - 1))
654                             {
655                                 case RE.E_NSPACE:
656                                 case RE.E_NDIGIT:
657                                 case RE.E_NALNUM:
658                                     syntaxError("Bad character class");
659
660                                 case RE.E_SPACE:
661                                     range.include('\t', include);
662                                     range.include('\r', include);
663                                     range.include('\f', include);
664                                     range.include('\n', include);
665                                     range.include('\b', include);
666                                     range.include(' ', include);
667                                     break;
668
669                                 case RE.E_ALNUM:
670                                     range.include('a', 'z', include);
671                                     range.include('A', 'Z', include);
672                                     range.include('_', include);
673
674                                     // Fall through!
675

676                                 case RE.E_DIGIT:
677                                     range.include('0', '9', include);
678                                     break;
679                             }
680
681                             // Make last char invalid (can't be a range start)
682
last = CHAR_INVALID;
683                             break;
684
685                         default:
686
687                             // Escape is simple so treat as a simple char
688
simpleChar = c;
689                             break switchOnCharacter;
690                     }
691                 }
692                 continue;
693
694                 case '-':
695
696                     // Start a range if one isn't already started
697
if (definingRange)
698                     {
699                         syntaxError("Bad class range");
700                     }
701                     definingRange = true;
702
703                     // If no last character, start of range is 0
704
rangeStart = (last == CHAR_INVALID ? 0 : last);
705
706                     // Premature end of range. define up to Character.MAX_VALUE
707
if ((idx + 1) < len && pattern.charAt(++idx) == ']')
708                     {
709                         simpleChar = Character.MAX_VALUE;
710                         break;
711                     }
712                     continue;
713
714                 default:
715                     simpleChar = pattern.charAt(idx++);
716                     break;
717             }
718
719             // Handle simple character simpleChar
720
if (definingRange)
721             {
722                 // if we are defining a range make it now
723
rangeEnd = simpleChar;
724
725                 // Actually create a range if the range is ok
726
if (rangeStart >= rangeEnd)
727                 {
728                     syntaxError("Bad character class");
729                 }
730                 range.include(rangeStart, rangeEnd, include);
731
732                 // We are done defining the range
733
last = CHAR_INVALID;
734                 definingRange = false;
735             }
736             else
737             {
738                 // If simple character and not start of range, include it
739
if (idx >= len || pattern.charAt(idx) != '-')
740                 {
741                     range.include(simpleChar, include);
742                 }
743                 last = simpleChar;
744             }
745         }
746
747         // Shouldn't be out of input
748
if (idx == len)
749         {
750             syntaxError("Unterminated character class");
751         }
752
753         // Absorb the ']' end of class marker
754
idx++;
755
756         // Emit character class definition
757
instruction[ret + RE.offsetOpdata] = (char)range.num;
758         for (int i = 0; i < range.num; i++)
759         {
760             emit((char)range.minRange[i]);
761             emit((char)range.maxRange[i]);
762         }
763         return ret;
764     }
765
766     /**
767      * Absorb an atomic character string. This method is a little tricky because
768      * it can un-include the last character of string if a closure operator follows.
769      * This is correct because *+? have higher precedence than concatentation (thus
770      * ABC* means AB(C*) and NOT (ABC)*).
771      * @return Index of new atom node
772      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
773      */

774     int atom() throws RESyntaxException
775     {
776         // Create a string node
777
int ret = node(RE.OP_ATOM, 0);
778
779         // Length of atom
780
int lenAtom = 0;
781
782         // Loop while we've got input
783

784         atomLoop:
785
786         while (idx < len)
787         {
788             // Is there a next char?
789
if ((idx + 1) < len)
790             {
791                 char c = pattern.charAt(idx + 1);
792
793                 // If the next 'char' is an escape, look past the whole escape
794
if (pattern.charAt(idx) == '\\')
795                 {
796                     int idxEscape = idx;
797                     escape();
798                     if (idx < len)
799                     {
800                         c = pattern.charAt(idx);
801                     }
802                     idx = idxEscape;
803                 }
804
805                 // Switch on next char
806
switch (c)
807                 {
808                     case '{':
809                     case '?':
810                     case '*':
811                     case '+':
812
813                         // If the next character is a closure operator and our atom is non-empty, the
814
// current character should bind to the closure operator rather than the atom
815
if (lenAtom != 0)
816                         {
817                             break atomLoop;
818                         }
819                 }
820             }
821
822             // Switch on current char
823
switch (pattern.charAt(idx))
824             {
825                 case ']':
826                 case '^':
827                 case '$':
828                 case '.':
829                 case '[':
830                 case '(':
831                 case ')':
832                 case '|':
833                     break atomLoop;
834
835                 case '{':
836                 case '?':
837                 case '*':
838                 case '+':
839
840                     // We should have an atom by now
841
if (lenAtom == 0)
842                     {
843                         // No atom before closure
844
syntaxError("Missing operand to closure");
845                     }
846                     break atomLoop;
847
848                 case '\\':
849
850                     {
851                         // Get the escaped character (advances input automatically)
852
int idxBeforeEscape = idx;
853                         char c = escape();
854
855                         // Check if it's a simple escape (as opposed to, say, a backreference)
856
if ((c & ESC_MASK) == ESC_MASK)
857                         {
858                             // Not a simple escape, so backup to where we were before the escape.
859
idx = idxBeforeEscape;
860                             break atomLoop;
861                         }
862
863                         // Add escaped char to atom
864
emit(c);
865                         lenAtom++;
866                     }
867                     break;
868
869                 default:
870
871                     // Add normal character to atom
872
emit(pattern.charAt(idx++));
873                     lenAtom++;
874                     break;
875             }
876         }
877
878         // This "shouldn't" happen
879
if (lenAtom == 0)
880         {
881             internalError();
882         }
883
884         // Emit the atom length into the program
885
instruction[ret + RE.offsetOpdata] = (char)lenAtom;
886         return ret;
887     }
888
889     /**
890      * Match a terminal node.
891      * @param flags Flags
892      * @return Index of terminal node (closeable)
893      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
894      */

895     int terminal(int[] flags) throws RESyntaxException
896     {
897         switch (pattern.charAt(idx))
898         {
899         case RE.OP_EOL:
900         case RE.OP_BOL:
901         case RE.OP_ANY:
902             return node(pattern.charAt(idx++), 0);
903
904         case '[':
905             return characterClass();
906
907         case '(':
908             return expr(flags);
909
910         case ')':
911             syntaxError("Unexpected close paren");
912
913         case '|':
914             internalError();
915
916         case ']':
917             syntaxError("Mismatched class");
918
919         case 0:
920             syntaxError("Unexpected end of input");
921
922         case '?':
923         case '+':
924         case '{':
925         case '*':
926             syntaxError("Missing operand to closure");
927
928         case '\\':
929             {
930                 // Don't forget, escape() advances the input stream!
931
int idxBeforeEscape = idx;
932
933                 // Switch on escaped character
934
switch (escape())
935                 {
936                     case ESC_CLASS:
937                     case ESC_COMPLEX:
938                         flags[0] &= ~NODE_NULLABLE;
939                         return node(RE.OP_ESCAPE, pattern.charAt(idx - 1));
940
941                     case ESC_BACKREF:
942                         {
943                             char backreference = (char)(pattern.charAt(idx - 1) - '0');
944                             if (parens <= backreference)
945                             {
946                                 syntaxError("Bad backreference");
947                             }
948                             flags[0] |= NODE_NULLABLE;
949                             return node(RE.OP_BACKREF, backreference);
950                         }
951
952                     default:
953
954                         // We had a simple escape and we want to have it end up in
955
// an atom, so we back up and fall though to the default handling
956
idx = idxBeforeEscape;
957                         flags[0] &= ~NODE_NULLABLE;
958                         break;
959                 }
960             }
961         }
962
963         // Everything above either fails or returns.
964
// If it wasn't one of the above, it must be the start of an atom.
965
flags[0] &= ~NODE_NULLABLE;
966         return atom();
967     }
968
969     /**
970      * Compile a possibly closured terminal
971      * @param flags Flags passed by reference
972      * @return Index of closured node
973      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
974      */

975     int closure(int[] flags) throws RESyntaxException
976     {
977         // Before terminal
978
int idxBeforeTerminal = idx;
979
980         // Values to pass by reference to terminal()
981
int[] terminalFlags = { NODE_NORMAL };
982
983         // Get terminal symbol
984
int ret = terminal(terminalFlags);
985
986         // Or in flags from terminal symbol
987
flags[0] |= terminalFlags[0];
988
989         // Advance input, set NODE_NULLABLE flag and do sanity checks
990
if (idx >= len)
991         {
992             return ret;
993         }
994         boolean greedy = true;
995         char closureType = pattern.charAt(idx);
996         switch (closureType)
997         {
998             case '?':
999             case '*':
1000
1001                // The current node can be null
1002
flags[0] |= NODE_NULLABLE;
1003
1004            case '+':
1005
1006                // Eat closure character
1007
idx++;
1008
1009            case '{':
1010
1011                // Don't allow blantant stupidity
1012
int opcode = instruction[ret + RE.offsetOpcode];
1013                if (opcode == RE.OP_BOL || opcode == RE.OP_EOL)
1014                {
1015                    syntaxError("Bad closure operand");
1016                }
1017                if ((terminalFlags[0] & NODE_NULLABLE) != 0)
1018                {
1019                    syntaxError("Closure operand can't be nullable");
1020                }
1021                break;
1022        }
1023
1024        // If the next character is a '?', make the closure non-greedy (reluctant)
1025
if (idx < len && pattern.charAt(idx) == '?')
1026        {
1027            idx++;
1028            greedy = false;
1029        }
1030
1031        if (greedy)
1032        {
1033            // Actually do the closure now
1034
switch (closureType)
1035            {
1036                case '{':
1037                {
1038                    // We look for our bracket in the list
1039
boolean found = false;
1040                    int i;
1041                    allocBrackets();
1042                    for (i = 0; i < brackets; i++)
1043                    {
1044                        if (bracketStart[i] == idx)
1045                        {
1046                            found = true;
1047                            break;
1048                        }
1049                    }
1050
1051                    // If its not in the list we parse the {m,n}
1052
if (!found)
1053                    {
1054                        if (brackets >= maxBrackets)
1055                        {
1056                            reallocBrackets();
1057                        }
1058                        bracketStart[brackets] = idx;
1059                        bracket();
1060                        bracketEnd[brackets] = idx;
1061                        i = brackets++;
1062                    }
1063
1064                    // Process min first
1065
if (bracketMin[i]-- > 0)
1066                    {
1067                        if (bracketMin[i] > 0 || bracketOpt[i] != 0) {
1068                            // Rewind stream and run it through again - more matchers coming
1069
idx = idxBeforeTerminal;
1070                        } else {
1071                            // Bug #1030: No optinal matches - no need to rewind
1072
idx = bracketEnd[i];
1073                        }
1074                        break;
1075                    }
1076
1077                    // Do the right thing for maximum ({m,})
1078
if (bracketOpt[i] == bracketUnbounded)
1079                    {
1080                        // Drop through now and closure expression.
1081
// We are done with the {m,} expr, so skip rest
1082
closureType = '*';
1083                        bracketOpt[i] = 0;
1084                        idx = bracketEnd[i];
1085                    }
1086                    else
1087                        if (bracketOpt[i]-- > 0)
1088                        {
1089                            if (bracketOpt[i] > 0)
1090                            {
1091                                // More optional matchers - 'play it again sam!'
1092
idx = idxBeforeTerminal;
1093                            } else {
1094                                // Bug #1030: We are done - this one is last and optional
1095
idx = bracketEnd[i];
1096                            }
1097                            // Drop through to optionally close
1098
closureType = '?';
1099                        }
1100                        else
1101                        {
1102                            // Rollback terminal - neither min nor opt matchers present
1103
lenInstruction = ret;
1104                            node(RE.OP_NOTHING, 0);
1105
1106                            // We are done. skip the rest of {m,n} expr
1107
idx = bracketEnd[i];
1108                            break;
1109                        }
1110                }
1111
1112                // Fall through!
1113

1114                case '?':
1115                case '*':
1116
1117                    if (!greedy)
1118                    {
1119                        break;
1120                    }
1121
1122                    if (closureType == '?')
1123                    {
1124                        // X? is compiled as (X|)
1125
nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X
1126
setNextOfEnd(ret, node (RE.OP_BRANCH, 0)); // inserted branch to option
1127
int nothing = node (RE.OP_NOTHING, 0); // which is OP_NOTHING
1128
setNextOfEnd(ret, nothing); // point (second) branch to OP_NOTHING
1129
setNextOfEnd(ret + RE.nodeSize, nothing); // point the end of X to OP_NOTHING node
1130
}
1131
1132                    if (closureType == '*')
1133                    {
1134                        // X* is compiled as (X{gotoX}|)
1135
nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X
1136
setNextOfEnd(ret + RE.nodeSize, node(RE.OP_BRANCH, 0)); // end of X points to an option
1137
setNextOfEnd(ret + RE.nodeSize, node(RE.OP_GOTO, 0)); // to goto
1138
setNextOfEnd(ret + RE.nodeSize, ret); // the start again
1139
setNextOfEnd(ret, node(RE.OP_BRANCH, 0)); // the other option is
1140
setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // OP_NOTHING
1141
}
1142                    break;
1143
1144                case '+':
1145                {
1146                    // X+ is compiled as X({gotoX}|)
1147
int branch;
1148                    branch = node(RE.OP_BRANCH, 0); // a new branch
1149
setNextOfEnd(ret, branch); // is added to the end of X
1150
setNextOfEnd(node(RE.OP_GOTO, 0), ret); // one option is to go back to the start
1151
setNextOfEnd(branch, node(RE.OP_BRANCH, 0)); // the other option
1152
setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // is OP_NOTHING
1153
}
1154                break;
1155            }
1156        }
1157        else
1158        {
1159            // Add end after closured subexpr
1160
setNextOfEnd(ret, node(RE.OP_END, 0));
1161
1162            // Actually do the closure now
1163
switch (closureType)
1164            {
1165                case '?':
1166                    nodeInsert(RE.OP_RELUCTANTMAYBE, 0, ret);
1167                    break;
1168
1169                case '*':
1170                    nodeInsert(RE.OP_RELUCTANTSTAR, 0, ret);
1171                    break;
1172
1173                case '+':
1174                    nodeInsert(RE.OP_RELUCTANTPLUS, 0, ret);
1175                    break;
1176            }
1177
1178            // Point to the expr after the closure
1179
setNextOfEnd(ret, lenInstruction);
1180        }
1181        return ret;
1182    }
1183
1184    /**
1185     * Compile one branch of an or operator (implements concatenation)
1186     * @param flags Flags passed by reference
1187     * @return Pointer to branch node
1188     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
1189     */

1190    int branch(int[] flags) throws RESyntaxException
1191    {
1192        // Get each possibly closured piece and concat
1193
int node;
1194        int ret = node(RE.OP_BRANCH, 0);
1195        int chain = -1;
1196        int[] closureFlags = new int[1];
1197        boolean nullable = true;
1198        while (idx < len && pattern.charAt(idx) != '|' && pattern.charAt(idx) != ')')
1199        {
1200            // Get new node
1201
closureFlags[0] = NODE_NORMAL;
1202            node = closure(closureFlags);
1203            if (closureFlags[0] == NODE_NORMAL)
1204            {
1205                nullable = false;
1206            }
1207
1208            // If there's a chain, append to the end
1209
if (chain != -1)
1210            {
1211                setNextOfEnd(chain, node);
1212            }
1213
1214            // Chain starts at current
1215
chain = node;
1216        }
1217
1218        // If we don't run loop, make a nothing node
1219
if (chain == -1)
1220        {
1221            node(RE.OP_NOTHING, 0);
1222        }
1223
1224        // Set nullable flag for this branch
1225
if (nullable)
1226        {
1227            flags[0] |= NODE_NULLABLE;
1228        }
1229        return ret;
1230    }
1231
1232    /**
1233     * Compile an expression with possible parens around it. Paren matching
1234     * is done at this level so we can tie the branch tails together.
1235     * @param flags Flag value passed by reference
1236     * @return Node index of expression in instruction array
1237     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
1238     */

1239    int expr(int[] flags) throws RESyntaxException
1240    {
1241        // Create open paren node unless we were called from the top level (which has no parens)
1242
int paren = -1;
1243        int ret = -1;
1244        int closeParens = parens;
1245        if ((flags[0] & NODE_TOPLEVEL) == 0 && pattern.charAt(idx) == '(')
1246        {
1247            // if its a cluster ( rather than a proper subexpression ie with backrefs )
1248
if ( idx + 2 < len && pattern.charAt( idx + 1 ) == '?' && pattern.charAt( idx + 2 ) == ':' )
1249            {
1250                paren = 2;
1251                idx += 3;
1252                ret = node( RE.OP_OPEN_CLUSTER, 0 );
1253            }
1254            else
1255            {
1256                paren = 1;
1257                idx++;
1258                ret = node(RE.OP_OPEN, parens++);
1259            }
1260        }
1261        flags[0] &= ~NODE_TOPLEVEL;
1262
1263        // Create a branch node
1264
int branch = branch(flags);
1265        if (ret == -1)
1266        {
1267            ret = branch;
1268        }
1269        else
1270        {
1271            setNextOfEnd(ret, branch);
1272        }
1273
1274        // Loop through branches
1275
while (idx < len && pattern.charAt(idx) == '|')
1276        {
1277            idx++;
1278            branch = branch(flags);
1279            setNextOfEnd(ret, branch);
1280        }
1281
1282        // Create an ending node (either a close paren or an OP_END)
1283
int end;
1284        if ( paren > 0 )
1285        {
1286            if (idx < len && pattern.charAt(idx) == ')')
1287            {
1288                idx++;
1289            }
1290            else
1291            {
1292                syntaxError("Missing close paren");
1293            }
1294            if ( paren == 1 )
1295            {
1296                end = node(RE.OP_CLOSE, closeParens);
1297            }
1298            else
1299            {
1300                end = node( RE.OP_CLOSE_CLUSTER, 0 );
1301            }
1302        }
1303        else
1304        {
1305            end = node(RE.OP_END, 0);
1306        }
1307
1308        // Append the ending node to the ret nodelist
1309
setNextOfEnd(ret, end);
1310
1311        // Hook the ends of each branch to the end node
1312
int currentNode = ret;
1313        int nextNodeOffset = instruction[ currentNode + RE.offsetNext ];
1314        // while the next node o
1315
while ( nextNodeOffset != 0 && currentNode < lenInstruction )
1316        {
1317            // If branch, make the end of the branch's operand chain point to the end node.
1318
if ( instruction[ currentNode + RE.offsetOpcode ] == RE.OP_BRANCH )
1319            {
1320                setNextOfEnd( currentNode + RE.nodeSize, end );
1321            }
1322            nextNodeOffset = instruction[ currentNode + RE.offsetNext ];
1323            currentNode += nextNodeOffset;
1324        }
1325
1326        // Return the node list
1327
return ret;
1328    }
1329
1330    /**
1331     * Compiles a regular expression pattern into a program runnable by the pattern
1332     * matcher class 'RE'.
1333     * @param pattern Regular expression pattern to compile (see RECompiler class
1334     * for details).
1335     * @return A compiled regular expression program.
1336     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
1337     * @see RECompiler
1338     * @see RE
1339     */

1340    public REProgram compile(String JavaDoc pattern) throws RESyntaxException
1341    {
1342        // Initialize variables for compilation
1343
this.pattern = pattern; // Save pattern in instance variable
1344
len = pattern.length(); // Precompute pattern length for speed
1345
idx = 0; // Set parsing index to the first character
1346
lenInstruction = 0; // Set emitted instruction count to zero
1347
parens = 1; // Set paren level to 1 (the implicit outer parens)
1348
brackets = 0; // No bracketed closures yet
1349

1350        // Initialize pass by reference flags value
1351
int[] flags = { NODE_TOPLEVEL };
1352
1353        // Parse expression
1354
expr(flags);
1355
1356        // Should be at end of input
1357
if (idx != len)
1358        {
1359            if (pattern.charAt(idx) == ')')
1360            {
1361                syntaxError("Unmatched close paren");
1362            }
1363            syntaxError("Unexpected input remains");
1364        }
1365
1366        // Return the result
1367
char[] ins = new char[lenInstruction];
1368        System.arraycopy(instruction, 0, ins, 0, lenInstruction);
1369        return new REProgram(parens, ins);
1370    }
1371
1372    /**
1373     * Local, nested class for maintaining character ranges for character classes.
1374     */

1375    class RERange
1376    {
1377        int size = 16; // Capacity of current range arrays
1378
int[] minRange = new int[size]; // Range minima
1379
int[] maxRange = new int[size]; // Range maxima
1380
int num = 0; // Number of range array elements in use
1381

1382        /**
1383         * Deletes the range at a given index from the range lists
1384         * @param index Index of range to delete from minRange and maxRange arrays.
1385         */

1386        void delete(int index)
1387        {
1388            // Return if no elements left or index is out of range
1389
if (num == 0 || index >= num)
1390            {
1391                return;
1392            }
1393
1394            // Move elements down
1395
while (++index < num)
1396            {
1397                if (index - 1 >= 0)
1398                {
1399                    minRange[index-1] = minRange[index];
1400                    maxRange[index-1] = maxRange[index];
1401                }
1402            }
1403
1404            // One less element now
1405
num--;
1406        }
1407
1408        /**
1409         * Merges a range into the range list, coalescing ranges if possible.
1410         * @param min Minimum end of range
1411         * @param max Maximum end of range
1412         */

1413        void merge(int min, int max)
1414        {
1415            // Loop through ranges
1416
for (int i = 0; i < num; i++)
1417            {
1418                // Min-max is subsumed by minRange[i]-maxRange[i]
1419
if (min >= minRange[i] && max <= maxRange[i])
1420                {
1421                    return;
1422                }
1423
1424                // Min-max subsumes minRange[i]-maxRange[i]
1425
else if (min <= minRange[i] && max >= maxRange[i])
1426                {
1427                    delete(i);
1428                    merge(min, max);
1429                    return;
1430                }
1431
1432                // Min is in the range, but max is outside
1433
else if (min >= minRange[i] && min <= maxRange[i])
1434                {
1435                    delete(i);
1436                    min = minRange[i];
1437                    merge(min, max);
1438                    return;
1439                }
1440
1441                // Max is in the range, but min is outside
1442
else if (max >= minRange[i] && max <= maxRange[i])
1443                {
1444                    delete(i);
1445                    max = maxRange[i];
1446                    merge(min, max);
1447                    return;
1448                }
1449            }
1450
1451            // Must not overlap any other ranges
1452
if (num >= size)
1453            {
1454                size *= 2;
1455                int[] newMin = new int[size];
1456                int[] newMax = new int[size];
1457                System.arraycopy(minRange, 0, newMin, 0, num);
1458                System.arraycopy(maxRange, 0, newMax, 0, num);
1459                minRange = newMin;
1460                maxRange = newMax;
1461            }
1462            minRange[num] = min;
1463            maxRange[num] = max;
1464            num++;
1465        }
1466
1467        /**
1468         * Removes a range by deleting or shrinking all other ranges
1469         * @param min Minimum end of range
1470         * @param max Maximum end of range
1471         */

1472        void remove(int min, int max)
1473        {
1474            // Loop through ranges
1475
for (int i = 0; i < num; i++)
1476            {
1477                // minRange[i]-maxRange[i] is subsumed by min-max
1478
if (minRange[i] >= min && maxRange[i] <= max)
1479                {
1480                    delete(i);
1481                    i--;
1482                    return;
1483                }
1484
1485                // min-max is subsumed by minRange[i]-maxRange[i]
1486
else if (min >= minRange[i] && max <= maxRange[i])
1487                {
1488                    int minr = minRange[i];
1489                    int maxr = maxRange[i];
1490                    delete(i);
1491                    if (minr < min)
1492                    {
1493                        merge(minr, min - 1);
1494                    }
1495                    if (max < maxr)
1496                    {
1497                        merge(max + 1, maxr);
1498                    }
1499                    return;
1500                }
1501
1502                // minRange is in the range, but maxRange is outside
1503
else if (minRange[i] >= min && minRange[i] <= max)
1504                {
1505                    minRange[i] = max + 1;
1506                    return;
1507                }
1508
1509                // maxRange is in the range, but minRange is outside
1510
else if (maxRange[i] >= min && maxRange[i] <= max)
1511                {
1512                    maxRange[i] = min - 1;
1513                    return;
1514                }
1515            }
1516        }
1517
1518        /**
1519         * Includes (or excludes) the range from min to max, inclusive.
1520         * @param min Minimum end of range
1521         * @param max Maximum end of range
1522         * @param include True if range should be included. False otherwise.
1523         */

1524        void include(int min, int max, boolean include)
1525        {
1526            if (include)
1527            {
1528                merge(min, max);
1529            }
1530            else
1531            {
1532                remove(min, max);
1533            }
1534        }
1535
1536        /**
1537         * Includes a range with the same min and max
1538         * @param minmax Minimum and maximum end of range (inclusive)
1539         * @param include True if range should be included. False otherwise.
1540         */

1541        void include(char minmax, boolean include)
1542        {
1543            include(minmax, minmax, include);
1544        }
1545    }
1546}
1547
Popular Tags