KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > regexp > RECompiler


1 package org.apache.regexp;
2
3 /*
4  * ====================================================================
5  *
6  * The Apache Software License, Version 1.1
7  *
8  * Copyright (c) 1999-2003 The Apache Software Foundation. All rights
9  * reserved.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  *
15  * 1. Redistributions of source code must retain the above copyright
16  * notice, this list of conditions and the following disclaimer.
17  *
18  * 2. Redistributions in binary form must reproduce the above copyright
19  * notice, this list of conditions and the following disclaimer in
20  * the documentation and/or other materials provided with the
21  * distribution.
22  *
23  * 3. The end-user documentation included with the redistribution, if
24  * any, must include the following acknowlegement:
25  * "This product includes software developed by the
26  * Apache Software Foundation (http://www.apache.org/)."
27  * Alternately, this acknowlegement may appear in the software itself,
28  * if and wherever such third-party acknowlegements normally appear.
29  *
30  * 4. The names "The Jakarta Project", "Jakarta-Regexp", and "Apache Software
31  * Foundation" must not be used to endorse or promote products derived
32  * from this software without prior written permission. For written
33  * permission, please contact apache@apache.org.
34  *
35  * 5. Products derived from this software may not be called "Apache"
36  * nor may "Apache" appear in their names without prior written
37  * permission of the Apache Group.
38  *
39  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
40  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
41  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
42  * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
43  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
45  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
46  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
47  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
48  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
49  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50  * SUCH DAMAGE.
51  * ====================================================================
52  *
53  * This software consists of voluntary contributions made by many
54  * individuals on behalf of the Apache Software Foundation. For more
55  * information on the Apache Software Foundation, please see
56  * <http://www.apache.org/>.
57  *
58  */

59
60 import org.apache.regexp.RE;
61 import java.util.Hashtable JavaDoc;
62
63 /**
64  * A regular expression compiler class. This class compiles a pattern string into a
65  * regular expression program interpretable by the RE evaluator class. The 'recompile'
66  * command line tool uses this compiler to pre-compile regular expressions for use
67  * with RE. For a description of the syntax accepted by RECompiler and what you can
68  * do with regular expressions, see the documentation for the RE matcher class.
69  *
70  * @see RE
71  * @see recompile
72  *
73  * @author <a HREF="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
74  * @author <a HREF="mailto:gholam@xtra.co.nz">Michael McCallum</a>
75  * @version $Id: RECompiler.java,v 1.11 2003/09/01 18:31:53 vgritsenko Exp $
76  */

77 public class RECompiler
78 {
79     // The compiled program
80
char[] instruction; // The compiled RE 'program' instruction buffer
81
int lenInstruction; // The amount of the program buffer currently in use
82

83     // Input state for compiling regular expression
84
String JavaDoc pattern; // Input string
85
int len; // Length of the pattern string
86
int idx; // Current input index into ac
87
int parens; // Total number of paren pairs
88

89     // Node flags
90
static final int NODE_NORMAL = 0; // No flags (nothing special)
91
static final int NODE_NULLABLE = 1; // True if node is potentially null
92
static final int NODE_TOPLEVEL = 2; // True if top level expr
93

94     // Special types of 'escapes'
95
static final char ESC_MASK = 0xfff0; // Escape complexity mask
96
static final char ESC_BACKREF = 0xffff; // Escape is really a backreference
97
static final char ESC_COMPLEX = 0xfffe; // Escape isn't really a true character
98
static final char ESC_CLASS = 0xfffd; // Escape represents a whole class of characters
99

100     // {m,n} stacks
101
int maxBrackets = 10; // Maximum number of bracket pairs
102
static final int bracketUnbounded = -1; // Unbounded value
103
int brackets = 0; // Number of bracket sets
104
int[] bracketStart = null; // Starting point
105
int[] bracketEnd = null; // Ending point
106
int[] bracketMin = null; // Minimum number of matches
107
int[] bracketOpt = null; // Additional optional matches
108

109     // Lookup table for POSIX character class names
110
static Hashtable JavaDoc hashPOSIX = new Hashtable JavaDoc();
111     static
112     {
113         hashPOSIX.put("alnum", new Character JavaDoc(RE.POSIX_CLASS_ALNUM));
114         hashPOSIX.put("alpha", new Character JavaDoc(RE.POSIX_CLASS_ALPHA));
115         hashPOSIX.put("blank", new Character JavaDoc(RE.POSIX_CLASS_BLANK));
116         hashPOSIX.put("cntrl", new Character JavaDoc(RE.POSIX_CLASS_CNTRL));
117         hashPOSIX.put("digit", new Character JavaDoc(RE.POSIX_CLASS_DIGIT));
118         hashPOSIX.put("graph", new Character JavaDoc(RE.POSIX_CLASS_GRAPH));
119         hashPOSIX.put("lower", new Character JavaDoc(RE.POSIX_CLASS_LOWER));
120         hashPOSIX.put("print", new Character JavaDoc(RE.POSIX_CLASS_PRINT));
121         hashPOSIX.put("punct", new Character JavaDoc(RE.POSIX_CLASS_PUNCT));
122         hashPOSIX.put("space", new Character JavaDoc(RE.POSIX_CLASS_SPACE));
123         hashPOSIX.put("upper", new Character JavaDoc(RE.POSIX_CLASS_UPPER));
124         hashPOSIX.put("xdigit", new Character JavaDoc(RE.POSIX_CLASS_XDIGIT));
125         hashPOSIX.put("javastart", new Character JavaDoc(RE.POSIX_CLASS_JSTART));
126         hashPOSIX.put("javapart", new Character JavaDoc(RE.POSIX_CLASS_JPART));
127     }
128
129     /**
130      * Constructor. Creates (initially empty) storage for a regular expression program.
131      */

132     public RECompiler()
133     {
134         // Start off with a generous, yet reasonable, initial size
135
instruction = new char[128];
136         lenInstruction = 0;
137     }
138
139     /**
140      * Ensures that n more characters can fit in the program buffer.
141      * If n more can't fit, then the size is doubled until it can.
142      * @param n Number of additional characters to ensure will fit.
143      */

144     void ensure(int n)
145     {
146         // Get current program length
147
int curlen = instruction.length;
148
149         // If the current length + n more is too much
150
if (lenInstruction + n >= curlen)
151         {
152             // Double the size of the program array until n more will fit
153
while (lenInstruction + n >= curlen)
154             {
155                 curlen *= 2;
156             }
157
158             // Allocate new program array and move data into it
159
char[] newInstruction = new char[curlen];
160             System.arraycopy(instruction, 0, newInstruction, 0, lenInstruction);
161             instruction = newInstruction;
162         }
163     }
164
165     /**
166      * Emit a single character into the program stream.
167      * @param c Character to add
168      */

169     void emit(char c)
170     {
171         // Make room for character
172
ensure(1);
173
174         // Add character
175
instruction[lenInstruction++] = c;
176     }
177
178     /**
179      * Inserts a node with a given opcode and opdata at insertAt. The node relative next
180      * pointer is initialized to 0.
181      * @param opcode Opcode for new node
182      * @param opdata Opdata for new node (only the low 16 bits are currently used)
183      * @param insertAt Index at which to insert the new node in the program
184      */

185     void nodeInsert(char opcode, int opdata, int insertAt)
186     {
187         // Make room for a new node
188
ensure(RE.nodeSize);
189
190         // Move everything from insertAt to the end down nodeSize elements
191
System.arraycopy(instruction, insertAt, instruction, insertAt + RE.nodeSize, lenInstruction - insertAt);
192         instruction[insertAt + RE.offsetOpcode] = opcode;
193         instruction[insertAt + RE.offsetOpdata] = (char)opdata;
194         instruction[insertAt + RE.offsetNext] = 0;
195         lenInstruction += RE.nodeSize;
196     }
197
198     /**
199      * Appends a node to the end of a node chain
200      * @param node Start of node chain to traverse
201      * @param pointTo Node to have the tail of the chain point to
202      */

203     void setNextOfEnd(int node, int pointTo)
204     {
205         // Traverse the chain until the next offset is 0
206
int next = instruction[node + RE.offsetNext];
207         // while the 'node' is not the last in the chain
208
// and the 'node' is not the last in the program.
209
while ( next != 0 && node < lenInstruction )
210         {
211             // if the node we are supposed to point to is in the chain then
212
// point to the end of the program instead.
213
// Michael McCallum <gholam@xtra.co.nz>
214
// FIXME: // This is a _hack_ to stop infinite programs.
215
// I believe that the implementation of the reluctant matches is wrong but
216
// have not worked out a better way yet.
217
if ( node == pointTo ) {
218               pointTo = lenInstruction;
219             }
220             node += next;
221             next = instruction[node + RE.offsetNext];
222         }
223         // if we have reached the end of the program then dont set the pointTo.
224
// im not sure if this will break any thing but passes all the tests.
225
if ( node < lenInstruction ) {
226             // Point the last node in the chain to pointTo.
227
instruction[node + RE.offsetNext] = (char)(short)(pointTo - node);
228         }
229     }
230
231     /**
232      * Adds a new node
233      * @param opcode Opcode for node
234      * @param opdata Opdata for node (only the low 16 bits are currently used)
235      * @return Index of new node in program
236      */

237     int node(char opcode, int opdata)
238     {
239         // Make room for a new node
240
ensure(RE.nodeSize);
241
242         // Add new node at end
243
instruction[lenInstruction + RE.offsetOpcode] = opcode;
244         instruction[lenInstruction + RE.offsetOpdata] = (char)opdata;
245         instruction[lenInstruction + RE.offsetNext] = 0;
246         lenInstruction += RE.nodeSize;
247
248         // Return index of new node
249
return lenInstruction - RE.nodeSize;
250     }
251
252
253     /**
254      * Throws a new internal error exception
255      * @exception Error Thrown in the event of an internal error.
256      */

257     void internalError() throws Error JavaDoc
258     {
259         throw new Error JavaDoc("Internal error!");
260     }
261
262     /**
263      * Throws a new syntax error exception
264      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
265      */

266     void syntaxError(String JavaDoc s) throws RESyntaxException
267     {
268         throw new RESyntaxException(s);
269     }
270
271     /**
272      * Allocate storage for brackets only as needed
273      */

274     void allocBrackets()
275     {
276         // Allocate bracket stacks if not already done
277
if (bracketStart == null)
278         {
279             // Allocate storage
280
bracketStart = new int[maxBrackets];
281             bracketEnd = new int[maxBrackets];
282             bracketMin = new int[maxBrackets];
283             bracketOpt = new int[maxBrackets];
284
285             // Initialize to invalid values
286
for (int i = 0; i < maxBrackets; i++)
287             {
288                 bracketStart[i] = bracketEnd[i] = bracketMin[i] = bracketOpt[i] = -1;
289             }
290         }
291     }
292
293     /** Enlarge storage for brackets only as needed. */
294     synchronized void reallocBrackets() {
295         // trick the tricky
296
if (bracketStart == null) {
297             allocBrackets();
298         }
299
300         int new_size = maxBrackets * 2;
301         int[] new_bS = new int[new_size];
302         int[] new_bE = new int[new_size];
303         int[] new_bM = new int[new_size];
304         int[] new_bO = new int[new_size];
305         // Initialize to invalid values
306
for (int i=brackets; i<new_size; i++) {
307             new_bS[i] = new_bE[i] = new_bM[i] = new_bO[i] = -1;
308         }
309         System.arraycopy(bracketStart,0, new_bS,0, brackets);
310         System.arraycopy(bracketEnd,0, new_bE,0, brackets);
311         System.arraycopy(bracketMin,0, new_bM,0, brackets);
312         System.arraycopy(bracketOpt,0, new_bO,0, brackets);
313         bracketStart = new_bS;
314         bracketEnd = new_bE;
315         bracketMin = new_bM;
316         bracketOpt = new_bO;
317         maxBrackets = new_size;
318     }
319
320     /**
321      * Match bracket {m,n} expression put results in bracket member variables
322      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
323      */

324     void bracket() throws RESyntaxException
325     {
326         // Current character must be a '{'
327
if (idx >= len || pattern.charAt(idx++) != '{')
328         {
329             internalError();
330         }
331
332         // Next char must be a digit
333
if (idx >= len || !Character.isDigit(pattern.charAt(idx)))
334         {
335             syntaxError("Expected digit");
336         }
337
338         // Get min ('m' of {m,n}) number
339
StringBuffer JavaDoc number = new StringBuffer JavaDoc();
340         while (idx < len && Character.isDigit(pattern.charAt(idx)))
341         {
342             number.append(pattern.charAt(idx++));
343         }
344         try
345         {
346             bracketMin[brackets] = Integer.parseInt(number.toString());
347         }
348         catch (NumberFormatException JavaDoc e)
349         {
350             syntaxError("Expected valid number");
351         }
352
353         // If out of input, fail
354
if (idx >= len)
355         {
356             syntaxError("Expected comma or right bracket");
357         }
358
359         // If end of expr, optional limit is 0
360
if (pattern.charAt(idx) == '}')
361         {
362             idx++;
363             bracketOpt[brackets] = 0;
364             return;
365         }
366
367         // Must have at least {m,} and maybe {m,n}.
368
if (idx >= len || pattern.charAt(idx++) != ',')
369         {
370             syntaxError("Expected comma");
371         }
372
373         // If out of input, fail
374
if (idx >= len)
375         {
376             syntaxError("Expected comma or right bracket");
377         }
378
379         // If {m,} max is unlimited
380
if (pattern.charAt(idx) == '}')
381         {
382             idx++;
383             bracketOpt[brackets] = bracketUnbounded;
384             return;
385         }
386
387         // Next char must be a digit
388
if (idx >= len || !Character.isDigit(pattern.charAt(idx)))
389         {
390             syntaxError("Expected digit");
391         }
392
393         // Get max number
394
number.setLength(0);
395         while (idx < len && Character.isDigit(pattern.charAt(idx)))
396         {
397             number.append(pattern.charAt(idx++));
398         }
399         try
400         {
401             bracketOpt[brackets] = Integer.parseInt(number.toString()) - bracketMin[brackets];
402         }
403         catch (NumberFormatException JavaDoc e)
404         {
405             syntaxError("Expected valid number");
406         }
407
408         // Optional repetitions must be >= 0
409
if (bracketOpt[brackets] < 0)
410         {
411             syntaxError("Bad range");
412         }
413
414         // Must have close brace
415
if (idx >= len || pattern.charAt(idx++) != '}')
416         {
417             syntaxError("Missing close brace");
418         }
419     }
420
421     /**
422      * Match an escape sequence. Handles quoted chars and octal escapes as well
423      * as normal escape characters. Always advances the input stream by the
424      * right amount. This code "understands" the subtle difference between an
425      * octal escape and a backref. You can access the type of ESC_CLASS or
426      * ESC_COMPLEX or ESC_BACKREF by looking at pattern[idx - 1].
427      * @return ESC_* code or character if simple escape
428      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
429      */

430     char escape() throws RESyntaxException
431     {
432         // "Shouldn't" happen
433
if (pattern.charAt(idx) != '\\')
434         {
435             internalError();
436         }
437
438         // Escape shouldn't occur as last character in string!
439
if (idx + 1 == len)
440         {
441             syntaxError("Escape terminates string");
442         }
443
444         // Switch on character after backslash
445
idx += 2;
446         char escapeChar = pattern.charAt(idx - 1);
447         switch (escapeChar)
448         {
449             case RE.E_BOUND:
450             case RE.E_NBOUND:
451                 return ESC_COMPLEX;
452
453             case RE.E_ALNUM:
454             case RE.E_NALNUM:
455             case RE.E_SPACE:
456             case RE.E_NSPACE:
457             case RE.E_DIGIT:
458             case RE.E_NDIGIT:
459                 return ESC_CLASS;
460
461             case 'u':
462             case 'x':
463                 {
464                     // Exact required hex digits for escape type
465
int hexDigits = (escapeChar == 'u' ? 4 : 2);
466
467                     // Parse up to hexDigits characters from input
468
int val = 0;
469                     for ( ; idx < len && hexDigits-- > 0; idx++)
470                     {
471                         // Get char
472
char c = pattern.charAt(idx);
473
474                         // If it's a hexadecimal digit (0-9)
475
if (c >= '0' && c <= '9')
476                         {
477                             // Compute new value
478
val = (val << 4) + c - '0';
479                         }
480                         else
481                         {
482                             // If it's a hexadecimal letter (a-f)
483
c = Character.toLowerCase(c);
484                             if (c >= 'a' && c <= 'f')
485                             {
486                                 // Compute new value
487
val = (val << 4) + (c - 'a') + 10;
488                             }
489                             else
490                             {
491                                 // If it's not a valid digit or hex letter, the escape must be invalid
492
// because hexDigits of input have not been absorbed yet.
493
syntaxError("Expected " + hexDigits + " hexadecimal digits after \\" + escapeChar);
494                             }
495                         }
496                     }
497                     return (char)val;
498                 }
499
500             case 't':
501                 return '\t';
502
503             case 'n':
504                 return '\n';
505
506             case 'r':
507                 return '\r';
508
509             case 'f':
510                 return '\f';
511
512             case '0':
513             case '1':
514             case '2':
515             case '3':
516             case '4':
517             case '5':
518             case '6':
519             case '7':
520             case '8':
521             case '9':
522
523                 // An octal escape starts with a 0 or has two digits in a row
524
if ((idx < len && Character.isDigit(pattern.charAt(idx))) || escapeChar == '0')
525                 {
526                     // Handle \nnn octal escapes
527
int val = escapeChar - '0';
528                     if (idx < len && Character.isDigit(pattern.charAt(idx)))
529                     {
530                         val = ((val << 3) + (pattern.charAt(idx++) - '0'));
531                         if (idx < len && Character.isDigit(pattern.charAt(idx)))
532                         {
533                             val = ((val << 3) + (pattern.charAt(idx++) - '0'));
534                         }
535                     }
536                     return (char)val;
537                 }
538
539                 // It's actually a backreference (\[1-9]), not an escape
540
return ESC_BACKREF;
541
542             default:
543
544                 // Simple quoting of a character
545
return escapeChar;
546         }
547     }
548
549     /**
550      * Compile a character class
551      * @return Index of class node
552      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
553      */

554     int characterClass() throws RESyntaxException
555     {
556         // Check for bad calling or empty class
557
if (pattern.charAt(idx) != '[')
558         {
559             internalError();
560         }
561
562         // Check for unterminated or empty class
563
if ((idx + 1) >= len || pattern.charAt(++idx) == ']')
564         {
565             syntaxError("Empty or unterminated class");
566         }
567
568         // Check for POSIX character class
569
if (idx < len && pattern.charAt(idx) == ':')
570         {
571             // Skip colon
572
idx++;
573
574             // POSIX character classes are denoted with lowercase ASCII strings
575
int idxStart = idx;
576             while (idx < len && pattern.charAt(idx) >= 'a' && pattern.charAt(idx) <= 'z')
577             {
578                 idx++;
579             }
580
581             // Should be a ":]" to terminate the POSIX character class
582
if ((idx + 1) < len && pattern.charAt(idx) == ':' && pattern.charAt(idx + 1) == ']')
583             {
584                 // Get character class
585
String JavaDoc charClass = pattern.substring(idxStart, idx);
586
587                 // Select the POSIX class id
588
Character JavaDoc i = (Character JavaDoc)hashPOSIX.get(charClass);
589                 if (i != null)
590                 {
591                     // Move past colon and right bracket
592
idx += 2;
593
594                     // Return new POSIX character class node
595
return node(RE.OP_POSIXCLASS, i.charValue());
596                 }
597                 syntaxError("Invalid POSIX character class '" + charClass + "'");
598             }
599             syntaxError("Invalid POSIX character class syntax");
600         }
601
602         // Try to build a class. Create OP_ANYOF node
603
int ret = node(RE.OP_ANYOF, 0);
604
605         // Parse class declaration
606
char CHAR_INVALID = Character.MAX_VALUE;
607         char last = CHAR_INVALID;
608         char simpleChar = 0;
609         boolean include = true;
610         boolean definingRange = false;
611         int idxFirst = idx;
612         char rangeStart = Character.MIN_VALUE;
613         char rangeEnd;
614         RERange range = new RERange();
615         while (idx < len && pattern.charAt(idx) != ']')
616         {
617
618             switchOnCharacter:
619
620             // Switch on character
621
switch (pattern.charAt(idx))
622             {
623                 case '^':
624                     include = !include;
625                     if (idx == idxFirst)
626                     {
627                         range.include(Character.MIN_VALUE, Character.MAX_VALUE, true);
628                     }
629                     idx++;
630                     continue;
631
632                 case '\\':
633                 {
634                     // Escape always advances the stream
635
char c;
636                     switch (c = escape ())
637                     {
638                         case ESC_COMPLEX:
639                         case ESC_BACKREF:
640
641                             // Word boundaries and backrefs not allowed in a character class!
642
syntaxError("Bad character class");
643
644                         case ESC_CLASS:
645
646                             // Classes can't be an endpoint of a range
647
if (definingRange)
648                             {
649                                 syntaxError("Bad character class");
650                             }
651
652                             // Handle specific type of class (some are ok)
653
switch (pattern.charAt(idx - 1))
654                             {
655                                 case RE.E_NSPACE:
656                                 case RE.E_NDIGIT:
657                                 case RE.E_NALNUM:
658                                     syntaxError("Bad character class");
659
660                                 case RE.E_SPACE:
661                                     range.include('\t', include);
662                                     range.include('\r', include);
663                                     range.include('\f', include);
664                                     range.include('\n', include);
665                                     range.include('\b', include);
666                                     range.include(' ', include);
667                                     break;
668
669                                 case RE.E_ALNUM:
670                                     range.include('a', 'z', include);
671                                     range.include('A', 'Z', include);
672                                     range.include('_', include);
673
674                                     // Fall through!
675

676                                 case RE.E_DIGIT:
677                                     range.include('0', '9', include);
678                                     break;
679                             }
680
681                             // Make last char invalid (can't be a range start)
682
last = CHAR_INVALID;
683                             break;
684
685                         default:
686
687                             // Escape is simple so treat as a simple char
688
simpleChar = c;
689                             break switchOnCharacter;
690                     }
691                 }
692                 continue;
693
694                 case '-':
695
696                     // Start a range if one isn't already started
697
if (definingRange)
698                     {
699                         syntaxError("Bad class range");
700                     }
701                     definingRange = true;
702
703                     // If no last character, start of range is 0
704
rangeStart = (last == CHAR_INVALID ? 0 : last);
705
706                     // Premature end of range. define up to Character.MAX_VALUE
707
if ((idx + 1) < len && pattern.charAt(++idx) == ']')
708                     {
709                         simpleChar = Character.MAX_VALUE;
710                         break;
711                     }
712                     continue;
713
714                 default:
715                     simpleChar = pattern.charAt(idx++);
716                     break;
717             }
718
719             // Handle simple character simpleChar
720
if (definingRange)
721             {
722                 // if we are defining a range make it now
723
rangeEnd = simpleChar;
724
725                 // Actually create a range if the range is ok
726
if (rangeStart >= rangeEnd)
727                 {
728                     syntaxError("Bad character class");
729                 }
730                 range.include(rangeStart, rangeEnd, include);
731
732                 // We are done defining the range
733
last = CHAR_INVALID;
734                 definingRange = false;
735             }
736             else
737             {
738                 // If simple character and not start of range, include it
739
if (idx >= len || pattern.charAt(idx) != '-')
740                 {
741                     range.include(simpleChar, include);
742                 }
743                 last = simpleChar;
744             }
745         }
746
747         // Shouldn't be out of input
748
if (idx == len)
749         {
750             syntaxError("Unterminated character class");
751         }
752
753         // Absorb the ']' end of class marker
754
idx++;
755
756         // Emit character class definition
757
instruction[ret + RE.offsetOpdata] = (char)range.num;
758         for (int i = 0; i < range.num; i++)
759         {
760             emit((char)range.minRange[i]);
761             emit((char)range.maxRange[i]);
762         }
763         return ret;
764     }
765
766     /**
767      * Absorb an atomic character string. This method is a little tricky because
768      * it can un-include the last character of string if a closure operator follows.
769      * This is correct because *+? have higher precedence than concatentation (thus
770      * ABC* means AB(C*) and NOT (ABC)*).
771      * @return Index of new atom node
772      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
773      */

774     int atom() throws RESyntaxException
775     {
776         // Create a string node
777
int ret = node(RE.OP_ATOM, 0);
778
779         // Length of atom
780
int lenAtom = 0;
781
782         // Loop while we've got input
783

784         atomLoop:
785
786         while (idx < len)
787         {
788             // Is there a next char?
789
if ((idx + 1) < len)
790             {
791                 char c = pattern.charAt(idx + 1);
792
793                 // If the next 'char' is an escape, look past the whole escape
794
if (pattern.charAt(idx) == '\\')
795                 {
796                     int idxEscape = idx;
797                     escape();
798                     if (idx < len)
799                     {
800                         c = pattern.charAt(idx);
801                     }
802                     idx = idxEscape;
803                 }
804
805                 // Switch on next char
806
switch (c)
807                 {
808                     case '{':
809                     case '?':
810                     case '*':
811                     case '+':
812
813                         // If the next character is a closure operator and our atom is non-empty, the
814
// current character should bind to the closure operator rather than the atom
815
if (lenAtom != 0)
816                         {
817                             break atomLoop;
818                         }
819                 }
820             }
821
822             // Switch on current char
823
switch (pattern.charAt(idx))
824             {
825                 case ']':
826                 case '^':
827                 case '$':
828                 case '.':
829                 case '[':
830                 case '(':
831                 case ')':
832                 case '|':
833                     break atomLoop;
834
835                 case '{':
836                 case '?':
837                 case '*':
838                 case '+':
839
840                     // We should have an atom by now
841
if (lenAtom == 0)
842                     {
843                         // No atom before closure
844
syntaxError("Missing operand to closure");
845