KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > regexp > RE


1 package org.apache.regexp;
2
3 /*
4  * ====================================================================
5  *
6  * The Apache Software License, Version 1.1
7  *
8  * Copyright (c) 1999-2003 The Apache Software Foundation. All rights
9  * reserved.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  *
15  * 1. Redistributions of source code must retain the above copyright
16  * notice, this list of conditions and the following disclaimer.
17  *
18  * 2. Redistributions in binary form must reproduce the above copyright
19  * notice, this list of conditions and the following disclaimer in
20  * the documentation and/or other materials provided with the
21  * distribution.
22  *
23  * 3. The end-user documentation included with the redistribution, if
24  * any, must include the following acknowlegement:
25  * "This product includes software developed by the
26  * Apache Software Foundation (http://www.apache.org/)."
27  * Alternately, this acknowlegement may appear in the software itself,
28  * if and wherever such third-party acknowlegements normally appear.
29  *
30  * 4. The names "The Jakarta Project", "Jakarta-Regexp", and "Apache Software
31  * Foundation" must not be used to endorse or promote products derived
32  * from this software without prior written permission. For written
33  * permission, please contact apache@apache.org.
34  *
35  * 5. Products derived from this software may not be called "Apache"
36  * nor may "Apache" appear in their names without prior written
37  * permission of the Apache Group.
38  *
39  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
40  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
41  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
42  * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
43  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
45  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
46  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
47  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
48  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
49  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50  * SUCH DAMAGE.
51  * ====================================================================
52  *
53  * This software consists of voluntary contributions made by many
54  * individuals on behalf of the Apache Software Foundation. For more
55  * information on the Apache Software Foundation, please see
56  * <http://www.apache.org/>.
57  *
58  */

59  
60 import java.io.Serializable JavaDoc;
61 import java.util.Vector JavaDoc;
62
63 /**
64  * RE is an efficient, lightweight regular expression evaluator/matcher
65  * class. Regular expressions are pattern descriptions which enable
66  * sophisticated matching of strings. In addition to being able to
67  * match a string against a pattern, you can also extract parts of the
68  * match. This is especially useful in text parsing! Details on the
69  * syntax of regular expression patterns are given below.
70  *
71  * <p>
72  *
73  * To compile a regular expression (RE), you can simply construct an RE
74  * matcher object from the string specification of the pattern, like this:
75  *
76  * <pre>
77  *
78  * RE r = new RE("a*b");
79  *
80  * </pre>
81  *
82  * <p>
83  *
84  * Once you have done this, you can call either of the RE.match methods to
85  * perform matching on a String. For example:
86  *
87  * <pre>
88  *
89  * boolean matched = r.match("aaaab");
90  *
91  * </pre>
92  *
93  * will cause the boolean matched to be set to true because the
94  * pattern "a*b" matches the string "aaaab".
95  *
96  * <p>
97  * If you were interested in the <i>number</i> of a's which matched the
98  * first part of our example expression, you could change the expression to
99  * "(a*)b". Then when you compiled the expression and matched it against
100  * something like "xaaaab", you would get results like this:
101  *
102  * <pre>
103  *
104  * RE r = new RE("(a*)b"); // Compile expression
105  * boolean matched = r.match("xaaaab"); // Match against "xaaaab"
106  *
107  * <br>
108  *
109  * String wholeExpr = r.getParen(0); // wholeExpr will be 'aaaab'
110  * String insideParens = r.getParen(1); // insideParens will be 'aaaa'
111  *
112  * <br>
113  *
114  * int startWholeExpr = r.getParenStart(0); // startWholeExpr will be index 1
115  * int endWholeExpr = r.getParenEnd(0); // endWholeExpr will be index 6
116  * int lenWholeExpr = r.getParenLength(0); // lenWholeExpr will be 5
117  *
118  * <br>
119  *
120  * int startInside = r.getParenStart(1); // startInside will be index 1
121  * int endInside = r.getParenEnd(1); // endInside will be index 5
122  * int lenInside = r.getParenLength(1); // lenInside will be 4
123  *
124  * </pre>
125  *
126  * You can also refer to the contents of a parenthesized expression
127  * within a regular expression itself. This is called a
128  * 'backreference'. The first backreference in a regular expression is
129  * denoted by \1, the second by \2 and so on. So the expression:
130  *
131  * <pre>
132  *
133  * ([0-9]+)=\1
134  *
135  * </pre>
136  *
137  * will match any string of the form n=n (like 0=0 or 2=2).
138  *
139  * <p>
140  *
141  * The full regular expression syntax accepted by RE is described here:
142  *
143  * <pre>
144  *
145  * <br>
146  *
147  * <b><font face=times roman>Characters</font></b>
148  *
149  * <br>
150  *
151  * <i>unicodeChar</i> Matches any identical unicode character
152  * \ Used to quote a meta-character (like '*')
153  * \\ Matches a single '\' character
154  * \0nnn Matches a given octal character
155  * \xhh Matches a given 8-bit hexadecimal character
156  * \\uhhhh Matches a given 16-bit hexadecimal character
157  * \t Matches an ASCII tab character
158  * \n Matches an ASCII newline character
159  * \r Matches an ASCII return character
160  * \f Matches an ASCII form feed character
161  *
162  * <br>
163  *
164  * <b><font face=times roman>Character Classes</font></b>
165  *
166  * <br>
167  *
168  * [abc] Simple character class
169  * [a-zA-Z] Character class with ranges
170  * [^abc] Negated character class
171  *
172  * <br>
173  *
174  * <b><font face=times roman>Standard POSIX Character Classes</font></b>
175  *
176  * <br>
177  *
178  * [:alnum:] Alphanumeric characters.
179  * [:alpha:] Alphabetic characters.
180  * [:blank:] Space and tab characters.
181  * [:cntrl:] Control characters.
182  * [:digit:] Numeric characters.
183  * [:graph:] Characters that are printable and are also visible.
184  * (A space is printable, but not visible, while an
185  * `a' is both.)
186  * [:lower:] Lower-case alphabetic characters.
187  * [:print:] Printable characters (characters that are not
188  * control characters.)
189  * [:punct:] Punctuation characters (characters that are not letter,
190  * digits, control characters, or space characters).
191  * [:space:] Space characters (such as space, tab, and formfeed,
192  * to name a few).
193  * [:upper:] Upper-case alphabetic characters.
194  * [:xdigit:] Characters that are hexadecimal digits.
195  *
196  * <br>
197  *
198  * <b><font face=times roman>Non-standard POSIX-style Character
199  * Classes</font></b>
200  *
201  * <br>
202  *
203  * [:javastart:] Start of a Java identifier
204  * [:javapart:] Part of a Java identifier
205  *
206  * <br>
207  *
208  * <b><font face=times roman>Predefined Classes</font></b>
209  *
210  * <br>
211  *
212  * . Matches any character other than newline
213  * \w Matches a "word" character (alphanumeric plus "_")
214  * \W Matches a non-word character
215  * \s Matches a whitespace character
216  * \S Matches a non-whitespace character
217  * \d Matches a digit character
218  * \D Matches a non-digit character
219  *
220  * <br>
221  *
222  * <b><font face=times roman>Boundary Matchers</font></b>
223  *
224  * <br>
225  *
226  * ^ Matches only at the beginning of a line
227  * $ Matches only at the end of a line
228  * \b Matches only at a word boundary
229  * \B Matches only at a non-word boundary
230  *
231  * <br>
232  *
233  * <b><font face=times roman>Greedy Closures</font></b>
234  *
235  * <br>
236  *
237  * A* Matches A 0 or more times (greedy)
238  * A+ Matches A 1 or more times (greedy)
239  * A? Matches A 1 or 0 times (greedy)
240  * A{n} Matches A exactly n times (greedy)
241  * A{n,} Matches A at least n times (greedy)
242  * A{n,m} Matches A at least n but not more than m times (greedy)
243  *
244  * <br>
245  *
246  * <b><font face=times roman>Reluctant Closures</font></b>
247  *
248  * <br>
249  *
250  * A*? Matches A 0 or more times (reluctant)
251  * A+? Matches A 1 or more times (reluctant)
252  * A?? Matches A 0 or 1 times (reluctant)
253  *
254  * <br>
255  *
256  * <b><font face=times roman>Logical Operators</font></b>
257  *
258  * <br>
259  *
260  * AB Matches A followed by B
261  * A|B Matches either A or B
262  * (A) Used for subexpression grouping
263  * (?:A) Used for subexpression clustering (just like grouping but
264  * no backrefs)
265  *
266  * <br>
267  *
268  * <b><font face=times roman>Backreferences</font></b>
269  *
270  * <br>
271  *
272  * \1 Backreference to 1st parenthesized subexpression
273  * \2 Backreference to 2nd parenthesized subexpression
274  * \3 Backreference to 3rd parenthesized subexpression
275  * \4 Backreference to 4th parenthesized subexpression
276  * \5 Backreference to 5th parenthesized subexpression
277  * \6 Backreference to 6th parenthesized subexpression
278  * \7 Backreference to 7th parenthesized subexpression
279  * \8 Backreference to 8th parenthesized subexpression
280  * \9 Backreference to 9th parenthesized subexpression
281  *
282  * <br>
283  *
284  * </pre>
285  *
286  * <p>
287  *
288  * All closure operators (+, *, ?, {m,n}) are greedy by default, meaning
289  * that they match as many elements of the string as possible without
290  * causing the overall match to fail. If you want a closure to be
291  * reluctant (non-greedy), you can simply follow it with a '?'. A
292  * reluctant closure will match as few elements of the string as
293  * possible when finding matches. {m,n} closures don't currently
294  * support reluctancy.
295  *
296  * <p>
297  *
298  * RE runs programs compiled by the RECompiler class. But the RE
299  * matcher class does not include the actual regular expression compiler
300  * for reasons of efficiency. In fact, if you want to pre-compile one
301  * or more regular expressions, the 'recompile' class can be invoked
302  * from the command line to produce compiled output like this:
303  *
304  * <pre>
305  *
306  * // Pre-compiled regular expression "a*b"
307  * char[] re1Instructions =
308  * {
309  * 0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041,
310  * 0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047,
311  * 0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000,
312  * 0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000,
313  * 0x0000,
314  * };
315  *
316  * <br>
317  *
318  * REProgram re1 = new REProgram(re1Instructions);
319  *
320  * </pre>
321  *
322  * You can then construct a regular expression matcher (RE) object from
323  * the pre-compiled expression re1 and thus avoid the overhead of
324  * compiling the expression at runtime. If you require more dynamic
325  * regular expressions, you can construct a single RECompiler object and
 * re-use it to compile each expression. Similarly, you can change the
 * program run by a given matcher object at any time. However, RE and
328  * RECompiler are not threadsafe (for efficiency reasons, and because
329  * requiring thread safety in this class is deemed to be a rare
330  * requirement), so you will need to construct a separate compiler or
331  * matcher object for each thread (unless you do thread synchronization
332  * yourself).
333  *
 *
335  * <br><p><br>
336  *
337  * <font color=red>
338  * <i>ISSUES:</i>
339  *
340  * <ul>
 * <li>org.apache.regexp.RE is not currently compatible with all
 * standard POSIX regcomp flags</li>
 * <li>org.apache.regexp.RE does not support POSIX equivalence classes
 * ([=foo=] syntax) (I18N/locale issue)</li>
 * <li>org.apache.regexp.RE does not support nested POSIX character
 * classes (definitely should, but not completely trivial)</li>
 * <li>org.apache.regexp.RE does not support POSIX character collation
 * concepts ([.foo.] syntax) (I18N/locale issue)</li>
349  * <li>Should there be different matching styles (simple, POSIX, Perl etc?)</li>
350  * <li>Should RE support character iterators (for backwards RE matching!)?</li>
351  * <li>Should RE support reluctant {m,n} closures (does anyone care)?</li>
352  * <li>Not *all* possibilities are considered for greediness when backreferences
353  * are involved (as POSIX suggests should be the case). The POSIX RE
354  * "(ac*)c*d[ac]*\1", when matched against "acdacaa" should yield a match
355  * of acdacaa where \1 is "a". This is not the case in this RE package,
356  * and actually Perl doesn't go to this extent either! Until someone
357  * actually complains about this, I'm not sure it's worth "fixing".
358  * If it ever is fixed, test #137 in RETest.txt should be updated.</li>
359  * </ul>
360  *
361  * </font>
362  *
363  * @see recompile
364  * @see RECompiler
365  *
366  * @author <a HREF="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
367  * @author <a HREF="mailto:ts@sch-fer.de">Tobias Sch&auml;fer</a>
368  * @version $Id: RE.java,v 1.13 2003/06/02 02:18:41 vgritsenko Exp $
369  */

370 public class RE implements Serializable JavaDoc
371 {
372     /**
373      * Specifies normal, case-sensitive matching behaviour.
374      */

375     public static final int MATCH_NORMAL = 0x0000;
376
377     /**
378      * Flag to indicate that matching should be case-independent (folded)
379      */

380     public static final int MATCH_CASEINDEPENDENT = 0x0001;
381
382     /**
383      * Newlines should match as BOL/EOL (^ and $)
384      */

385     public static final int MATCH_MULTILINE = 0x0002;
386
387     /**
388      * Consider all input a single body of text - newlines are matched by .
389      */

390     public static final int MATCH_SINGLELINE = 0x0004;
391
392     /************************************************
393      * *
394      * The format of a node in a program is: *
395      * *
396      * [ OPCODE ] [ OPDATA ] [ OPNEXT ] [ OPERAND ] *
397      * *
398      * char OPCODE - instruction *
399      * char OPDATA - modifying data *
400      * char OPNEXT - next node (relative offset) *
401      * *
402      ************************************************/

403
404                  // Opcode Char Opdata/Operand Meaning
405
// ---------- ---------- --------------- --------------------------------------------------
406
static final char OP_END = 'E'; // end of program
407
static final char OP_BOL = '^'; // match only if at beginning of line
408
static final char OP_EOL = '$'; // match only if at end of line
409
static final char OP_ANY = '.'; // match any single character except newline
410
static final char OP_ANYOF = '['; // count/ranges match any char in the list of ranges
411
static final char OP_BRANCH = '|'; // node match this alternative or the next one
412
static final char OP_ATOM = 'A'; // length/string length of string followed by string itself
413
static final char OP_STAR = '*'; // node kleene closure
414
static final char OP_PLUS = '+'; // node positive closure
415
static final char OP_MAYBE = '?'; // node optional closure
416
static final char OP_ESCAPE = '\\'; // escape special escape code char class (escape is E_* code)
417
static final char OP_OPEN = '('; // number nth opening paren
418
static final char OP_OPEN_CLUSTER = '<'; // opening cluster
419
static final char OP_CLOSE = ')'; // number nth closing paren
420
static final char OP_CLOSE_CLUSTER = '>'; // closing cluster
421
static final char OP_BACKREF = '#'; // number reference nth already matched parenthesized string
422
static final char OP_GOTO = 'G'; // nothing but a (back-)pointer
423
static final char OP_NOTHING = 'N'; // match null string such as in '(a|)'
424
static final char OP_RELUCTANTSTAR = '8'; // none/expr reluctant '*' (mnemonic for char is unshifted '*')
425
static final char OP_RELUCTANTPLUS = '='; // none/expr reluctant '+' (mnemonic for char is unshifted '+')
426
static final char OP_RELUCTANTMAYBE = '/'; // none/expr reluctant '?' (mnemonic for char is unshifted '?')
427
static final char OP_POSIXCLASS = 'P'; // classid one of the posix character classes
428

429     // Escape codes
430
static final char E_ALNUM = 'w'; // Alphanumeric
431
static final char E_NALNUM = 'W'; // Non-alphanumeric
432
static final char E_BOUND = 'b'; // Word boundary
433
static final char E_NBOUND = 'B'; // Non-word boundary
434
static final char E_SPACE = 's'; // Whitespace
435
static final char E_NSPACE = 'S'; // Non-whitespace
436
static final char E_DIGIT = 'd'; // Digit
437
static final char E_NDIGIT = 'D'; // Non-digit
438

439     // Posix character classes
440
static final char POSIX_CLASS_ALNUM = 'w'; // Alphanumerics
441
static final char POSIX_CLASS_ALPHA = 'a'; // Alphabetics
442
static final char POSIX_CLASS_BLANK = 'b'; // Blanks
443
static final char POSIX_CLASS_CNTRL = 'c'; // Control characters
444
static final char POSIX_CLASS_DIGIT = 'd'; // Digits
445
static final char POSIX_CLASS_GRAPH = 'g'; // Graphic characters
446
static final char POSIX_CLASS_LOWER = 'l'; // Lowercase characters
447
static final char POSIX_CLASS_PRINT = 'p'; // Printable characters
448
static final char POSIX_CLASS_PUNCT = '!'; // Punctuation
449
static final char POSIX_CLASS_SPACE = 's'; // Spaces
450
static final char POSIX_CLASS_UPPER = 'u'; // Uppercase characters
451
static final char POSIX_CLASS_XDIGIT = 'x'; // Hexadecimal digits
452
static final char POSIX_CLASS_JSTART = 'j'; // Java identifier start
453
static final char POSIX_CLASS_JPART = 'k'; // Java identifier part
454

455     // Limits
456
static final int maxNode = 65536; // Maximum number of nodes in a program
457
static final int MAX_PAREN = 16; // Number of paren pairs (only 9 can be backrefs)
458

459     // Node layout constants
460
static final int offsetOpcode = 0; // Opcode offset (first character)
461
static final int offsetOpdata = 1; // Opdata offset (second char)
462
static final int offsetNext = 2; // Next index offset (third char)
463
static final int nodeSize = 3; // Node size (in chars)
464

465     /** Line Separator */
466     static final String JavaDoc NEWLINE = System.getProperty("line.separator");
467
468     // State of current program
469
REProgram program; // Compiled regular expression 'program'
470
transient CharacterIterator search; // The string being matched against
471
int matchFlags; // Match behaviour flags
472
int maxParen = MAX_PAREN;
473
474     // Parenthesized subexpressions
475
transient int parenCount; // Number of subexpressions matched (num open parens + 1)
476
transient int start0; // Cache of start[0]
477
transient int end0; // Cache of start[0]
478
transient int start1; // Cache of start[1]
479
transient int end1; // Cache of start[1]
480
transient int start2; // Cache of start[2]
481
transient int end2; // Cache of start[2]
482
transient int[] startn; // Lazy-alloced array of sub-expression starts
483
transient int[] endn; // Lazy-alloced array of sub-expression ends
484

485     // Backreferences
486
transient int[] startBackref; // Lazy-alloced array of backref starts
487
transient int[] endBackref; // Lazy-alloced array of backref ends
488

489     /**
490      * Constructs a regular expression matcher from a String by compiling it
491      * using a new instance of RECompiler. If you will be compiling many
492      * expressions, you may prefer to use a single RECompiler object instead.
493      * @param pattern The regular expression pattern to compile.
494      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
495      * @see RECompiler
496      * @see recompile
497      */

498     public RE(String JavaDoc pattern) throws RESyntaxException
499     {
500         this(pattern, MATCH_NORMAL);
501     }
502
503     /**
504      * Constructs a regular expression matcher from a String by compiling it
505      * using a new instance of RECompiler. If you will be compiling many
506      * expressions, you may prefer to use a single RECompiler object instead.
507      * @param pattern The regular expression pattern to compile.
508      * @param matchFlags The matching style
509      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
510      * @see RECompiler
511      * @see recompile
512      */

513     public RE(String JavaDoc pattern, int matchFlags) throws RESyntaxException
514     {
515         this(new RECompiler().compile(pattern));
516         setMatchFlags(matchFlags);
517     }
518
519     /**
520      * Construct a matcher for a pre-compiled regular expression from program
521      * (bytecode) data. Permits special flags to be passed in to modify matching
522      * behaviour.
523      * @param program Compiled regular expression program (see RECompiler and/or recompile)
524      * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
525      *
526      * <pre>
527      *
528      * MATCH_NORMAL // Normal (case-sensitive) matching
529      * MATCH_CASEINDEPENDENT // Case folded comparisons
530      * MATCH_MULTILINE // Newline matches as BOL/EOL
531      *
532      * </pre>
533      *
534      * @see RECompiler
535      * @see REProgram
536      * @see recompile
537      */

538     public RE(REProgram program, int matchFlags)
539     {
540         setProgram(program);
541         setMatchFlags(matchFlags);
542     }
543
544     /**
545      * Construct a matcher for a pre-compiled regular expression from program
546      * (bytecode) data.
547      * @param program Compiled regular expression program
548      * @see RECompiler
549      * @see recompile
550      */

551     public RE(REProgram program)
552     {
553         this(program, MATCH_NORMAL);
554     }
555
556     /**
557      * Constructs a regular expression matcher with no initial program.
558      * This is likely to be an uncommon practice, but is still supported.
559      */

560     public RE()
561     {
562         this((REProgram)null, MATCH_NORMAL);
563     }
564
565     /**
566      * Converts a 'simplified' regular expression to a full regular expression
567      * @param pattern The pattern to convert
568      * @return The full regular expression
569      */

570     public static String JavaDoc simplePatternToFullRegularExpression(String JavaDoc pattern)
571     {
572         StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
573         for (int i = 0; i < pattern.length(); i++)
574         {
575             char c = pattern.charAt(i);
576             switch (c)
577             {
578                 case '*':
579                     buf.append(".*");
580                     break;
581
582                 case '.':
583                 case '[':
584                 case ']':
585                 case '\\':
586                 case '+':
587                 case '?':
588                 case '{':
589                 case '}':
590                 case '$':
591                 case '^':
592                 case '|':
593                 case '(':
594                 case ')':
595                     buf.append('\\');
596                 default:
597                     buf.append(c);
598                     break;
599             }
600         }
601         return buf.toString();
602     }
603
604     /**
605      * Sets match behaviour flags which alter the way RE does matching.
606      * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
607      *
608      * <pre>
609      *
610      * MATCH_NORMAL // Normal (case-sensitive) matching
611      * MATCH_CASEINDEPENDENT // Case folded comparisons
612      * MATCH_MULTILINE // Newline matches as BOL/EOL
613      *
614      * </pre>
615      *
616      */

617     public void setMatchFlags(int matchFlags)
618     {
619         this.matchFlags = matchFlags;
620     }
621
622     /**
623      * Returns the current match behaviour flags.
624      * @return Current match behaviour flags (RE.MATCH_*).
625      *
626      * <pre>
627      *
628      * MATCH_NORMAL // Normal (case-sensitive) matching
629      * MATCH_CASEINDEPENDENT // Case folded comparisons
630      * MATCH_MULTILINE // Newline matches as BOL/EOL
631      *
632      * </pre>
633      *
634      * @see #setMatchFlags
635      *
636      */

637     public int getMatchFlags()
638     {
639         return matchFlags;
640     }
641
642     /**
643      * Sets the current regular expression program used by this matcher object.
644      * @param program Regular expression program compiled by RECompiler.
645      * @see RECompiler
646      * @see REProgram
647      * @see recompile
648      */

649     public void setProgram(REProgram program)
650     {
651         this.program = program;
652         if (program != null && program.maxParens != -1) {
653             this.maxParen = program.maxParens;
654         } else {
655             this.maxParen = MAX_PAREN;
656         }
657     }
658
659     /**
660      * Returns the current regular expression program in use by this matcher object.
661      * @return Regular expression program
662      * @see #setProgram
663      */

664     public REProgram getProgram()
665     {
666         return program;
667     }
668
669     /**
670      * Returns the number of parenthesized subexpressions available after a successful match.
671      * @return Number of available parenthesized subexpressions
672      */

673     public int getParenCount()
674     {
675         return parenCount;
676     }
677
678     /**
679      * Gets the contents of a parenthesized subexpression after a successful match.
680      * @param which Nesting level of subexpression
681      * @return String
682      */

683     public String JavaDoc getParen(int which)
684     {
685         int start;
686         if (which < parenCount && (start = getParenStart(which)) >= 0)
687         {
688             return search.substring(start, getParenEnd(which));
689         }
690         return null;
691     }
692
693     /**
694      * Returns the start index of a given paren level.
695      * @param which Nesting level of subexpression
696      * @return String index
697      */

698     public final int getParenStart(int which)
699     {
700         if (which < parenCount)
701         {
702             switch (which)
703             {
704                 case 0:
705                     return start0;
706                     
707                 case 1:
708                     return start1;
709                     
710                 case 2:
711                     return start2;
712                     
713                 default:
714                     if (startn == null)
715                     {
716                         allocParens();
717                     }
718                     return startn[which];
719             }
720         }
721         return -1;
722     }
723
724     /**
725      * Returns the end index of a given paren level.
726      * @param which Nesting level of subexpression
727      * @return String index
728      */

729     public final int getParenEnd(int which)
730     {
731         if (which < parenCount)
732         {
733             switch (which)
734             {
735                 case 0:
736                     return end0;
737                     
738                 case 1:
739                     return end1;
740                     
741                 case 2:
742                     return end2;
743                     
744                 default:
745                     if (endn == null)
746                     {
747                         allocParens();
748                     }
749                     return endn[which];
750             }
751         }
752         return -1;
753     }
754
755     /**
756      * Returns the length of a given paren level.
757      * @param which Nesting level of subexpression
758      * @return Number of characters in the parenthesized subexpression
759      */

760     public final int getParenLength(int which)
761     {
762         if (which < parenCount)
763         {
764             return getParenEnd(which) - getParenStart(which);
765         }
766         return -1;
767     }
768
769     /**
770      * Sets the start of a paren level
771      * @param which Which paren level
772      * @param i Index in input array
773      */

774     protected final void setParenStart(int which, int i)
775     {
776         if (which < parenCount)
777         {
778             switch (which)
779             {
780                 case 0:
781                     start0 = i;
782                     break;
783                     
784                 case 1:
785                     start1 = i;
786                     break;
787                     
788                 case 2:
789                     start2 = i;
790                     break;
791                     
792                 default:
793                     if (startn == null)
794                     {
795                         allocParens();
796                     }
797                     startn[which] = i;
798                     break;
799             }
800         }
801     }
802
803     /**
804      * Sets the end of a paren level
805      * @param which Which paren level
806      * @param i Index in input array
807      */

808     protected final void setParenEnd(int which, int i)
809     {
810         if (which < parenCount)
811         {
812             switch (which)
813             {
814                 case 0:
815                     end0 = i;
816                     break;
817                     
818                 case 1:
819                     end1 = i;
820                     break;
821                     
822                 case 2:
823                     end2 = i;
824                     break;
825                     
826                 default:
827                     if (endn == null)
828                     {
829                         allocParens();
830                     }
831                     endn[which] = i;
832                     break;
833             }
834         }
835     }
836
837     /**
838      * Throws an Error representing an internal error condition probably resulting
839      * from a bug in the regular expression compiler (or possibly data corruption).
840      * In practice, this should be very rare.
841      * @param s Error description
842      */

843     protected void internalError(String JavaDoc s) throws Error JavaDoc
844     {
845         throw new Error JavaDoc("RE internal error: " + s);
846     }
847
848     /**
849      * Performs lazy allocation of subexpression arrays
850      */

851     private final void allocParens()
852     {
853         // Allocate arrays for subexpressions
854
startn = new int[maxParen];
855         endn = new int[maxParen];
856
857         // Set sub-expression pointers to invalid values
858
for (int i = 0; i < maxParen; i++)
859         {
860             startn[i] = -1;
861             endn[i] = -1;
862         }
863     }
864
865     /**
866      * Try to match a string against a subset of nodes in the program
867      * @param firstNode Node to start at in program
868      * @param lastNode Last valid node (used for matching a subexpression without
869      * matching the rest of the program as well).
870      * @param idxStart Starting position in character array
871      * @return Final input array index if match succeeded. -1 if not.
872      */

873     protected int matchNodes(int firstNode, int lastNode, int idxStart)
874     {
875         // Our current place in the string
876
int idx = idxStart;
877
878         // Loop while node is valid
879
int next, opcode, opdata;
880         int idxNew;
881         char[] instruction = program.instruction;
882         for (int node = firstNode; node < lastNode; )
883         {
884             opcode = instruction[node + offsetOpcode];
885             next = node + (short)instruction[node + offsetNext];
886             opdata = instruction[node + offsetOpdata];
887
888             switch (opcode)
889             {
890                 case OP_RELUCTANTMAYBE:
891                     {
892                         int once = 0;
893                         do
894                         {
895                             // Try to match the rest without using the reluctant subexpr
896
if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
897                             {
898                                 return idxNew;
899                             }
900                         }
901                         while ((once++ == 0) && (idx = matchNodes(node + nodeSize, next, idx)) != -1);
902                         return -1;
903                     }
904
905                 case OP_RELUCTANTPLUS:
906                     while ((idx = matchNodes(node + nodeSize, next, idx)) != -1)
907                     {
908                         // Try to match the rest without using the reluctant subexpr
909
if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
910                         {
911                             return idxNew;
912                         }
913                     }
914                     return -1;
915
916                 case OP_RELUCTANTSTAR:
917                     do
918                     {
919                         // Try to match the rest without using the reluctant subexpr
920
if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
921                         {
922                             return idxNew;
923                         }
924                     }
925                     while ((idx = matchNodes(node + nodeSize, next, idx)) != -1);
926                     return -1;
927
928                 case OP_OPEN:
929
930                     // Match subexpression
931
if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
932                     {
933                         startBackref[opdata] = idx;
934                     }
935                     if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
936                     {
937                         // Increase valid paren count
938
if ((opdata + 1) > parenCount)
939                         {
940                             parenCount = opdata + 1;
941                         }
942
943                         // Don't set paren if already set later on
944
if (getParenStart(opdata) == -1)
945                         {
946                             setParenStart(opdata, idx);
947                         }
948                     }
949                     return idxNew;
950
951                 case OP_CLOSE:
952
953                     // Done matching subexpression
954
if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
955                     {
956                         endBackref[opdata] = idx;
957                     }
958                     if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
959                     {
960                         // Increase valid paren count
961
if ((opdata + 1) > parenCount)
962                         {
963                             parenCount = opdata + 1;
964                         }
965
966                         // Don't set paren if already set later on
967
if (getParenEnd