Java > Open Source Codes > com > jcorporate > expresso > ext > regexp > RE


1   /* ====================================================================
2    * The Jcorporate Apache Style Software License, Version 1.2 05-07-2002
3    *
4    * Copyright (c) 1995-2002 Jcorporate Ltd. All rights reserved.
5    *
6    * Redistribution and use in source and binary forms, with or without
7    * modification, are permitted provided that the following conditions
8    * are met:
9    *
10   * 1. Redistributions of source code must retain the above copyright
11   *    notice, this list of conditions and the following disclaimer.
12   *
13   * 2. Redistributions in binary form must reproduce the above copyright
14   *    notice, this list of conditions and the following disclaimer in
15   *    the documentation and/or other materials provided with the
16   *    distribution.
17   *
18   * 3. The end-user documentation included with the redistribution,
19   *    if any, must include the following acknowledgment:
20   *       "This product includes software developed by Jcorporate Ltd.
21   *        (http://www.jcorporate.com/)."
22   *    Alternately, this acknowledgment may appear in the software itself,
23   *    if and wherever such third-party acknowledgments normally appear.
24   *
25   * 4. "Jcorporate" and product names such as "Expresso" must
26   *    not be used to endorse or promote products derived from this
27   *    software without prior written permission. For written permission,
28   *    please contact info@jcorporate.com.
29   *
30   * 5. Products derived from this software may not be called "Expresso",
31   *    or other Jcorporate product names; nor may "Expresso" or other
32   *    Jcorporate product names appear in their name, without prior
33   *    written permission of Jcorporate Ltd.
34   *
35   * 6. No product derived from this software may compete in the same
36   *    market space, i.e. framework, without prior written permission
37   *    of Jcorporate Ltd. For written permission, please contact
38   *    partners@jcorporate.com.
39   *
40   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
41   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
42   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
43   * DISCLAIMED.  IN NO EVENT SHALL JCORPORATE LTD OR ITS CONTRIBUTORS
44   * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
45   * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
46   * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
47   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
48   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
49   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
50   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51   * SUCH DAMAGE.
52   * ====================================================================
53   *
54   * This software consists of voluntary contributions made by many
55   * individuals on behalf of the Jcorporate Ltd. Contributions back
56   * to the project(s) are encouraged when you make modifications.
57   * Please send them to support@jcorporate.com. For more information
58   * on Jcorporate Ltd. and its products, please see
59   * <http://www.jcorporate.com/>.
60   *
61   * Portions of this software are based upon other open source
62   * products and are subject to their respective licenses.
63   */
64  
65  package com.jcorporate.expresso.ext.regexp;
66  
67  /*
68   * ====================================================================
69   *
70   * The Apache Software License, Version 1.1
71   *
72   * Copyright (c) 1999 The Apache Software Foundation.  All rights
73   * reserved.
74   *
75   * Redistribution and use in source and binary forms, with or without
76   * modification, are permitted provided that the following conditions
77   * are met:
78   *
79   * 1. Redistributions of source code must retain the above copyright
80   *    notice, this list of conditions and the following disclaimer.
81   *
82   * 2. Redistributions in binary form must reproduce the above copyright
83   *    notice, this list of conditions and the following disclaimer in
84   *    the documentation and/or other materials provided with the
85   *    distribution.
86   *
87   * 3. The end-user documentation included with the redistribution, if
88   *    any, must include the following acknowlegement:
89   *       "This product includes software developed by the
90   *        Apache Software Foundation (http://www.apache.org/)."
91   *    Alternately, this acknowlegement may appear in the software itself,
92   *    if and wherever such third-party acknowlegements normally appear.
93   *
94   * 4. The names "The Jakarta Project", "Jakarta-Regexp", and "Apache Software
95   *    Foundation" must not be used to endorse or promote products derived
96   *    from this software without prior written permission. For written
97   *    permission, please contact apache@apache.org.
98   *
99   * 5. Products derived from this software may not be called "Apache"
100  *    nor may "Apache" appear in their names without prior written
101  *    permission of the Apache Group.
102  *
103  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
104  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
105  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
106  * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
107  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
108  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
109  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
110  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
111  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
112  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
113  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
114  * SUCH DAMAGE.
115  * ====================================================================
116  *
117  * This software consists of voluntary contributions made by many
118  * individuals on behalf of the Apache Software Foundation.  For more
119  * information on the Apache Software Foundation, please see
120  * <http://www.apache.org/>.
121  *
122  */
123 
124 import java.util.Vector  ;
125 
126 
127 /**
128  * RE is an efficient, lightweight regular expression evaluator/matcher class.
129  * Regular expressions are pattern descriptions which enable sophisticated matching of
130  * strings.  In addition to being able to match a string against a pattern, you
131  * can also extract parts of the match.  This is especially useful in text parsing!
132  * Details on the syntax of regular expression patterns are given below.
133  * <p/>
134  * <p/>
135  * <p/>
136  * To compile a regular expression (RE), you can simply construct an RE matcher
137  * object from the string specification of the pattern, like this:
138  * <p/>
139  * <pre>
140  * <p/>
141  *     RE r = new RE("a*b");
142  * <p/>
143  * </pre>
144  * <p/>
145  * <p/>
146  * <p/>
147  * Once you have done this, you can call either of the RE.match methods to
148  * perform matching on a String.  For example:
149  * <p/>
150  * <pre>
151  * <p/>
152  *     boolean matched = r.match("aaaab");
153  * <p/>
154  * </pre>
155  * <p/>
156  * will cause the boolean matched to be set to true because the
157  * pattern "a*b" matches the string "aaaab".
158  * <p/>
159  * <p/>
160  * If you were interested in the <i>number</i> of a's which matched the first
161  * part of our example expression, you could change the expression to
162  * "(a*)b".  Then when you compiled the expression and matched it against
163  * something like "xaaaab", you would get results like this:
164  * <p/>
165  * <pre>
166  * <p/>
167  *     RE r = new RE("(a*)b");                  // Compile expression
168  *     boolean matched = r.match("xaaaab");     // Match against "xaaaab"
169  * <p/>
170  * <br>
171  * <p/>
172  *     String wholeExpr = r.getParen(0);        // wholeExpr will be 'aaaab'
173  *     String insideParens = r.getParen(1);     // insideParens will be 'aaaa'
174  * <p/>
175  * <br>
176  * <p/>
177  *     int startWholeExpr = getParenStart(0);   // startWholeExpr will be index 1
178  *     int endWholeExpr = getParenEnd(0);       // endWholeExpr will be index 6
179  *     int lenWholeExpr = getParenLength(0);    // lenWholeExpr will be 5
180  * <p/>
181  * <br>
182  * <p/>
183  *     int startInside = getParenStart(1);      // startInside will be index 1
184  *     int endInside = getParenEnd(1);          // endInside will be index 5
185  *     int lenInside = getParenLength(1);       // lenInside will be 4
186  * <p/>
187  * </pre>
188  * <p/>
189  * You can also refer to the contents of a parenthesized expression within
190  * a regular expression itself.  This is called a 'backreference'.  The first
191  * backreference in a regular expression is denoted by \1, the second by \2
192  * and so on.  So the expression:
193  * <p/>
194  * <pre>
195  * <p/>
196  *     ([0-9]+)=\1
197  * <p/>
198  * </pre>
199  * <p/>
200  * will match any string of the form n=n (like 0=0 or 2=2).
201  * <p/>
202  * <p/>
203  * <p/>
204  * The full regular expression syntax accepted by RE is described here:
205  * <p/>
206  * <pre>
207  * <p/>
208  * <br>
209  * <p/>
210  *  <b><font face=times roman>Characters</font></b>
211  * <p/>
212  * <br>
213  * <p/>
214  *    <i>unicodeChar</i>          Matches any identical unicode character
215  *    \                    Used to quote a meta-character (like '*')
216  *    \\                   Matches a single '\' character
217  *    \0nnn                Matches a given octal character
218  *    \xhh                 Matches a given 8-bit hexadecimal character
219  *    \\uhhhh               Matches a given 16-bit hexadecimal character
220  *    \t                   Matches an ASCII tab character
221  *    \n                   Matches an ASCII newline character
222  *    \r                   Matches an ASCII return character
223  *    \f                   Matches an ASCII form feed character
224  * <p/>
225  * <br>
226  * <p/>
227  *  <b><font face=times roman>Character Classes</font></b>
228  * <p/>
229  * <br>
230  * <p/>
231  *    [abc]                Simple character class
232  *    [a-zA-Z]             Character class with ranges
233  *    [^abc]               Negated character class
234  * <p/>
235  * <br>
236  * <p/>
237  *  <b><font face=times roman>Standard POSIX Character Classes</font></b>
238  * <p/>
239  * <br>
240  * <p/>
241  *    [:alnum:]            Alphanumeric characters.
242  *    [:alpha:]            Alphabetic characters.
243  *    [:blank:]            Space and tab characters.
244  *    [:cntrl:]            Control characters.
245  *    [:digit:]            Numeric characters.
246  *    [:graph:]            Characters that are printable and are also visible. (A space is printable, but not visible, while an `a' is both.)
247  *    [:lower:]            Lower-case alphabetic characters.
248  *    [:print:]            Printable characters (characters that are not control characters.)
249  *    [:punct:]            Punctuation characters (characters that are not letter, digits, control characters, or space characters).
250  *    [:space:]            Space characters (such as space, tab, and formfeed, to name a few).
251  *    [:upper:]            Upper-case alphabetic characters.
252  *    [:xdigit:]           Characters that are hexadecimal digits.
253  * <p/>
254  * <br>
255  * <p/>
256  *  <b><font face=times roman>Non-standard POSIX-style Character Classes</font></b>
257  * <p/>
258  * <br>
259  * <p/>
260  *    [:javastart:]        Start of a Java identifier
261  *    [:javapart:]         Part of a Java identifier
262  * <p/>
263  * <br>
264  * <p/>
265  *  <b><font face=times roman>Predefined Classes</font></b>
266  * <p/>
267  * <br>
268  * <p/>
269  *    .                    Matches any character other than newline
270  *    \w                   Matches a "word" character (alphanumeric plus "_")
271  *    \W                   Matches a non-word character
272  *    \s                   Matches a whitespace character
273  *    \S                   Matches a non-whitespace character
274  *    \d                   Matches a digit character
275  *    \D                   Matches a non-digit character
276  * <p/>
277  * <br>
278  * <p/>
279  *  <b><font face=times roman>Boundary Matchers</font></b>
280  * <p/>
281  * <br>
282  * <p/>
283  *    ^                    Matches only at the beginning of a line
284  *    $                    Matches only at the end of a line
285  *    \b                   Matches only at a word boundary
286  *    \B                   Matches only at a non-word boundary
287  * <p/>
288  * <br>
289  * <p/>
290  *  <b><font face=times roman>Greedy Closures</font></b>
291  * <p/>
292  * <br>
293  * <p/>
294  *    A*                   Matches A 0 or more times (greedy)
295  *    A+                   Matches A 1 or more times (greedy)
296  *    A?                   Matches A 1 or 0 times (greedy)
297  *    A{n}                 Matches A exactly n times (greedy)
298  *    A{n,}                Matches A at least n times (greedy)
299  *    A{n,m}               Matches A at least n but not more than m times (greedy)
300  * <p/>
301  * <br>
302  * <p/>
303  *  <b><font face=times roman>Reluctant Closures</font></b>
304  * <p/>
305  * <br>
306  * <p/>
307  *    A*?                  Matches A 0 or more times (reluctant)
308  *    A+?                  Matches A 1 or more times (reluctant)
309  *    A??                  Matches A 0 or 1 times (reluctant)
310  * <p/>
311  * <br>
312  * <p/>
313  *  <b><font face=times roman>Logical Operators</font></b>
314  * <p/>
315  * <br>
316  * <p/>
317  *    AB                   Matches A followed by B
318  *    A|B                  Matches either A or B
319  *    (A)                  Used for subexpression grouping
320  * <p/>
321  * <br>
322  * <p/>
323  *  <b><font face=times roman>Backreferences</font></b>
324  * <p/>
325  * <br>
326  * <p/>
327  *    \1                   Backreference to 1st parenthesized subexpression
328  *    \2                   Backreference to 2nd parenthesized subexpression
329  *    \3                   Backreference to 3rd parenthesized subexpression
330  *    \4                   Backreference to 4th parenthesized subexpression
331  *    \5                   Backreference to 5th parenthesized subexpression
332  *    \6                   Backreference to 6th parenthesized subexpression
333  *    \7                   Backreference to 7th parenthesized subexpression
334  *    \8                   Backreference to 8th parenthesized subexpression
335  *    \9                   Backreference to 9th parenthesized subexpression
336  * <p/>
337  * <br>
338  * <p/>
339  * </pre>
340  * <p/>
341  * <p/>
342  * <p/>
343  * All closure operators (+, *, ?, {m,n}) are greedy by default, meaning that they
344  * match as many elements of the string as possible without causing the overall
345  * match to fail.  If you want a closure to be reluctant (non-greedy), you can
346  * simply follow it with a '?'.  A reluctant closure will match as few elements
347  * of the string as possible when finding matches.  {m,n} closures don't currently
348  * support reluctancy.
349  * <p/>
350  * <p/>
351  * <p/>
352  * RE runs programs compiled by the RECompiler class.  But the RE matcher class
353  * does not include the actual regular expression compiler for reasons of
354  * efficiency.  In fact, if you want to pre-compile one or more regular expressions,
355  * the 'recompile' class can be invoked from the command line to produce compiled
356  * output like this:
357  * <p/>
358  * <pre>
359  * <p/>
360  *    // Pre-compiled regular expression "a*b"
361  *    char[] re1Instructions =
362  *    {
363  *        0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041,
364  *        0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047,
365  *        0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000,
366  *        0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000,
367  *        0x0000,
368  *    };
369  * <p/>
370  *    <br>
371  * <p/>
372  *    REProgram re1 = new REProgram(re1Instructions);
373  * <p/>
374  * </pre>
375  * <p/>
376  * You can then construct a regular expression matcher (RE) object from the pre-compiled
377  * expression re1 and thus avoid the overhead of compiling the expression at runtime.
378  * If you require more dynamic regular expressions, you can construct a single RECompiler
379  * object and re-use it to compile each expression.  Similarly, you can change the
380  * program run by a given matcher object at any time.  However, RE and RECompiler are
381  * not threadsafe (for efficiency reasons, and because requiring thread safety in this
382  * class is deemed to be a rare requirement), so you will need to construct a separate
383  * compiler or matcher object for each thread (unless you do thread synchronization
384  * yourself).
385  * <p/>
386  * </pre>
387  * <br><p><br>
388  * <p/>
389  * <font color=red>
390  * <i>ISSUES:</i>
391  * <p/>
392  * <ul>
393  * <li>com.weusours.util.re is not currently compatible with all standard POSIX regcomp flags
394  * <li>com.weusours.util.re does not support POSIX equivalence classes ([=foo=] syntax) (I18N/locale issue)
395  * <li>com.weusours.util.re does not support nested POSIX character classes (definitely should, but not completely trivial)
396  * <li>com.weusours.util.re Does not support POSIX character collation concepts ([.foo.] syntax) (I18N/locale issue)
397  * <li>Should there be different matching styles (simple, POSIX, Perl etc?)
398  * <li>Should RE support character iterators (for backwards RE matching!)?
399  * <li>Should RE support reluctant {m,n} closures (does anyone care)?
400  * <li>Not *all* possibilities are considered for greediness when backreferences
401  * are involved (as POSIX suggests should be the case).  The POSIX RE
402  * "(ac*)c*d[ac]*\1", when matched against "acdacaa" should yield a match
403  * of acdacaa where \1 is "a".  This is not the case in this RE package,
404  * and actually Perl doesn't go to this extent either!  Until someone
405  * actually complains about this, I'm not sure it's worth "fixing".
406  * If it ever is fixed, test #137 in RETest.txt should be updated.
407  * </ul>
408  * <p/>
409  * </font>
410  *
411  * @author <a HREF="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
412  * @version $Id: RE.java,v 1.9 2004/11/18 02:03:28 lhamel Exp $
413  * @see RECompiler
414  * @deprecated since v5.6, use jakarta oro
415  */
416 public class RE {
417 
418     /**
419      * Specifies normal, case-sensitive matching behaviour.
420      */
421     public static final int MATCH_NORMAL = 0x0000;
422 
423     /**
424      * Flag to indicate that matching should be case-independent (folded)
425      */
426     public static final int MATCH_CASEINDEPENDENT = 0x0001;
427 
428     /**
429      * Newlines should match as BOL/EOL (^ and $)
430      */
431     public static final int MATCH_MULTILINE = 0x0002;
432 
433     /**
434      * *********************************************
435      * *
436      * The format of a node in a program is:        *
437      * *
438      * [ OPCODE ] [ OPDATA ] [ OPNEXT ] [ OPERAND ] *
439      * *
440      * char OPCODE - instruction                    *
441      * char OPDATA - modifying data                 *
442      * char OPNEXT - next node (relative offset)    *
443      * *
444      * **********************************************
445      */
446     //   Opcode              Char       Opdata/Operand  Meaning
447     //   ----------          ---------- --------------- --------------------------------------------------
448     static final char OP_END = 'E'; //                 end of program
449     static final char OP_BOL = '^'; //                 match only if at beginning of line
450     static final char OP_EOL = '$'; //                 match only if at end of line
451     static final char OP_ANY = '.'; //                 match any single character except newline
452     static final char OP_ANYOF = '['; // count/ranges    match any char in the list of ranges
453     static final char OP_BRANCH = '|'; // node            match this alternative or the next one
454     static final char OP_ATOM = 'A'; // length/string   length of string followed by string itself
455     static final char OP_STAR = '*'; // node            kleene closure
456     static final char OP_PLUS = '+'; // node            positive closure
457     static final char OP_MAYBE = '?'; // node            optional closure
458     static final char OP_ESCAPE = '\\'; // escape          special escape code char class (escape is E_* code)
459     static final char OP_OPEN = '('; // number          nth opening paren
460     static final char OP_CLOSE = ')'; // number          nth closing paren
461     static final char OP_BACKREF = '#'; // number          reference nth already matched parenthesized string
462     static final char OP_GOTO = 'G'; //                 nothing but a (back-)pointer
463     static final char OP_NOTHING = 'N'; //                 match null string such as in '(a|)'
464     static final char OP_RELUCTANTSTAR = '8'; // none/expr       reluctant '*' (mnemonic for char is unshifted '*')
465     static final char OP_RELUCTANTPLUS = '='; // none/expr       reluctant '+' (mnemonic for char is unshifted '+')
466     static final char OP_RELUCTANTMAYBE = '/'; // none/expr       reluctant '?' (mnemonic for char is unshifted '?')
467     static final char OP_POSIXCLASS = 'P'; // classid         one of the posix character classes
468 
469     // Escape codes
470     static final char E_ALNUM = 'w'; // Alphanumeric
471     static final char E_NALNUM = 'W'; // Non-alphanumeric
472     static final char E_BOUND = 'b'; // Word boundary
473     static final char E_NBOUND = 'B'; // Non-word boundary
474     static final char E_SPACE = 's'; // Whitespace
475     static final char E_NSPACE = 'S'; // Non-whitespace
476     static final char E_DIGIT = 'd'; // Digit
477     static final char E_NDIGIT = 'D'; // Non-digit
478 
479     // Posix character classes
480     static final char POSIX_CLASS_ALNUM = 'w'; // Alphanumerics
481     static final char POSIX_CLASS_ALPHA = 'a'; // Alphabetics
482     static final char POSIX_CLASS_BLANK = 'b'; // Blanks
483     static final char POSIX_CLASS_CNTRL = 'c'; // Control characters
484     static final char POSIX_CLASS_DIGIT = 'd'; // Digits
485     static final char POSIX_CLASS_GRAPH = 'g'; // Graphic characters
486     static final char POSIX_CLASS_LOWER = 'l'; // Lowercase characters
487     static final char POSIX_CLASS_PRINT = 'p'; // Printable characters
488     static final char POSIX_CLASS_PUNCT = '!'; // Punctuation
489     static final char POSIX_CLASS_SPACE = 's'; // Spaces
490     static final char POSIX_CLASS_UPPER = 'u'; // Uppercase characters
491     static final char POSIX_CLASS_XDIGIT = 'x'; // Hexadecimal digits
492     static final char POSIX_CLASS_JSTART = 'j'; // Java identifier start
493     static final char POSIX_CLASS_JPART = 'k'; // Java identifier part
494 
495     // Limits
496     static final int maxNode = 65536; // Maximum number of nodes in a program
497     static final int maxParen = 16; // Number of paren pairs (only 9 can be backrefs)
498 
499     // Node layout constants
500     static final int offsetOpcode = 0; // Opcode offset (first character)
501     static final int offsetOpdata = 1; // Opdata offset (second char)
502     static final int offsetNext = 2; // Next index offset (third char)
503     static final int nodeSize = 3; // Node size (in chars)
504 
505     /**
506      * Line Separator
507      */
508     static final String   NEWLINE = System.getProperty("line.separator");
509 
510     // State of current program
511     REProgram program; // Compiled regular expression 'program'
512     CharacterIterator search; // The string being matched against
513     int idx; // Current index in string being searched
514     int matchFlags; // Match behaviour flags
515 
516     // Parenthesized subexpressions
517     int parenCount; // Number of subexpressions matched (num open parens + 1)
518     int start0; // Cache of start[0]
519     int end0; // Cache of start[0]
520     int start1; // Cache of start[1]
521     int end1; // Cache of start[1]
522     int start2; // Cache of start[2]
523     int end2; // Cache of start[2]
524     int[] startn; // Lazy-alloced array of sub-expression starts
525     int[] endn; // Lazy-alloced array of sub-expression ends
526 
527     // Backreferences
528     int[] startBackref; // Lazy-alloced array of backref starts
529     int[] endBackref; // Lazy-alloced array of backref ends
530 
531     /**
532      * Flag bit that indicates that subst should replace all occurrences of this
533      * regular expression.
534      */
535     public static final int REPLACE_ALL = 0x0000;
536 
537     /**
538      * Flag bit that indicates that subst should only replace the first occurrence
539      * of this regular expression.
540      */
541     public static final int REPLACE_FIRSTONLY = 0x0001;
542 
543     /**
544      * Constructs a regular expression matcher with no initial program.
545      * This is likely to be an uncommon practice, but is still supported.
546      */
547     public RE() {
548         this((REProgram) null, MATCH_NORMAL);
549     }
550 
551     /**
552      * Construct a matcher for a pre-compiled regular expression from program
553      * (bytecode) data.
554      *
555      * @param program Compiled regular expression program
556      * @see RECompiler
557      */
558     public RE(REProgram program) {
559         this(program, MATCH_NORMAL);
560     }
561 
562     /**
563      * Construct a matcher for a pre-compiled regular expression from program
564      * (bytecode) data.  Permits special flags to be passed in to modify matching
565      * behaviour.
566      *
567      * @param program    Compiled regular expression program (see RECompiler and/or recompile)
568      * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
569      *                   <p/>
570      *                   <pre>
571      *                                     <p/>
572      *                                       MATCH_NORMAL              // Normal (case-sensitive) matching
573      *                                       MATCH_CASEINDEPENDENT     // Case folded comparisons
574      *                                       MATCH_MULTILINE           // Newline matches as BOL/EOL
575      *                                     <p/>
576      *                                     </pre>
577      * @see RECompiler
578      * @see REProgram
579      */
580     public RE(REProgram program, int matchFlags) {
581         setProgram(program);
582         setMatchFlags(matchFlags);
583     }
584 
585     /**
586      * Constructs a regular expression matcher from a String by compiling it
587      * using a new instance of RECompiler.  If you will be compiling many
588      * expressions, you may prefer to use a single RECompiler object instead.
589      *
590      * @param pattern The regular expression pattern to compile.
591      * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
592      * @see RECompiler
593      */
594     public RE(String   pattern)
595             throws RESyntaxException {
596         this(pattern, MATCH_NORMAL);
597     }
598 
599     /**
600      * Constructs a regular expression matcher from a String by compiling it
601      * using a new instance of RECompiler.  If you will be compiling many
602      * expressions, you may prefer to use a single RECompiler object instead.
603      *
604      * @param pattern    The regular expression pattern to compile.
605      * @param matchFlags The matching style
606      * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
607      * @see RECompiler
608      */
609     public RE(String   pattern, int matchFlags)
610             throws RESyntaxException {
611         this(new RECompiler().compile(pattern));
612         setMatchFlags(matchFlags);
613     }
614 
615     /**
616      * Performs lazy allocation of subexpression arrays
617      */
618     private final void allocParens() {
619 
620         // Allocate arrays for subexpressions
621         startn = new int[maxParen];
622         endn = new int[maxParen];
623 
624         // Set sub-expression pointers to invalid values
625         for (int i = 0; i < maxParen; i++) {
626             startn[i] = -1;
627             endn[i] = -1;
628         }
629     }
630 
631     /**
632      * Returns the current match behaviour flags.
633      *
634      * @return Current match behaviour flags (RE.MATCH_*).
635      *         <p/>
636      *         <pre>
637      *                 <p/>
638      *                   MATCH_NORMAL              // Normal (case-sensitive) matching
639      *                   MATCH_CASEINDEPENDENT     // Case folded comparisons
640      *                   MATCH_MULTILINE           // Newline matches as BOL/EOL
641      *                 <p/>
642      *                 </pre>
643      * @see #setMatchFlags
644      */
645     public int getMatchFlags() {
646         return matchFlags;
647     }
648 
649     /**
650      * Gets the contents of a parenthesized subexpression after a successful match.
651      *
652      * @param which Nesting level of subexpression
653      * @return String
654      */
655     public String   getParen(int which) {
656         if (which < parenCount) {
657             return search.substring(getParenStart(which), getParenEnd(which));
658         }
659 
660         return null;
661     }
662 
663     /**
664      * Returns the number of parenthesized subexpressions available after a successful match.
665      *
666      * @return Number of available parenthesized subexpressions
667      */
668     public int getParenCount() {
669         return parenCount;
670     }
671 
672     /**
673      * Returns the end index of a given paren level.
674      *
675      * @param which Nesting level of subexpression
676      * @return String index
677      */
678     public final int getParenEnd(int which) {
679         if (which < parenCount) {
680             switch (which) {
681                 case 0:
682                     return end0;
683 
684                 case 1:
685                     return end1;
686 
687                 case 2:
688                     return end2;
689 
690                 default:
691 
692                     if (endn == null) {
693                         allocParens();
694                     }
695 
696                     return endn[which];
697             }
698         }
699 
700         return -1;
701     }
702 
703     /**
704      * Returns the length of a given paren level.
705      *
706      * @param which Nesting level of subexpression
707      * @return Number of characters in the parenthesized subexpression
708      */
709     public final int getParenLength(int which) {
710         if (which < parenCount) {
711             return getParenEnd(which) - getParenStart(which);
712         }
713 
714         return -1;
715     }
716 
717     /**
718      * Returns the start index of a given paren level.
719      *
720      * @param which Nesting level of subexpression
721      * @return String index
722      */
723     public final int getParenStart(int which) {
724         if (which < parenCount) {
725             switch (which) {
726                 case 0:
727                     return start0;
728 
729                 case 1:
730                     return start1;
731 
732                 case 2:
733                     return start2;
734 
735                 default:
736 
737                     if (startn == null) {
738                         allocParens();
739                     }
740 
741                     return startn[which];
742             }
743         }
744 
745         return -1;
746     }
747 
748     /**
749      * Returns the current regular expression program in use by this matcher object.
750      *
751      * @return Regular expression program
752      * @see #setProgram
753      */
754     public REProgram getProgram() {
755         return program;
756     }
757 
758     /**
759      * Returns an array of Strings, whose toString representation matches a regular
760      * expression. This method works like the Perl function of the same name.  Given
761      * a regular expression of "a*b" and an array of String objects of [foo, aab, zzz,
762      * aaaab], the array of Strings returned by grep would be [aab, aaaab].
763      *
764      * @param search Array of Objects to search
765      * @return Array of Objects whose toString value matches this regular expression.
766      */
767     public String  [] grep(Object  [] search) {
768 
769         // Create new vector to hold return items
770         Vector   v = new Vector  ();
771 
772         // Traverse array of objects
773         for (int i = 0; i < search.length; i++) {
774 
775             // Get next object as a string
776             String   s = search[i].toString();
777 
778             // If it matches this regexp, add it to the list
779             if (match(s)) {
780                 v.addElement(s);
781             }
782         }
783 
784         // Return vector as an array of strings
785         String  [] ret = new String  [v.size()];
786         v.copyInto(ret);
787 
788         return ret;
789     }
790 
791     /**
792      * Throws an Error representing an internal error condition probably resulting
793      * from a bug in the regular expression compiler (or possibly data corruption).
794      * In practice, this should be very rare.
795      *
796      * @param s Error description
797      */
798     protected void internalError(String   s)
799             throws Error   {
800         throw new Error  ("RE internal error: " + s);
801     }
802 
803     /**
804      * @return true if at the i-th position in the 'search' a newline ends
805      */
806     private boolean isNewline(int i) {
807         if (i < NEWLINE.length() - 1) {
808             return false;
809         }
810         if (search.charAt(i) == '\n') {
811             return true;
812         }
813         for (int j = NEWLINE.length() - 1; j >= 0; j--, i--) {
814             if (NEWLINE.charAt(j) != search.charAt(i)) {
815                 return false;
816             }
817         }
818 
819         return true;
820     }
821 
822     /**
823      * Matches the current regular expression program against a character array,
824      * starting at a given index.
825      *
826      * @param search String to match against
827      * @param i      Index to start searching at
828      * @return True if string matched
829      */
830     public boolean match(CharacterIterator search, int i) {
831 
832         // There is no compiled program to search with!
833         if (program == null) {
834 
835             // This should be uncommon enough to be an error case rather
836             // than an exception (which would have to be handled everywhere)
837             internalError("No RE program to run!");
838         }
839 
840         // Save string to search
841         this.search = search;
842 
843         // Can we optimize the search by looking for a prefix string?
844         if (program.prefix == null) {
845 
846             // Unprefixed matching must try for a match at each character
847             for (; !search.isEnd(i - 1); i++) {
848 
849                 // Try a match at index i
850                 if (matchAt(i)) {
851                     return true;
852                 }
853             }
854 
855             return false;
856         } else {
857 
858             // Prefix-anchored matching is possible
859             char[] prefix = program.prefix;
860 
861             for (; !search.isEnd(i + prefix.length - 1); i++) {
862 
863                 // If the first character of the prefix matches
864                 if (search.charAt(i) == prefix[0]) {
865 
866                     // Save first character position
867                     int firstChar = i++;
868                     int k;
869 
870                     for (k = 1; k < prefix.length;) {
871 
872                         // If there's a mismatch of any character in the prefix, give up
873                         if (search.charAt(i++) != prefix[k++]) {
874                             break;
875                         }
876                     }
877                     // See if the whole prefix string matched
878                     if (k == prefix.length) {
879 
880                         // We matched the full prefix at firstChar, so try it
881                         if (matchAt(firstChar)) {
882                             return true;
883                         }
884                     }
885 
886                     // Match failed, reset i to continue the search
887                     i = firstChar;
888                 }
889             }
890 
891             return false;
892         }
893     }
894 
895     /**
896      * Matches the current regular expression program against a String.
897      *
898      * @param search String to match against
899      * @return True if string matched
900      */
901     public boolean match(String   search) {
902         return match(search, 0);
903     }
904 
905     /**
906      * Matches the current regular expression program against a character array,
907      * starting at a given index.
908      *
909      * @param search String to match against
910      * @param i      Index to start searching at
911      * @return True if string matched
912      */
913     public boolean match(String   search, int i) {
914         return match(new StringCharacterIterator(search), i);
915     }
916 
917     /**
918      * Match the current regular expression program against the current
919      * input string, starting at index i of the input string.  This method
920      * is only meant for internal use.
921      *
922      * @param i The input string index to start matching at
923      * @return True if the input matched the expression
924      */
925     protected boolean matchAt(int i) {
926 
927         // Initialize start pointer, paren cache and paren count
928         start0 = -1;
929         end0 = -1;
930         start1 = -1;
931         end1 = -1;
932         start2 = -1;
933         end2 = -1;
934         startn = null;
935         endn = null;
936         parenCount = 1;
937         setParenStart(0, i);
938 
939         // Allocate backref arrays (unless optimizations indicate otherwise)
940         if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) {
941             startBackref = new int[maxParen];
942             endBackref = new int[maxParen];
943         }
944 
945         // Match against string
946         int idx;
947 
948         if ((idx = matchNodes(0, maxNode, i)) != -1) {
949             setParenEnd(0, idx);
950 
951             return true;
952         }
953 
954         // Didn't match
955         parenCount = 0;
956 
957         return false;
958     }
959 
960     /**
961      * Try to match a string against a subset of nodes in the program
962      *
963      * @param firstNode Node to start at in program
964      * @param lastNode  Last valid node (used for matching a subexpression without
965      *                  matching the rest of the program as well).
966      * @param idxStart  Starting position in character array
967      * @return Final input array index if match succeeded.  -1 if not.
968      */
969     protected int matchNodes(int firstNode, int lastNode, int idxStart) {
970 
971         // Our current place in the string
972         int idx = idxStart;
973 
974         // Loop while node is valid
975         int next;
976         int opcode;
977         int opdata;
978         int idxNew;
979         char[] instruction = program.instruction;
980 
981         for (int node = firstNode; node < lastNode;) {
982             opcode = instruction[node + offsetOpcode];
983             next = node + (short) instruction[node + offsetNext];
984             opdata = instruction[node + offsetOpdata];
985 
986             switch (opcode) {
987                 case OP_RELUCTANTMAYBE:
988                     {
989                         int once = 0;
990 
991                         do {
992 
993                             // Try to match the rest without using the reluctant subexpr
994                             if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
995                                 return idxNew;
996                             }
997                         } while ((once++ == 0) && (idx = matchNodes(node + nodeSize,
998                                 next, idx)) != -1);
999 
1000                        return -1;
1001                    }
1002                case OP_RELUCTANTPLUS:
1003                    while ((idx = matchNodes(node + nodeSize, next, idx)) != -1) {
1004
1005                        // Try to match the rest without using the reluctant subexpr
1006                        if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
1007                            return idxNew;
1008                        }
1009                    }
1010
1011                    return -1;
1012
1013                case OP_RELUCTANTSTAR:
1014                    do {
1015
1016                        // Try to match the rest without using the reluctant subexpr
1017                        if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
1018                            return idxNew;
1019                        }
1020                    } while ((idx = matchNodes(node + nodeSize, next, idx)) != -1);
1021
1022                    return -1;
1023
1024                case OP_OPEN:
1025
1026                    // Match subexpression
1027                    if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) {
1028                        startBackref[opdata] = idx;
1029                    }
1030                    if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
1031
1032                        // Increase valid paren count
1033                        if ((opdata + 1) > parenCount) {
1034                            parenCount = opdata + 1;
1035                        }
1036                        // Don't set paren if already set later on
1037                        if (getParenStart(opdata) == -1) {
1038                            setParenStart(opdata, idx);
1039                        }
1040                    }
1041
1042                    return idxNew;
1043
1044                case OP_CLOSE:
1045
1046                    // Done matching subexpression
1047                    if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) {
1048                        endBackref[opdata] = idx;
1049                    }
1050                    if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
1051
1052                        // Increase valid paren count
1053                        if ((opdata + 1) > parenCount) {
1054                            parenCount = opdata + 1;
1055                        }
1056                        // Don't set paren if already set later on
1057                        if (getParenEnd(opdata) == -1) {
1058                            setParenEnd(opdata, idx);
1059                        }
1060                    }
1061
1062                    return idxNew;
1063
1064                case OP_BACKREF:
1065                    {
1066
1067                        // Get the start and end of the backref
1068                        int s = startBackref[opdata];
1069                        int e = endBackref[opdata];
1070
1071                        // We don't know the backref yet
1072                        if (s == -1 || e == -1) {
1073                            return -1;
1074                        }
1075                        // The backref is empty size
1076                        if (s == e) {
1077                            break;
1078                        }
1079
1080                        // Get the length of the backref
1081                        int l = e - s;
1082
1083                        // If there's not enough input left, give up.
1084                        if (search.isEnd(idx + l)) {
1085                            return -1;
1086                        }
1087                        // Case fold the backref?
1088                        if ((matchFlags & MATCH_CASEINDEPENDENT) != 0) {
1089
1090                            // Compare backref to input, case-folding as we go
1091                            for (int i = 0; i < l; i++) {
1092                                if (Character.toLowerCase(search.charAt(idx++)) != Character.toLowerCase(search.charAt(
1093                                        s + i))) {
1094                                    return -1;
1095                                }
1096                            }
1097                        } else {
1098
1099                            // Compare backref to input
1100                            for (int i = 0; i < l; i++) {
1101                                if (search.charAt(idx++) != search.charAt(s + i)) {
1102                                    return -1;
1103                                }
1104                            }
1105                        }
1106                    }
1107
1108                    break;
1109
1110                case OP_BOL:
1111
1112                    // Fail if we're not at the start of the string
1113                    if (idx != 0) {
1114
1115                        // If we're multiline matching, we could still be at the start of a line
1116                        if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE) {
1117
1118                            // If not at start of line, give up
1119                            if (idx <= 0 || !isNewline(idx - 1)) {
1120                                return -1;
1121                            } else {
1122                                break;
1123                            }
1124                        }
1125
1126                        return -1;
1127                    }
1128
1129                    break;
1130
1131                case OP_EOL:
1132
1133                    // If we're not at the end of string
1134                    if (!search.isEnd(0) && !search.isEnd(idx)) {
1135
1136                        // If we're multi-line matching
1137                        if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE) {
1138
1139                            // Give up if we're not at the end of a line
1140                            if (!isNewline(idx)) {
1141                                return -1;
1142                            } else {
1143                                break;
1144                            }
1145                        }
1146
1147                        return -1;
1148                    }
1149
1150                    break;
1151
1152                case OP_ESCAPE:
1153
1154                    // Which escape?
1155                    switch (opdata) {
1156
1157                        // Word boundary match
1158                        case E_NBOUND:
1159                        case E_BOUND:
1160                            {
1161                                char cLast = ((idx == getParenStart(0))
1162                                        ? '\n' : search.charAt(idx - 1));
1163                                char cNext = ((search.isEnd(idx))
1164                                        ? '\n' : search.charAt(idx));
1165
1166                                if ((Character.isLetterOrDigit(cLast) == Character.isLetterOrDigit(cNext)) == (opdata == E_BOUND)) {
1167                                    return -1;
1168                                }
1169                            }
1170
1171                            break;
1172
1173                            // Alpha-numeric, digit, space, javaLetter, javaLetterOrDigit
1174                        case E_ALNUM:
1175                        case E_NALNUM:
1176                        case E_DIGIT:
1177                        case E_NDIGIT:
1178                        case E_SPACE:
1179                        case E_NSPACE:
1180
1181                            // Give up if out of input
1182                            if (search.isEnd(idx)) {
1183                                return -1;
1184                            }
1185                            // Switch on escape
1186                            switch (opdata) {
1187                                case E_ALNUM:
1188                                case E_NALNUM:
1189                                    if (!(Character.isLetterOrDigit(search.charAt(idx)) == (opdata == E_ALNUM))) {
1190                                        return -1;
1191                                    }
1192
1193                                    break;
1194
1195                                case E_DIGIT:
1196                                case E_NDIGIT:
1197                                    if (!(Character.isDigit(search.charAt(idx)) == (opdata == E_DIGIT))) {
1198                                        return -1;
1199                                    }
1200
1201                                    break;
1202
1203                                case E_SPACE:
1204                                case E_NSPACE:
1205                                    if (!(Character.isWhitespace(search.charAt(idx)) == (opdata == E_SPACE))) {
1206                                        return -1;
1207                                    }
1208
1209                                    break;
1210                            }
1211
1212                            idx++;
1213                            break;
1214
1215                        default:
1216                            internalError("Unrecognized escape '" + opdata +
1217                                    "'");
1218                    }
1219
1220                    break;
1221
1222                case OP_ANY:
1223
1224                    // Match anything but a newline
1225                    if (search.isEnd(idx) || search.charAt(idx++) == '\n') {
1226                        return -1;
1227                    }
1228
1229                    break;
1230
1231                case OP_ATOM:
1232                    {
1233
1234                        // Match an atom value
1235                        if (search.isEnd(idx)) {
1236                            return -1;
1237                        }
1238
1239                        // Get length of atom and starting index
1240                        int lenAtom = opdata;
1241                        int startAtom = node + nodeSize;
1242
1243                        // Give up if not enough input remains to have a match
1244                        if (search.isEnd(lenAtom + idx - 1)) {
1245                            return -1;
1246                        }
1247                        // Match atom differently depending on casefolding flag
1248                        if ((matchFlags & MATCH_CASEINDEPENDENT) != 0) {
1249                            for (int i = 0; i < lenAtom; i++) {
1250                                if (Character.toLowerCase(search.charAt(idx++)) != Character.toLowerCase(
1251                                        instruction[startAtom + i])) {
1252                                    return -1;
1253                                }
1254                            }
1255                        } else {
1256                            for (int i = 0; i < lenAtom; i++) {
1257                                if (search.charAt(idx++) != instruction[startAtom + i]) {
1258                                    return -1;
1259                                }
1260                            }
1261                        }
1262                    }
1263
1264                    break;
1265
1266                case OP_POSIXCLASS:
1267                    {
1268
1269                        // Out of input?
1270                        if (search.isEnd(idx)) {
1271                            return -1;
1272                        }
1273                        switch (opdata) {
1274                            case POSIX_CLASS_ALNUM:
1275                                if (!Character.isLetterOrDigit(search.charAt(idx))) {
1276                                    return -1;
1277                                }
1278
1279                                break;
1280
1281                            case POSIX_CLASS_ALPHA:
1282                                if (!Character.isLetter(search.charAt(idx))) {
1283                                    return -1;
1284                                }
1285
1286                                break;
1287
1288                            case POSIX_CLASS_DIGIT:
1289                                if (!Character.isDigit(search.charAt(idx))) {
1290                                    return -1;
1291                                }
1292
1293                                break;
1294
1295                            case POSIX_CLASS_BLANK: // JWL - bugbug: is this right??
1296                                if (!Character.isSpaceChar(search.charAt(idx))) {
1297                                    return -1;
1298                                }
1299
1300                                break;
1301
1302                            case POSIX_CLASS_SPACE:
1303                                if (!Character.isWhitespace(search.charAt(idx))) {
1304                                    return -1;
1305                                }
1306
1307                                break;
1308
1309                            case POSIX_CLASS_CNTRL:
1310                                if (Character.getType(search.charAt(idx)) != Character.CONTROL) {
1311                                    return -1;
1312                                }
1313
1314                                break;
1315
1316                            case POSIX_CLASS_GRAPH: // JWL - bugbug???
1317                                switch (Character.getType(search.charAt(idx))) {
1318                                    case Character.MATH_SYMBOL:
1319                                    case Character.CURRENCY_SYMBOL:
1320                                    case Character.MODIFIER_SYMBOL:
1321                                    case Character.OTHER_SYMBOL:
1322                                        break;
1323
1324                                    default:
1325                                        return -1;
1326                                }
1327
1328                                break;
1329
1330                            case POSIX_CLASS_LOWER:
1331                                if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER) {
1332                                    return -1;
1333                                }
1334
1335                                break;
1336
1337                            case POSIX_CLASS_UPPER:
1338                                if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER) {
1339                                    return -1;
1340                                }
1341
1342                                break;
1343
1344                            case POSIX_CLASS_PRINT:
1345                                if (Character.getType(search.charAt(idx)) == Character.CONTROL) {
1346                                    return -1;
1347                                }
1348
1349                                break;
1350
1351                            case POSIX_CLASS_PUNCT:
1352                                {
1353                                    int type = Character.getType(search.charAt(idx));
1354
1355                                    switch (type) {
1356                                        case Character.DASH_PUNCTUATION:
1357                                        case Character.START_PUNCTUATION:
1358                                        case Character.END_PUNCTUATION:
1359                                        case Character.CONNECTOR_PUNCTUATION:
1360                                        case Character.OTHER_PUNCTUATION:
1361                                            break;
1362
1363                                        default:
1364                                            return -1;
1365                                    }
1366                                }
1367
1368                                break;
1369
1370                            case POSIX_CLASS_XDIGIT: // JWL - bugbug??
1371                                {
1372                                    boolean isXDigit = ((search.charAt(idx) >= '0' &&
1373                                            search.charAt(idx) <= '9') ||
1374                                            (search.charAt(idx) >= 'a' &&
1375                                            search.charAt(idx) <= 'f') ||
1376                                            (search.charAt(idx) >= 'A' &&
1377                                            search.charAt(idx) <= 'F'));
1378
1379                                    if (!isXDigit) {
1380                                        return -1;
1381                                    }
1382                                }
1383
1384                                break;
1385
1386                            case POSIX_CLASS_JSTART:
1387                                if (!Character.isJavaIdentifierStart(search.charAt(idx))) {
1388                                    return -1;
1389                                }
1390
1391                                break;
1392
1393                            case POSIX_CLASS_JPART:
1394                                if (!Character.isJavaIdentifierPart(search.charAt(idx))) {
1395                                    return -1;
1396                                }
1397
1398                                break;
1399
1400                            default:
1401                                internalError("Bad posix class");
1402                                break;
1403                        }
1404
1405                        // Matched.
1406                        idx++;
1407                    }
1408
1409                    break;
1410
1411                case OP_ANYOF:
1412                    {
1413
1414                        // Out of input?
1415                        if (search.isEnd(idx)) {
1416                            return -1;
1417                        }
1418
1419                        // Get character to match against character class and maybe casefold
1420                        char c = search.charAt(idx);
1421                        boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
1422
1423                        if (caseFold) {
1424                            c = Character.toLowerCase(c);
1425                        }
1426
1427                        // Loop through character class checking our match character
1428                        int idxRange = node + nodeSize;
1429                        int idxEnd = idxRange + (opdata * 2);
1430                        boolean match = false;
1431
1432                        for (int i = idxRange; i < idxEnd;) {
1433
1434                            // Get start, end and match characters
1435                            char s = instruction[i++];
1436                            char e = instruction[i++];
1437
1438                            // Fold ends of range and match character
1439                            if (caseFold) {
1440                                s = Character.toLowerCase(s);
1441                                e = Character.toLowerCase(e);
1442                            }
1443                            // If the match character is in range, break out
1444                            if (c >= s && c <= e) {
1445                                match = true;
1446                                break;
1447                            }
1448                        }
1449                        // Fail if we didn't match the character class
1450                        if (!match) {
1451                            return -1;
1452                        }
1453
1454                        idx++;
1455                    }
1456
1457                    break;
1458
1459                case OP_BRANCH:
1460                    {
1461
1462                        // Check for choices
1463                        if (instruction[next + offsetOpcode] != OP_BRANCH) {
1464
1465                            // If there aren't any other choices, just evaluate this branch.
1466                            node += nodeSize;
1467                            continue;
1468                        }
1469
1470                        // Try all available branches
1471                        short nextBranch;
1472
1473                        do {
1474
1475                            // Try matching the branch against the string
1476                            if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1) {
1477                                return idxNew;
1478                            }
1479
1480                            // Go to next branch (if any)
1481                            nextBranch = (short) instruction[node + offsetNext];
1482                            node += nextBranch;
1483                        } while (nextBranch != 0 && (instruction[node + offsetOpcode] == OP_BRANCH));
1484                        // Failed to match any branch!
1485                        return -1;
1486                    }
1487                case OP_NOTHING:
1488                case OP_GOTO:
1489
1490                    // Just advance to the next node without doing anything
1491                    break;
1492
1493                case OP_END:
1494
1495                    // Match has succeeded!
1496                    setParenEnd(0, idx);
1497
1498                    return idx;
1499
1500                default:
1501
1502                    // Corrupt program
1503                    internalError("Invalid opcode '" + opcode + "'");
1504            }
1505
1506            // Advance to the next node in the program
1507            node = next;
1508        }
1509
1510        // We "should" never end up here
1511        internalError("Corrupt program");
1512
1513        return -1;
1514    }
1515
1516    /**
1517     * Sets match behaviour flags which alter the way RE does matching.
1518     *
1519     * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
1520     *                   <p/>
1521     *                   <pre>
1522     *                                     <p/>
1523     *                                       MATCH_NORMAL              // Normal (case-sensitive) matching
1524     *                                       MATCH_CASEINDEPENDENT     // Case folded comparisons
1525     *                                       MATCH_MULTILINE           // Newline matches as BOL/EOL
1526     *                                     <p/>
1527     *                                     </pre>
1528     */
1529    public void setMatchFlags(int matchFlags) {
1530        this.matchFlags = matchFlags;
1531    }
1532
1533    /**
1534     * Sets the end of a paren level
1535     *
1536     * @param which Which paren level
1537     * @param i     Index in input array
1538     */
1539    protected final void setParenEnd(int which, int i) {
1540        if (which < parenCount) {
1541            switch (which) {
1542                case 0:
1543                    end0 = i;
1544                    break;
1545
1546                case 1:
1547                    end1 = i;
1548                    break;
1549
1550                case 2:
1551                    end2 = i;
1552                    break;
1553
1554                default:
1555
1556                    if (endn == null) {
1557                        allocParens();
1558                    }
1559
1560                    endn[which] = i;
1561                    break;
1562            }
1563        }
1564    }
1565
1566    /**
1567     * Sets the start of a paren level
1568     *
1569     * @param which Which paren level
1570     * @param i     Index in input array
1571     */
1572    protected final void setParenStart(int which, int i) {
1573        if (which < parenCount) {
1574            switch (which) {
1575                case 0:
1576                    start0 = i;
1577                    break;
1578
1579                case 1:
1580                    start1 = i;
1581                    break;
1582
1583                case 2:
1584                    start2 = i;
1585                    break;
1586
1587                default:
1588
1589                    if (startn == null) {
1590                        allocParens();
1591                    }
1592
1593                    startn[which] = i;
1594                    break;
1595            }
1596        }
1597    }
1598
1599    /**
1600     * Sets the current regular expression program used by this matcher object.
1601     *
1602     * @param program Regular expression program compiled by RECompiler.
1603     * @see RECompiler
1604     * @see REProgram
1605     */
1606    public void setProgram(REProgram program) {
1607        this.program = program;
1608    }
1609
1610    /**
1611     * Converts a 'simplified' regular expression to a full regular expression
1612     *
1613     * @param pattern The pattern to convert
1614     * @return The full regular expression
1615     */
1616    public static String   simplePatternToFullRegularExpression(String   pattern) {
1617        StringBuffer   buf = new StringBuffer  ();
1618
1619        for (int i = 0; i < pattern.length(); i++) {
1620            char c = pattern.charAt(i);
1621
1622            switch (c) {
1623                case '*':
1624                    buf.append(".*");
1625                    break;
1626
1627                case '.':
1628                case '[':
1629                case ']':
1630                case '\\':
1631                case '+':
1632                case '?':
1633                case '{':
1634                case '}':
1635                case '$':
1636                case '^':
1637                case '|':
1638                case '(':
1639                case ')':
1640                    buf.append('\\');
1641
1642                default:
1643                    buf.append(c);
1644                    break;
1645            }
1646        }
1647
1648        return buf.toString();
1649    }
1650
1651    /**
1652     * Splits a string into an array of strings on regular expression boundaries.
1653     * This function works the same way as the Perl function of the same name.
1654     * Given a regular expression of "[ab]+" and a string to split of
1655     * "xyzzyababbayyzabbbab123", the result would be the array of Strings
1656     * "[xyzzy, yyz, 123]".
1657     *
1658     * @param s String to split on this regular exression
1659     * @return Array of strings
1660     */
1661    public String  [] split(String   s) {
1662
1663        // Create new vector
1664        Vector   v = new Vector  ();
1665
1666        // Start at position 0 and search the whole string
1667        int pos = 0;
1668        int len = s.length();
1669
1670        // Try a match at each position
1671        while (pos < len && match(s, pos)) {
1672
1673            // Get start of match
1674            int start = getParenStart(0);
1675
1676            // Get end of match
1677            int newpos = getParenEnd(0);
1678
1679            // Check if no progress was made
1680            if (newpos == pos) {
1681                v.addElement(s.substring(pos, start + 1));
1682                newpos++;
1683            } else {
1684                v.addElement(s.substring(pos, start));
1685            }
1686
1687            // Move to new position
1688            pos = newpos;
1689        }
1690
1691        // Push remainder if it's not empty
1692        String   remainder = s.substring(pos);
1693
1694        if (remainder.length() != 0) {
1695            v.addElement(remainder);
1696        }
1697
1698        // Return vector as an array of strings
1699        String  [] ret = new String  [v.size()];
1700        v.copyInto(ret);
1701
1702        return ret;
1703    }
1704
1705    /**
1706     * Substitutes a string for this regular expression in another string.
1707     * This method works like the Perl function of the same name.
1708     * Given a regular expression of "a*b", a String to substituteIn of
1709     * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
1710     * resulting String returned by subst would be "-foo-garply-wacky-".
1711     *
1712     * @param substituteIn String to substitute within
1713     * @param substitution String to substitute for all matches of this regular expression.
1714     * @return The string substituteIn with zero or more occurrences of the current
1715     *         regular expression replaced with the substitution String (if this regular
1716     *         expression object doesn't match at any position, the original String is returned
1717     *         unchanged).
1718     */
1719    public String   subst(String   substituteIn, String   substitution) {
1720        return subst(substituteIn, substitution, REPLACE_ALL);
1721    }
1722
1723    /**
1724     * Substitutes a string for this regular expression in another string.
1725     * This method works like the Perl function of the same name.
1726     * Given a regular expression of "a*b", a String to substituteIn of
1727     * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
1728     * resulting String returned by subst would be "-foo-garply-wacky-".
1729     *
1730     * @param substituteIn String to substitute within
1731     * @param substitution String to substitute for matches of this regular expression
1732     * @param flags        One or more bitwise flags from REPLACE_*.  If the REPLACE_FIRSTONLY
1733     *                     flag bit is set, only the first occurrence of this regular expression is replaced.
1734     *                     If the bit is not set (REPLACE_ALL), all occurrences of this pattern will be
1735     *                     replaced.
1736     * @return The string substituteIn with zero or more occurrences of the current
1737     *         regular expression replaced with the substitution String (if this regular
1738     *         expression object doesn't match at any position, the original String is returned
1739     *         unchanged).
1740     */
1741    public String   subst(String   substituteIn, String   substitution, int flags) {
1742
1743        // String to return
1744        StringBuffer   ret = new StringBuffer  ();
1745
1746        // Start at position 0 and search the whole string
1747        int pos = 0;
1748        int len = substituteIn.length();
1749
1750        // Try a match at each position
1751        while (pos < len && match(substituteIn, pos)) {
1752
1753            // Append string before match
1754            ret.append(substituteIn.substring(pos, getParenStart(0)));
1755
1756            // Append substitution
1757            ret.append(substitution);
1758
1759            // Move forward, skipping past match
1760            int newpos = getParenEnd(0);
1761
1762            // We always want to make progress!
1763            if (newpos == pos) {
1764                newpos++;
1765            }
1766
1767            // Try new position
1768            pos = newpos;
1769
1770            // Break out if we're only supposed to replace one occurrence
1771            if ((flags & REPLACE_FIRSTONLY) != 0) {
1772                break;
1773            }
1774        }
1775        // If there's remaining input, append it
1776        if (pos < len) {
1777            ret.append(substituteIn.substring(pos));
1778        }
1779
1780        // Return string buffer as string
1781        return ret.toString();
1782    }
1783}
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags