Java > Open Source Codes > com > sun > org > apache > regexp > internal > RE


1   package com.sun.org.apache.regexp.internal;
2   
3   /*
4    * ====================================================================
5    * 
6    * The Apache Software License, Version 1.1
7    *
8    * Copyright (c) 1999 The Apache Software Foundation.  All rights 
9    * reserved.
10   *
11   * Redistribution and use in source and binary forms, with or without
12   * modification, are permitted provided that the following conditions
13   * are met:
14   *
15   * 1. Redistributions of source code must retain the above copyright
16   *    notice, this list of conditions and the following disclaimer. 
17   *
18   * 2. Redistributions in binary form must reproduce the above copyright
19   *    notice, this list of conditions and the following disclaimer in
20   *    the documentation and/or other materials provided with the
21   *    distribution.
22   *
23   * 3. The end-user documentation included with the redistribution, if
24   *    any, must include the following acknowlegement:  
25   *       "This product includes software developed by the 
26   *        Apache Software Foundation (http://www.apache.org/)."
27   *    Alternately, this acknowlegement may appear in the software itself,
28   *    if and wherever such third-party acknowlegements normally appear.
29   *
30   * 4. The names "The Jakarta Project", "Jakarta-Regexp", and "Apache Software
31   *    Foundation" must not be used to endorse or promote products derived
32   *    from this software without prior written permission. For written 
33   *    permission, please contact apache@apache.org.
34   *
35   * 5. Products derived from this software may not be called "Apache"
36   *    nor may "Apache" appear in their names without prior written
37   *    permission of the Apache Group.
38   *
39   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
40   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
41   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
42   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
43   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
45   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
46   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
47   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
48   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
49   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50   * SUCH DAMAGE.
51   * ====================================================================
52   *
53   * This software consists of voluntary contributions made by many
54   * individuals on behalf of the Apache Software Foundation.  For more
55   * information on the Apache Software Foundation, please see
56   * <http://www.apache.org/>.
57   *
58   */ 
59   
60  import java.util.Vector  ;
61  
62  /**
63   * RE is an efficient, lightweight regular expression evaluator/matcher class.
64   * Regular expressions are pattern descriptions which enable sophisticated matching of
65   * strings.  In addition to being able to match a string against a pattern, you
66   * can also extract parts of the match.  This is especially useful in text parsing!
67   * Details on the syntax of regular expression patterns are given below.
68   *
69   * <p>
70   *
71   * To compile a regular expression (RE), you can simply construct an RE matcher
72   * object from the string specification of the pattern, like this:
73   *
74   * <pre>
75   *
76   *     RE r = new RE("a*b");
77   *
78   * </pre>
79   *
80   * <p>
81   *
82   * Once you have done this, you can call either of the RE.match methods to
83   * perform matching on a String.  For example:
84   *
85   * <pre>
86   *
87   *     boolean matched = r.match("aaaab");
88   *
89   * </pre>
90   *
91   * will cause the boolean matched to be set to true because the
92   * pattern "a*b" matches the string "aaaab".
93   *
94   * <p>
95   * If you were interested in the <i>number</i> of a's which matched the first
96   * part of our example expression, you could change the expression to
97   * "(a*)b".  Then when you compiled the expression and matched it against
98   * something like "xaaaab", you would get results like this:
99   *
100  * <pre>
101  *
102  *     RE r = new RE("(a*)b");                  // Compile expression
103  *     boolean matched = r.match("xaaaab");     // Match against "xaaaab"
104  *
105  * <br>
106  *
107  *     String wholeExpr = r.getParen(0);        // wholeExpr will be 'aaaab'
108  *     String insideParens = r.getParen(1);     // insideParens will be 'aaaa'
109  *
110  * <br>
111  *
112  *     int startWholeExpr = getParenStart(0);   // startWholeExpr will be index 1
113  *     int endWholeExpr = getParenEnd(0);       // endWholeExpr will be index 6
114  *     int lenWholeExpr = getParenLength(0);    // lenWholeExpr will be 5
115  *
116  * <br>
117  *
118  *     int startInside = getParenStart(1);      // startInside will be index 1
119  *     int endInside = getParenEnd(1);          // endInside will be index 5
120  *     int lenInside = getParenLength(1);       // lenInside will be 4
121  *
122  * </pre>
123  *
124  * You can also refer to the contents of a parenthesized expression within
125  * a regular expression itself.  This is called a 'backreference'.  The first
126  * backreference in a regular expression is denoted by \1, the second by \2
127  * and so on.  So the expression:
128  *
129  * <pre>
130  *
131  *     ([0-9]+)=\1
132  *
133  * </pre>
134  *
135  * will match any string of the form n=n (like 0=0 or 2=2).
136  *
137  * <p>
138  *
139  * The full regular expression syntax accepted by RE is described here:
140  *
141  * <pre>
142  *
143  * <br>
144  *
145  *  <b><font face=times roman>Characters</font></b>
146  *
147  * <br>
148  *
149  *    <i>unicodeChar</i>          Matches any identical unicode character
150  *    \                    Used to quote a meta-character (like '*')
151  *    \\                   Matches a single '\' character
152  *    \0nnn                Matches a given octal character
153  *    \xhh                 Matches a given 8-bit hexadecimal character
154  *    \\uhhhh               Matches a given 16-bit hexadecimal character
155  *    \t                   Matches an ASCII tab character
156  *    \n                   Matches an ASCII newline character
157  *    \r                   Matches an ASCII return character
158  *    \f                   Matches an ASCII form feed character
159  *
160  * <br>
161  *
162  *  <b><font face=times roman>Character Classes</font></b>
163  *
164  * <br>
165  *
166  *    [abc]                Simple character class
167  *    [a-zA-Z]             Character class with ranges
168  *    [^abc]               Negated character class
169  *
170  * <br>
171  *
172  *  <b><font face=times roman>Standard POSIX Character Classes</font></b>
173  *
174  * <br>
175  *
176  *    [:alnum:]            Alphanumeric characters. 
177  *    [:alpha:]            Alphabetic characters. 
178  *    [:blank:]            Space and tab characters. 
179  *    [:cntrl:]            Control characters. 
180  *    [:digit:]            Numeric characters. 
181  *    [:graph:]            Characters that are printable and are also visible. (A space is printable, but not visible, while an `a' is both.) 
182  *    [:lower:]            Lower-case alphabetic characters. 
183  *    [:print:]            Printable characters (characters that are not control characters.) 
184  *    [:punct:]            Punctuation characters (characters that are not letter, digits, control characters, or space characters). 
185  *    [:space:]            Space characters (such as space, tab, and formfeed, to name a few). 
186  *    [:upper:]            Upper-case alphabetic characters. 
187  *    [:xdigit:]           Characters that are hexadecimal digits.
188  *         
189  * <br>
190  *
191  *  <b><font face=times roman>Non-standard POSIX-style Character Classes</font></b>
192  *
193  * <br>
194  *
195  *    [:javastart:]        Start of a Java identifier
196  *    [:javapart:]         Part of a Java identifier
197  *
198  * <br>
199  *         
200  *  <b><font face=times roman>Predefined Classes</font></b>
201  *
202  * <br>
203  *
204  *    .                    Matches any character other than newline
205  *    \w                   Matches a "word" character (alphanumeric plus "_")
206  *    \W                   Matches a non-word character
207  *    \s                   Matches a whitespace character
208  *    \S                   Matches a non-whitespace character
209  *    \d                   Matches a digit character
210  *    \D                   Matches a non-digit character
211  *
212  * <br>
213  *
214  *  <b><font face=times roman>Boundary Matchers</font></b>
215  *
216  * <br>
217  *
218  *    ^                    Matches only at the beginning of a line
219  *    $                    Matches only at the end of a line
220  *    \b                   Matches only at a word boundary
221  *    \B                   Matches only at a non-word boundary
222  *
223  * <br>
224  *
225  *  <b><font face=times roman>Greedy Closures</font></b>
226  *
227  * <br>
228  *
229  *    A*                   Matches A 0 or more times (greedy)
230  *    A+                   Matches A 1 or more times (greedy)
231  *    A?                   Matches A 1 or 0 times (greedy)
232  *    A{n}                 Matches A exactly n times (greedy)
233  *    A{n,}                Matches A at least n times (greedy)
234  *    A{n,m}               Matches A at least n but not more than m times (greedy)
235  *
236  * <br>
237  *
238  *  <b><font face=times roman>Reluctant Closures</font></b>
239  *
240  * <br>
241  *
242  *    A*?                  Matches A 0 or more times (reluctant)
243  *    A+?                  Matches A 1 or more times (reluctant)
244  *    A??                  Matches A 0 or 1 times (reluctant)
245  *
246  * <br>
247  *
248  *  <b><font face=times roman>Logical Operators</font></b>
249  *
250  * <br>
251  *
252  *    AB                   Matches A followed by B
253  *    A|B                  Matches either A or B
254  *    (A)                  Used for subexpression grouping
255  *
256  * <br>
257  *
258  *  <b><font face=times roman>Backreferences</font></b>
259  *
260  * <br>
261  *
262  *    \1                   Backreference to 1st parenthesized subexpression
263  *    \2                   Backreference to 2nd parenthesized subexpression
264  *    \3                   Backreference to 3rd parenthesized subexpression
265  *    \4                   Backreference to 4th parenthesized subexpression
266  *    \5                   Backreference to 5th parenthesized subexpression
267  *    \6                   Backreference to 6th parenthesized subexpression
268  *    \7                   Backreference to 7th parenthesized subexpression
269  *    \8                   Backreference to 8th parenthesized subexpression
270  *    \9                   Backreference to 9th parenthesized subexpression
271  *
272  * <br>
273  *
274  * </pre>
275  *
276  * <p>
277  *
278  * All closure operators (+, *, ?, {m,n}) are greedy by default, meaning that they
279  * match as many elements of the string as possible without causing the overall
280  * match to fail.  If you want a closure to be reluctant (non-greedy), you can
281  * simply follow it with a '?'.  A reluctant closure will match as few elements
282  * of the string as possible when finding matches.  {m,n} closures don't currently
283  * support reluctancy.
284  *
285  * <p>
286  *
287  * RE runs programs compiled by the RECompiler class.  But the RE matcher class
288  * does not include the actual regular expression compiler for reasons of
289  * efficiency.  In fact, if you want to pre-compile one or more regular expressions,
290  * the 'recompile' class can be invoked from the command line to produce compiled
291  * output like this:
292  *
293  * <pre>
294  *
295  *    // Pre-compiled regular expression "a*b"
296  *    char[] re1Instructions =
297  *    {
298  *        0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041,
299  *        0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047,
300  *        0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000,
301  *        0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000,
302  *        0x0000,
303  *    };
304  *
305  *    <br>
306  *
307  *    REProgram re1 = new REProgram(re1Instructions);
308  *
309  * </pre>
310  *
311  * You can then construct a regular expression matcher (RE) object from the pre-compiled
312  * expression re1 and thus avoid the overhead of compiling the expression at runtime.
313  * If you require more dynamic regular expressions, you can construct a single RECompiler
314  * object and re-use it to compile each expression.  Similarly, you can change the
315  * program run by a given matcher object at any time.  However, RE and RECompiler are
316  * not threadsafe (for efficiency reasons, and because requiring thread safety in this
317  * class is deemed to be a rare requirement), so you will need to construct a separate
318  * compiler or matcher object for each thread (unless you do thread synchronization
319  * yourself).
320  *
321  * </pre>
322  * <br><p><br>
323  *
324  * <font color=red>
325  * <i>ISSUES:</i>
326  *
327  * <ul>
328  *  <li>com.weusours.util.re is not currently compatible with all standard POSIX regcomp flags
329  *  <li>com.weusours.util.re does not support POSIX equivalence classes ([=foo=] syntax) (I18N/locale issue)
330  *  <li>com.weusours.util.re does not support nested POSIX character classes (definitely should, but not completely trivial)
331  *  <li>com.weusours.util.re Does not support POSIX character collation concepts ([.foo.] syntax) (I18N/locale issue)
332  *  <li>Should there be different matching styles (simple, POSIX, Perl etc?)
333  *  <li>Should RE support character iterators (for backwards RE matching!)?
334  *  <li>Should RE support reluctant {m,n} closures (does anyone care)?
335  *  <li>Not *all* possibilities are considered for greediness when backreferences
336  *      are involved (as POSIX suggests should be the case).  The POSIX RE
337  *      "(ac*)c*d[ac]*\1", when matched against "acdacaa" should yield a match
338  *      of acdacaa where \1 is "a".  This is not the case in this RE package,
339  *      and actually Perl doesn't go to this extent either!  Until someone
340  *      actually complains about this, I'm not sure it's worth "fixing".
341  *      If it ever is fixed, test #137 in RETest.txt should be updated.
342  * </ul>
343  *
344  * </font>
345  *
346  * @see recompile
347  * @see RECompiler
348  *
349  * @author <a HREF="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
350  * @version $Id: RE.java,v 1.6 2000/08/22 17:19:38 jon Exp $
351  */
352 public class RE
353 {
354     /**
355      * Specifies normal, case-sensitive matching behaviour.
356      */
357     public static final int MATCH_NORMAL          = 0x0000;
358 
359     /**
360      * Flag to indicate that matching should be case-independent (folded)
361      */
362     public static final int MATCH_CASEINDEPENDENT = 0x0001;
363 
364     /**
365      * Newlines should match as BOL/EOL (^ and $)
366      */
367     public static final int MATCH_MULTILINE       = 0x0002;
368 
369     /**
370      * Consider all input a single body of text - newlines are matched by .
371      */
372     public static final int MATCH_SINGLELINE      = 0x0004;
373 
374     /************************************************
375      *                                              *
376      * The format of a node in a program is:        *
377      *                                              *
378      * [ OPCODE ] [ OPDATA ] [ OPNEXT ] [ OPERAND ] *
379      *                                              *
380      * char OPCODE - instruction                    *
381      * char OPDATA - modifying data                 *
382      * char OPNEXT - next node (relative offset)    *
383      *                                              *
384      ************************************************/
385 
386                  //   Opcode              Char       Opdata/Operand  Meaning
387                  //   ----------          ---------- --------------- --------------------------------------------------
388     static final char OP_END              = 'E';  //                 end of program
389     static final char OP_BOL              = '^';  //                 match only if at beginning of line
390     static final char OP_EOL              = '$';  //                 match only if at end of line
391     static final char OP_ANY              = '.';  //                 match any single character except newline
392     static final char OP_ANYOF            = '[';  // count/ranges    match any char in the list of ranges
393     static final char OP_BRANCH           = '|';  // node            match this alternative or the next one
394     static final char OP_ATOM             = 'A';  // length/string   length of string followed by string itself
395     static final char OP_STAR             = '*';  // node            kleene closure
396     static final char OP_PLUS             = '+';  // node            positive closure
397     static final char OP_MAYBE            = '?';  // node            optional closure
398     static final char OP_ESCAPE           = '\\'; // escape          special escape code char class (escape is E_* code)
399     static final char OP_OPEN             = '(';  // number          nth opening paren
400     static final char OP_CLOSE            = ')';  // number          nth closing paren
401     static final char OP_BACKREF          = '#';  // number          reference nth already matched parenthesized string
402     static final char OP_GOTO             = 'G';  //                 nothing but a (back-)pointer
403     static final char OP_NOTHING          = 'N';  //                 match null string such as in '(a|)'
404     static final char OP_RELUCTANTSTAR    = '8';  // none/expr       reluctant '*' (mnemonic for char is unshifted '*')
405     static final char OP_RELUCTANTPLUS    = '=';  // none/expr       reluctant '+' (mnemonic for char is unshifted '+')
406     static final char OP_RELUCTANTMAYBE   = '/';  // none/expr       reluctant '?' (mnemonic for char is unshifted '?')
407     static final char OP_POSIXCLASS       = 'P';  // classid         one of the posix character classes
408 
409     // Escape codes
410     static final char E_ALNUM             = 'w';  // Alphanumeric
411     static final char E_NALNUM            = 'W';  // Non-alphanumeric
412     static final char E_BOUND             = 'b';  // Word boundary
413     static final char E_NBOUND            = 'B';  // Non-word boundary
414     static final char E_SPACE             = 's';  // Whitespace
415     static final char E_NSPACE            = 'S';  // Non-whitespace
416     static final char E_DIGIT             = 'd';  // Digit
417     static final char E_NDIGIT            = 'D';  // Non-digit
418 
419     // Posix character classes
420     static final char POSIX_CLASS_ALNUM   = 'w';  // Alphanumerics
421     static final char POSIX_CLASS_ALPHA   = 'a';  // Alphabetics 
422     static final char POSIX_CLASS_BLANK   = 'b';  // Blanks
423     static final char POSIX_CLASS_CNTRL   = 'c';  // Control characters
424     static final char POSIX_CLASS_DIGIT   = 'd';  // Digits
425     static final char POSIX_CLASS_GRAPH   = 'g';  // Graphic characters
426     static final char POSIX_CLASS_LOWER   = 'l';  // Lowercase characters
427     static final char POSIX_CLASS_PRINT   = 'p';  // Printable characters
428     static final char POSIX_CLASS_PUNCT   = '!';  // Punctuation
429     static final char POSIX_CLASS_SPACE   = 's';  // Spaces
430     static final char POSIX_CLASS_UPPER   = 'u';  // Uppercase characters
431     static final char POSIX_CLASS_XDIGIT  = 'x';  // Hexadecimal digits
432     static final char POSIX_CLASS_JSTART  = 'j';  // Java identifier start
433     static final char POSIX_CLASS_JPART   = 'k';  // Java identifier part
434 
435     // Limits
436     static final int maxNode  = 65536;            // Maximum number of nodes in a program
437     static final int maxParen = 16;               // Number of paren pairs (only 9 can be backrefs)
438 
439     // Node layout constants
440     static final int offsetOpcode = 0;            // Opcode offset (first character)
441     static final int offsetOpdata = 1;            // Opdata offset (second char)
442     static final int offsetNext   = 2;            // Next index offset (third char)
443     static final int nodeSize     = 3;            // Node size (in chars)
444 
445     /** Line Separator */
446     static final String   NEWLINE = System.getProperty("line.separator");
447 
448     // State of current program
449     REProgram program;                            // Compiled regular expression 'program'
450     CharacterIterator search;                                // The string being matched against
451     int idx;                                      // Current index in string being searched
452     int matchFlags;                               // Match behaviour flags
453 
454     // Parenthesized subexpressions
455     int parenCount;                               // Number of subexpressions matched (num open parens + 1)
456     int start0;                                   // Cache of start[0]
457     int end0;                                     // Cache of start[0]
458     int start1;                                   // Cache of start[1]
459     int end1;                                     // Cache of start[1]
460     int start2;                                   // Cache of start[2]
461     int end2;                                     // Cache of start[2]
462     int[] startn;                                 // Lazy-alloced array of sub-expression starts
463     int[] endn;                                   // Lazy-alloced array of sub-expression ends
464 
465     // Backreferences
466     int[] startBackref;                           // Lazy-alloced array of backref starts
467     int[] endBackref;                             // Lazy-alloced array of backref ends
468 
469     /**
470      * Constructs a regular expression matcher from a String by compiling it
471      * using a new instance of RECompiler.  If you will be compiling many
472      * expressions, you may prefer to use a single RECompiler object instead.
473      * @param pattern The regular expression pattern to compile.
474      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
475      * @see RECompiler
476      * @see recompile
477      */
478     public RE(String   pattern) throws RESyntaxException
479     {
480         this(pattern, MATCH_NORMAL);
481     }
482 
483     /**
484      * Constructs a regular expression matcher from a String by compiling it
485      * using a new instance of RECompiler.  If you will be compiling many
486      * expressions, you may prefer to use a single RECompiler object instead.
487      * @param pattern The regular expression pattern to compile.
488      * @param matchFlags The matching style
489      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
490      * @see RECompiler
491      * @see recompile
492      */
493     public RE(String   pattern, int matchFlags) throws RESyntaxException
494     {
495         this(new RECompiler().compile(pattern));
496         setMatchFlags(matchFlags);
497     }
498 
499     /**
500      * Construct a matcher for a pre-compiled regular expression from program
501      * (bytecode) data.  Permits special flags to be passed in to modify matching
502      * behaviour.
503      * @param program Compiled regular expression program (see RECompiler and/or recompile)
504      * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
505      *
506      * <pre>
507      *
508      *   MATCH_NORMAL              // Normal (case-sensitive) matching
509      *   MATCH_CASEINDEPENDENT     // Case folded comparisons
510      *   MATCH_MULTILINE           // Newline matches as BOL/EOL
511      *
512      * </pre>
513      *
514      * @see RECompiler
515      * @see REProgram
516      * @see recompile
517      */
518     public RE(REProgram program, int matchFlags)
519     {
520         setProgram(program);
521         setMatchFlags(matchFlags);
522     }
523 
524     /**
525      * Construct a matcher for a pre-compiled regular expression from program
526      * (bytecode) data.
527      * @param program Compiled regular expression program
528      * @see RECompiler
529      * @see recompile
530      */
531     public RE(REProgram program)
532     {
533         this(program, MATCH_NORMAL);
534     }
535 
536     /**
537      * Constructs a regular expression matcher with no initial program.
538      * This is likely to be an uncommon practice, but is still supported.
539      */
540     public RE()
541     {
542         this((REProgram)null, MATCH_NORMAL);
543     }
544 
545     /**
546      * Converts a 'simplified' regular expression to a full regular expression
547      * @param pattern The pattern to convert
548      * @return The full regular expression
549      */
550     public static String   simplePatternToFullRegularExpression(String   pattern)
551     {
552         StringBuffer   buf = new StringBuffer  ();
553         for (int i = 0; i < pattern.length(); i++)
554         {
555             char c = pattern.charAt(i);
556             switch (c)
557             {
558                 case '*':
559                     buf.append(".*");
560                     break;
561 
562                 case '.':
563                 case '[':
564                 case ']':
565                 case '\\':
566                 case '+':
567                 case '?':
568                 case '{':
569                 case '}':
570                 case '$':
571                 case '^':
572                 case '|':
573                 case '(':
574                 case ')':
575                     buf.append('\\');
576                 default:
577                     buf.append(c);
578                     break;
579             }
580         }
581         return buf.toString();
582     }
583 
584     /**
585      * Sets match behaviour flags which alter the way RE does matching.
586      * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
587      *
588      * <pre>
589      *
590      *   MATCH_NORMAL              // Normal (case-sensitive) matching
591      *   MATCH_CASEINDEPENDENT     // Case folded comparisons
592      *   MATCH_MULTILINE           // Newline matches as BOL/EOL
593      *
594      * </pre>
595      *
596      */
597     public void setMatchFlags(int matchFlags)
598     {
599         this.matchFlags = matchFlags;
600     }
601 
602     /**
603      * Returns the current match behaviour flags.
604      * @return Current match behaviour flags (RE.MATCH_*).
605      *
606      * <pre>
607      *
608      *   MATCH_NORMAL              // Normal (case-sensitive) matching
609      *   MATCH_CASEINDEPENDENT     // Case folded comparisons
610      *   MATCH_MULTILINE           // Newline matches as BOL/EOL
611      *
612      * </pre>
613      *
614      * @see #setMatchFlags
615      *
616      */
617     public int getMatchFlags()
618     {
619         return matchFlags;
620     }
621 
622     /**
623      * Sets the current regular expression program used by this matcher object.
624      * @param program Regular expression program compiled by RECompiler.
625      * @see RECompiler
626      * @see REProgram
627      * @see recompile
628      */
629     public void setProgram(REProgram program)
630     {
631         this.program = program;
632     }
633 
634     /**
635      * Returns the current regular expression program in use by this matcher object.
636      * @return Regular expression program
637      * @see #setProgram
638      */
639     public REProgram getProgram()
640     {
641         return program;
642     }
643 
644     /**
645      * Returns the number of parenthesized subexpressions available after a successful match.
646      * @return Number of available parenthesized subexpressions
647      */
648     public int getParenCount()
649     {
650         return parenCount;
651     }
652 
653     /**
654      * Gets the contents of a parenthesized subexpression after a successful match.
655      * @param which Nesting level of subexpression
656      * @return String
657      */
658     public String   getParen(int which)
659     {
660         int start;
661         if (which < parenCount && (start = getParenStart(which)) >= 0)
662         {
663             return search.substring(start, getParenEnd(which));
664         }
665         return null;
666     }
667 
668     /**
669      * Returns the start index of a given paren level.
670      * @param which Nesting level of subexpression
671      * @return String index 
672      */
673     public final int getParenStart(int which)
674     {
675         if (which < parenCount)
676         {
677             switch (which)
678             {
679                 case 0:
680                     return start0;
681                     
682                 case 1:
683                     return start1;
684                     
685                 case 2:
686                     return start2;
687                     
688                 default:
689                     if (startn == null)
690                     {
691                         allocParens();
692                     }
693                     return startn[which];
694             }
695         }
696         return -1;
697     }
698 
699     /**
700      * Returns the end index of a given paren level.
701      * @param which Nesting level of subexpression
702      * @return String index 
703      */
704     public final int getParenEnd(int which)
705     {
706         if (which < parenCount)
707         {
708             switch (which)
709             {
710                 case 0:
711                     return end0;
712                     
713                 case 1:
714                     return end1;
715                     
716                 case 2:
717                     return end2;
718                     
719                 default:
720                     if (endn == null)
721                     {
722                         allocParens();
723                     }
724                     return endn[which];
725             }
726         }
727         return -1;
728     }
729 
730     /**
731      * Returns the length of a given paren level.
732      * @param which Nesting level of subexpression
733      * @return Number of characters in the parenthesized subexpression
734      */
735     public final int getParenLength(int which)
736     {
737         if (which < parenCount)
738         {
739             return getParenEnd(which) - getParenStart(which);
740         }
741         return -1;
742     }
743 
744     /**
745      * Sets the start of a paren level
746      * @param which Which paren level
747      * @param i Index in input array
748      */
749     protected final void setParenStart(int which, int i)
750     {
751         if (which < parenCount)
752         {
753             switch (which)
754             {
755                 case 0:
756                     start0 = i;
757                     break;
758                     
759                 case 1:
760                     start1 = i;
761                     break;
762                     
763                 case 2:
764                     start2 = i;
765                     break;
766                     
767                 default:
768                     if (startn == null)
769                     {
770                         allocParens();
771                     }
772                     startn[which] = i;
773                     break;
774             }
775         }
776     }
777 
778     /**
779      * Sets the end of a paren level
780      * @param which Which paren level
781      * @param i Index in input array
782      */
783     protected final void setParenEnd(int which, int i)
784     {
785         if (which < parenCount)
786         {
787             switch (which)
788             {
789                 case 0:
790                     end0 = i;
791                     break;
792                     
793                 case 1:
794                     end1 = i;
795                     break;
796                     
797                 case 2:
798                     end2 = i;
799                     break;
800                     
801                 default:
802                     if (endn == null)
803                     {
804                         allocParens();
805                     }
806                     endn[which] = i;
807                     break;
808             }
809         }
810     }
811 
812     /**
813      * Throws an Error representing an internal error condition probably resulting
814      * from a bug in the regular expression compiler (or possibly data corruption).
815      * In practice, this should be very rare.
816      * @param s Error description
817      */
818     protected void internalError(String   s) throws Error  
819     {
820         throw new Error  ("RE internal error: " + s);
821     }
822 
823     /**
824      * Performs lazy allocation of subexpression arrays
825      */
826     private final void allocParens()
827     {
828         // Allocate arrays for subexpressions
829         startn = new int[maxParen];
830         endn = new int[maxParen];
831 
832         // Set sub-expression pointers to invalid values
833         for (int i = 0; i < maxParen; i++)
834         {
835             startn[i] = -1;
836             endn[i] = -1;
837         }
838     }
839 
840     /**
841      * Try to match a string against a subset of nodes in the program
842      * @param firstNode Node to start at in program
843      * @param lastNode Last valid node (used for matching a subexpression without
844      * matching the rest of the program as well).
845      * @param idxStart Starting position in character array
846      * @return Final input array index if match succeeded.  -1 if not.
847      */
848     protected int matchNodes(int firstNode, int lastNode, int idxStart)
849     {
850         // Our current place in the string
851         int idx = idxStart;
852 
853         // Loop while node is valid
854         int next, opcode, opdata;
855         int idxNew;
856         char[] instruction = program.instruction;
857         for (int node = firstNode; node < lastNode; )
858         {
859             opcode = instruction[node + offsetOpcode];
860             next   = node + (short)instruction[node + offsetNext];
861             opdata = instruction[node + offsetOpdata];
862 
863             switch (opcode)
864             {
865                 case OP_RELUCTANTMAYBE:
866                     {
867                         int once = 0;
868                         do
869                         {
870                             // Try to match the rest without using the reluctant subexpr
871                             if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
872                             {
873                                 return idxNew;
874                             }
875                         }
876                         while ((once++ == 0) && (idx = matchNodes(node + nodeSize, next, idx)) != -1);
877                         return -1;
878                     }
879 
880                 case OP_RELUCTANTPLUS:
881                     while ((idx = matchNodes(node + nodeSize, next, idx)) != -1)
882                     {
883                         // Try to match the rest without using the reluctant subexpr
884                         if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
885                         {
886                             return idxNew;
887                         }
888                     }
889                     return -1;
890 
891                 case OP_RELUCTANTSTAR:
892                     do
893                     {
894                         // Try to match the rest without using the reluctant subexpr
895                         if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
896                         {
897                             return idxNew;
898                         }
899                     }
900                     while ((idx = matchNodes(node + nodeSize, next, idx)) != -1);
901                     return -1;
902 
903                 case OP_OPEN:
904 
905                     // Match subexpression
906                     if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
907                     {
908                         startBackref[opdata] = idx;
909                     }
910                     if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
911                     {
912                         // Increase valid paren count
913                         if ((opdata + 1) > parenCount)
914                         {
915                             parenCount = opdata + 1;
916                         }
917 
918                         // Don't set paren if already set later on
919                         if (getParenStart(opdata) == -1)
920                         {
921                             setParenStart(opdata, idx);
922                         }
923                     }
924                     return idxNew;
925 
926                 case OP_CLOSE:
927 
928                     // Done matching subexpression
929                     if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
930                     {
931                         endBackref[opdata] = idx;
932                     }
933                     if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
934                     {
935                         // Increase valid paren count
936                         if ((opdata + 1) > parenCount)
937                         {
938                             parenCount = opdata + 1;
939                         }
940 
941                         // Don't set paren if already set later on
942                         if (getParenEnd(opdata) == -1)
943                         {
944                             setParenEnd(opdata, idx);
945                         }
946                     }
947                     return idxNew;
948 
949                 case OP_BACKREF:
950                     {
951                         // Get the start and end of the backref
952                         int s = startBackref[opdata];
953                         int e = endBackref[opdata];
954 
955                         // We don't know the backref yet
956                         if (s == -1 || e == -1)
957                         {
958                             return -1;
959                         }
960 
961                         // The backref is empty size
962                         if (s == e)
963                         {
964                             break;
965                         }
966 
967                         // Get the length of the backref
968                         int l = e - s;
969 
970                         // If there's not enough input left, give up.
971                         if (search.isEnd(idx + l - 1))
972                         {
973                             return -1;
974                         }
975 
976                         // Case fold the backref?
977                         if ((matchFlags & MATCH_CASEINDEPENDENT) != 0)
978                         {
979                             // Compare backref to input, case-folding as we go
980                             for (int i = 0; i < l; i++)
981                             {
982                                 if (Character.toLowerCase(search.charAt(idx++)) != Character.toLowerCase(search.charAt(s + i)))
983                                 {
984                                     return -1;
985                                 }
986                             }
987                         }
988                         else
989                         {
990                             // Compare backref to input
991                             for (int i = 0; i < l; i++)
992                             {
993                                 if (search.charAt(idx++) != search.charAt(s + i))
994                                 {
995                                     return -1;
996                                 }
997                             }
998                         }
999                     }
1000                    break;
1001
1002                case OP_BOL:
1003
1004                    // Fail if we're not at the start of the string
1005                    if (idx != 0)
1006                    {
1007                        // If we're multiline matching, we could still be at the start of a line
1008                        if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
1009                        {
1010                            // If not at start of line, give up
1011                            if (idx <= 0 || !isNewline(idx - 1)) {
1012                                return -1;
1013                            } else {
1014                                break;
1015                            }
1016                        }
1017                        return -1;
1018                    }
1019                    break;
1020
1021                case OP_EOL:
1022
1023                    // If we're not at the end of string
1024                    if (!search.isEnd(0) && !search.isEnd(idx))
1025                    {
1026                        // If we're multi-line matching
1027                        if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
1028                        {
1029                            // Give up if we're not at the end of a line
1030                            if (! isNewline(idx)) {
1031                                return -1;
1032                            } else {
1033                                break;
1034                            }
1035                        }
1036                        return -1;
1037                    }
1038                    break;
1039
1040                case OP_ESCAPE:
1041
1042                    // Which escape?
1043                    switch (opdata)
1044                    {
1045                        // Word boundary match
1046                        case E_NBOUND:
1047                        case E_BOUND:
1048                            {
1049                                char cLast = ((idx == getParenStart(0)) ? '\n' : search.charAt(idx - 1));
1050                                char cNext = ((search.isEnd(idx)) ? '\n' : search.charAt(idx));
1051                                if ((Character.isLetterOrDigit(cLast) == Character.isLetterOrDigit(cNext)) == (opdata == E_BOUND))
1052                                {
1053                                    return -1;
1054                                }
1055                            }
1056                            break;
1057
1058                        // Alpha-numeric, digit, space, javaLetter, javaLetterOrDigit
1059                        case E_ALNUM:
1060                        case E_NALNUM:
1061                        case E_DIGIT:
1062                        case E_NDIGIT:
1063                        case E_SPACE:
1064                        case E_NSPACE:
1065
1066                            // Give up if out of input
1067                            if (search.isEnd(idx))
1068                            {
1069                                return -1;
1070                            }
1071
1072                            // Switch on escape
1073                            switch (opdata)
1074                            {
1075                                case E_ALNUM:
1076                                case E_NALNUM:
1077                                    if (!(Character.isLetterOrDigit(search.charAt(idx)) == (opdata == E_ALNUM)))
1078                                    {
1079                                        return -1;
1080                                    }
1081                                    break;
1082
1083                                case E_DIGIT:
1084                                case E_NDIGIT:
1085                                    if (!(Character.isDigit(search.charAt(idx)) == (opdata == E_DIGIT)))
1086                                    {
1087                                        return -1;
1088                                    }
1089                                    break;
1090
1091                                case E_SPACE:
1092                                case E_NSPACE:
1093                                    if (!(Character.isWhitespace(search.charAt(idx)) == (opdata == E_SPACE)))
1094                                    {
1095                                        return -1;
1096                                    }
1097                                    break;
1098                            }
1099                            idx++;
1100                            break;
1101
1102                        default:
1103                            internalError("Unrecognized escape '" + opdata + "'");
1104                    }
1105                    break;
1106
1107                case OP_ANY:
1108
1109                    if((matchFlags & MATCH_SINGLELINE) == MATCH_SINGLELINE) {
1110                        // Match anything
1111                        if(search.isEnd(idx))
1112                        {
1113                            return -1;
1114                        }
1115                        idx++;
1116                        break;
1117                    }
1118                    else
1119                    {
1120                        // Match anything but a newline
1121                        if (search.isEnd(idx) || search.charAt(idx++) == '\n')
1122                        {
1123                            return -1;
1124                        }
1125                        break;
1126                    }
1127
1128                case OP_ATOM:
1129                    {
1130                        // Match an atom value
1131                        if (search.isEnd(idx))
1132                        {
1133                            return -1;
1134                        }
1135
1136                        // Get length of atom and starting index
1137                        int lenAtom = opdata;
1138                        int startAtom = node + nodeSize;
1139
1140                        // Give up if not enough input remains to have a match
1141                        if (search.isEnd(lenAtom + idx - 1))
1142                        {
1143                            return -1;
1144                        }
1145
1146                        // Match atom differently depending on casefolding flag
1147                        if ((matchFlags & MATCH_CASEINDEPENDENT) != 0)
1148                        {
1149                            for (int i = 0; i < lenAtom; i++)
1150                            {
1151                                if (Character.toLowerCase(search.charAt(idx++)) != Character.toLowerCase(instruction[startAtom + i]))
1152                                {
1153                                    return -1;
1154                                }
1155                            }
1156                        }
1157                        else
1158                        {
1159                            for (int i = 0; i < lenAtom; i++)
1160                            {
1161                                if (search.charAt(idx++) != instruction[startAtom + i])
1162                                {
1163                                    return -1;
1164                                }
1165                            }
1166                        }
1167                    }
1168                    break;
1169
1170                case OP_POSIXCLASS:
1171                    {
1172                        // Out of input?
1173                        if (search.isEnd(idx))
1174                        {
1175                            return -1;
1176                        }
1177                        
1178                        switch (opdata)
1179                        {
1180                            case POSIX_CLASS_ALNUM:
1181                                if (!Character.isLetterOrDigit(search.charAt(idx)))
1182                                {
1183                                    return -1;
1184                                }
1185                                break;
1186                                
1187                            case POSIX_CLASS_ALPHA:
1188                                if (!Character.isLetter(search.charAt(idx)))
1189                                {
1190                                    return -1;
1191                                }
1192                                break;
1193                                
1194                            case POSIX_CLASS_DIGIT:
1195                                if (!Character.isDigit(search.charAt(idx)))
1196                                {
1197                                    return -1;
1198                                }
1199                                break;
1200                                
1201                            case POSIX_CLASS_BLANK: // JWL - bugbug: is this right??
1202                                if (!Character.isSpaceChar(search.charAt(idx)))
1203                                {
1204                                    return -1;
1205                                }
1206                                break;
1207                                
1208                            case POSIX_CLASS_SPACE:
1209                                if (!Character.isWhitespace(search.charAt(idx)))
1210                                {
1211                                    return -1;
1212                                }
1213                                break;
1214                                
1215                            case POSIX_CLASS_CNTRL:
1216                                if (Character.getType(search.charAt(idx)) != Character.CONTROL)
1217                                {
1218                                    return -1;
1219                                }
1220                                break;
1221                                
1222                            case POSIX_CLASS_GRAPH: // JWL - bugbug???
1223                                switch (Character.getType(search.charAt(idx)))
1224                                {
1225                                    case Character.MATH_SYMBOL:
1226                                    case Character.CURRENCY_SYMBOL:
1227                                    case Character.MODIFIER_SYMBOL:
1228                                    case Character.OTHER_SYMBOL:
1229                                        break;
1230                                        
1231                                    default:
1232                                        return -1;
1233                                }
1234                                break;
1235                                
1236                            case POSIX_CLASS_LOWER:
1237                                if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER)
1238                                {
1239                                    return -1;
1240                                }
1241                                break;
1242                                
1243                            case POSIX_CLASS_UPPER:
1244                                if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER)
1245                                {
1246                                    return -1;
1247                                }
1248                                break;
1249                                
1250                            case POSIX_CLASS_PRINT:
1251                                if (Character.getType(search.charAt(idx)) == Character.CONTROL)
1252                                {
1253                                    return -1;
1254                                }
1255                                break;
1256                                
1257                            case POSIX_CLASS_PUNCT:
1258                            {
1259                                int type = Character.getType(search.charAt(idx));
1260                                switch(type)
1261                                {
1262                                    case Character.DASH_PUNCTUATION:
1263                                    case Character.START_PUNCTUATION:
1264                                    case Character.END_PUNCTUATION:
1265                                    case Character.CONNECTOR_PUNCTUATION:
1266                                    case Character.OTHER_PUNCTUATION:
1267                                        break;
1268                                        
1269                                    default:
1270                                        return -1;
1271                                }
1272                            }
1273                            break;
1274
1275                            case POSIX_CLASS_XDIGIT: // JWL - bugbug??
1276                            {
1277                                boolean isXDigit = ((search.charAt(idx) >= '0' && search.charAt(idx) <= '9') ||
1278                                                    (search.charAt(idx) >= 'a' && search.charAt(idx) <= 'f') ||
1279                                                    (search.charAt(idx) >= 'A' && search.charAt(idx) <= 'F'));
1280                                if (!isXDigit)
1281                                {
1282                                    return -1;
1283                                }
1284                            }
1285                            break;
1286                            
1287                            case POSIX_CLASS_JSTART:
1288                                if (!Character.isJavaIdentifierStart(search.charAt(idx)))
1289                                {
1290                                    return -1;
1291                                }
1292                                break;
1293                                
1294                            case POSIX_CLASS_JPART:
1295                                if (!Character.isJavaIdentifierPart(search.charAt(idx)))
1296                                {
1297                                    return -1;
1298                                }
1299                                break;
1300
1301                            default:
1302                                internalError("Bad posix class");
1303                                break;
1304                        }
1305                    
1306                        // Matched.
1307                        idx++;
1308                    }
1309                    break;
1310
1311                case OP_ANYOF:
1312                    {
1313                        // Out of input?
1314                        if (search.isEnd(idx))
1315                        {
1316                            return -1;
1317                        }
1318
1319                        // Get character to match against character class and maybe casefold
1320                        char c = search.charAt(idx);
1321                        boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
1322                        if (caseFold)
1323                        {
1324                            c = Character.toLowerCase(c);
1325                        }
1326
1327                        // Loop through character class checking our match character
1328                        int idxRange = node + nodeSize;
1329                        int idxEnd = idxRange + (opdata * 2);
1330                        boolean match = false;
1331                        for (int i = idxRange; i < idxEnd; )
1332                        {
1333                            // Get start, end and match characters
1334                            char s = instruction[i++];
1335                            char e = instruction[i++];
1336
1337                            // Fold ends of range and match character
1338                            if (caseFold)
1339                            {
1340                                s = Character.toLowerCase(s);
1341                                e = Character.toLowerCase(e);
1342                            }
1343
1344                            // If the match character is in range, break out
1345                            if (c >= s && c <= e)
1346                            {
1347                                match = true;
1348                                break;
1349                            }
1350                        }
1351
1352                        // Fail if we didn't match the character class
1353                        if (!match)
1354                        {
1355                            return -1;
1356                        }
1357                        idx++;
1358                    }
1359                    break;
1360
1361                case OP_BRANCH:
1362                {
1363                    // Check for choices
1364                    if (instruction[next + offsetOpcode] != OP_BRANCH)
1365                    {
1366                        // If there aren't any other choices, just evaluate this branch.
1367                        node += nodeSize;
1368                        continue;
1369                    }
1370
1371                    // Try all available branches
1372                    short nextBranch;
1373                    do
1374                    {
1375                        // Try matching the branch against the string
1376                        if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1)
1377                        {
1378                            return idxNew;
1379                        }
1380                        
1381                        // Go to next branch (if any)
1382                        nextBranch = (short)instruction[node + offsetNext];
1383                        node += nextBranch;
1384                    }
1385                    while (nextBranch != 0 && (instruction[node + offsetOpcode] == OP_BRANCH));
1386
1387                    // Failed to match any branch!
1388                    return -1;
1389                }
1390
1391                case OP_NOTHING:
1392                case OP_GOTO:
1393
1394                    // Just advance to the next node without doing anything
1395                    break;
1396
1397                case OP_END:
1398
1399                    // Match has succeeded!
1400                    setParenEnd(0, idx);
1401                    return idx;
1402
1403                default:
1404
1405                    // Corrupt program
1406                    internalError("Invalid opcode '" + opcode + "'");
1407            }
1408
1409            // Advance to the next node in the program
1410            node = next;
1411        }
1412
1413        // We "should" never end up here
1414        internalError("Corrupt program");
1415        return -1;
1416    }
1417
1418    /**
1419     * Match the current regular expression program against the current
1420     * input string, starting at index i of the input string.  This method
1421     * is only meant for internal use.
1422     * @param i The input string index to start matching at
1423     * @return True if the input matched the expression
1424     */
1425    protected boolean matchAt(int i)
1426    {
1427        // Initialize start pointer, paren cache and paren count
1428        start0 = -1;
1429        end0   = -1;
1430        start1 = -1;
1431        end1   = -1;
1432        start2 = -1;
1433        end2   = -1;
1434        startn = null;
1435        endn   = null;
1436        parenCount = 1;
1437        setParenStart(0, i);
1438
1439        // Allocate backref arrays (unless optimizations indicate otherwise)
1440        if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
1441        {
1442            startBackref = new int[maxParen];
1443            endBackref = new int[maxParen];
1444        }
1445
1446        // Match against string
1447        int idx;
1448        if ((idx = matchNodes(0, maxNode, i)) != -1)
1449        {
1450            setParenEnd(0, idx);
1451            return true;
1452        }
1453
1454        // Didn't match
1455        parenCount = 0;
1456        return false;
1457    }
1458
1459    /**
1460     * Matches the current regular expression program against a character array,
1461     * starting at a given index.
1462     * @param search String to match against
1463     * @param i Index to start searching at
1464     * @return True if string matched
1465     */
1466    public boolean match(String   search, int i) 
1467    {
1468        return match(new StringCharacterIterator(search), i);
1469    }
1470
1471    /**
1472     * Matches the current regular expression program against a character array,
1473     * starting at a given index.
1474     * @param search String to match against
1475     * @param i Index to start searching at
1476     * @return True if string matched
1477     */
1478    public boolean match(CharacterIterator search, int i)
1479    {
1480        // There is no compiled program to search with!
1481        if (program == null)
1482        {
1483            // This should be uncommon enough to be an error case rather
1484            // than an exception (which would have to be handled everywhere)
1485            internalError("No RE program to run!");
1486        }
1487
1488        // Save string to search
1489        this.search = search;
1490
1491        // Can we optimize the search by looking for a prefix string?
1492        if (program.prefix == null)
1493        {
1494            // Unprefixed matching must try for a match at each character
1495            for ( ;! search.isEnd(i - 1); i++)
1496            {
1497                // Try a match at index i
1498                if (matchAt(i))
1499                {
1500                    return true;
1501                }
1502            }
1503            return false;
1504        }
1505        else
1506        {
1507            // Prefix-anchored matching is possible
1508            boolean caseIndependent = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
1509            char[] prefix = program.prefix;
1510            for ( ;! search.isEnd(i + prefix.length - 1); i++)
1511            {
1512                // If the first character of the prefix matches
1513                boolean match = false;
1514                if (caseIndependent)
1515                    match = Character.toLowerCase(search.charAt(i)) == Character.toLowerCase(prefix[0]);
1516                else
1517                    match = search.charAt(i) == prefix[0];
1518                if (match)
1519                {
1520                    // Save first character position
1521                    int firstChar = i++;
1522                    int k;
1523                    for (k = 1; k < prefix.length; )
1524                    {
1525                        // If there's a mismatch of any character in the prefix, give up
1526                        if (caseIndependent)
1527                            match = Character.toLowerCase(search.charAt(i++)) == Character.toLowerCase(prefix[k++]);
1528                        else
1529                            match = search.charAt(i++) == prefix[k++];
1530                        if (!match)
1531                        {
1532                            break;
1533                        }
1534                    }
1535
1536                    // See if the whole prefix string matched
1537                    if (k == prefix.length)
1538                    {
1539                        // We matched the full prefix at firstChar, so try it
1540                        if (matchAt(firstChar))
1541                        {
1542                            return true;
1543                        }
1544                    }
1545
1546                    // Match failed, reset i to continue the search
1547                    i = firstChar;
1548                }
1549            }
1550            return false;
1551        }
1552    }
1553
1554    /**
1555     * Matches the current regular expression program against a String.
1556     * @param search String to match against
1557     * @return True if string matched
1558     */
1559    public boolean match(String   search)
1560    {
1561        return match(search, 0);
1562    }
1563
1564    /**
1565     * Splits a string into an array of strings on regular expression boundaries.
1566     * This function works the same way as the Perl function of the same name.
1567     * Given a regular expression of "[ab]+" and a string to split of
1568     * "xyzzyababbayyzabbbab123", the result would be the array of Strings
1569     * "[xyzzy, yyz, 123]".
1570     * @param s String to split on this regular exression
1571     * @return Array of strings
1572     */
1573    public String  [] split(String   s)
1574    {
1575        // Create new vector
1576        Vector   v = new Vector  ();
1577
1578        // Start at position 0 and search the whole string
1579        int pos = 0;
1580        int len = s.length();
1581
1582        // Try a match at each position
1583        while (pos < len && match(s, pos))
1584        {
1585            // Get start of match
1586            int start = getParenStart(0);
1587
1588            // Get end of match
1589            int newpos = getParenEnd(0);
1590
1591            // Check if no progress was made
1592            if (newpos == pos)
1593            {
1594                v.addElement(s.substring(pos, start + 1));
1595                newpos++;
1596            }
1597            else
1598            {
1599                v.addElement(s.substring(pos, start));
1600            }
1601
1602            // Move to new position
1603            pos = newpos;
1604        }
1605
1606        // Push remainder if it's not empty
1607        String   remainder = s.substring(pos);
1608        if (remainder.length() != 0)
1609        {
1610            v.addElement(remainder);
1611        }
1612
1613        // Return vector as an array of strings
1614        String  [] ret = new String  [v.size()];
1615        v.copyInto(ret);
1616        return ret;
1617    }
1618
1619    /**
1620     * Flag bit that indicates that subst should replace all occurrences of this
1621     * regular expression.
1622     */
1623    public static final int REPLACE_ALL          = 0x0000;
1624
1625    /**
1626     * Flag bit that indicates that subst should only replace the first occurrence
1627     * of this regular expression.
1628     */
1629    public static final int REPLACE_FIRSTONLY    = 0x0001;
1630
1631    /**
1632     * Substitutes a string for this regular expression in another string.
1633     * This method works like the Perl function of the same name.
1634     * Given a regular expression of "a*b", a String to substituteIn of
1635     * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
1636     * resulting String returned by subst would be "-foo-garply-wacky-".
1637     * @param substituteIn String to substitute within
1638     * @param substitution String to substitute for all matches of this regular expression.
1639     * @return The string substituteIn with zero or more occurrences of the current
1640     * regular expression replaced with the substitution String (if this regular
1641     * expression object doesn't match at any position, the original String is returned
1642     * unchanged).
1643     */
1644    public String   subst(String   substituteIn, String   substitution)
1645    {
1646        return subst(substituteIn, substitution, REPLACE_ALL);
1647    }
1648
1649    /**
1650     * Substitutes a string for this regular expression in another string.
1651     * This method works like the Perl function of the same name.
1652     * Given a regular expression of "a*b", a String to substituteIn of
1653     * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
1654     * resulting String returned by subst would be "-foo-garply-wacky-".
1655     * @param substituteIn String to substitute within
1656     * @param substitution String to substitute for matches of this regular expression
1657     * @param flags One or more bitwise flags from REPLACE_*.  If the REPLACE_FIRSTONLY
1658     * flag bit is set, only the first occurrence of this regular expression is replaced.
1659     * If the bit is not set (REPLACE_ALL), all occurrences of this pattern will be
1660     * replaced.
1661     * @return The string substituteIn with zero or more occurrences of the current
1662     * regular expression replaced with the substitution String (if this regular
1663     * expression object doesn't match at any position, the original String is returned
1664     * unchanged).
1665     */
1666    public String   subst(String   substituteIn, String   substitution, int flags)
1667    {
1668        // String to return
1669        StringBuffer   ret = new StringBuffer  ();
1670
1671        // Start at position 0 and search the whole string
1672        int pos = 0;
1673        int len = substituteIn.length();
1674
1675        // Try a match at each position
1676        while (pos < len && match(substituteIn, pos))
1677        {
1678            // Append string before match
1679            ret.append(substituteIn.substring(pos, getParenStart(0)));
1680
1681            // Append substitution
1682            ret.append(substitution);
1683
1684            // Move forward, skipping past match
1685            int newpos = getParenEnd(0);
1686
1687            // We always want to make progress! 
1688            if (newpos == pos)
1689            {
1690                newpos++;
1691            }
1692
1693            // Try new position
1694            pos = newpos;
1695
1696            // Break out if we're only supposed to replace one occurrence
1697            if ((flags & REPLACE_FIRSTONLY) != 0)
1698            {
1699                break;
1700            }
1701        }
1702
1703        // If there's remaining input, append it
1704        if (pos < len)
1705        {
1706            ret.append(substituteIn.substring(pos));
1707        }
1708
1709        // Return string buffer as string 
1710        return ret.toString();
1711    }  
1712
1713    /**
1714     * Returns an array of Strings, whose toString representation matches a regular
1715     * expression. This method works like the Perl function of the same name.  Given
1716     * a regular expression of "a*b" and an array of String objects of [foo, aab, zzz,
1717     * aaaab], the array of Strings returned by grep would be [aab, aaaab].
1718     * @param search Array of Objects to search 
1719     * @return Array of Objects whose toString value matches this regular expression.
1720     */
1721    public String  [] grep(Object  [] search)
1722    {
1723        // Create new vector to hold return items
1724        Vector   v = new Vector  ();
1725
1726        // Traverse array of objects
1727        for (int i = 0; i < search.length; i++)
1728        {
1729            // Get next object as a string
1730            String   s = search[i].toString();
1731
1732            // If it matches this regexp, add it to the list
1733            if (match(s))
1734            {
1735                v.addElement(s);
1736            }
1737        }
1738
1739        // Return vector as an array of strings
1740        String  [] ret = new String  [v.size()];
1741        v.copyInto(ret);
1742        return ret;
1743    }
1744
1745    /** @return true if at the i-th position in the 'search' a newline ends */
1746    private boolean isNewline(int i) {
1747
1748        if (i < NEWLINE.length() - 1) {
1749            return false;
1750        }
1751
1752        if (search.charAt(i) == '\n') {
1753            return true;
1754        }
1755
1756        for (int j = NEWLINE.length() - 1; j >= 0; j--, i--) {
1757            if (NEWLINE.charAt(j) != search.charAt(i)) {
1758                return false;
1759            }
1760        }
1761        return true;
1762    }
1763}
1764
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags