Java > Open Source Codes > org > apache > regexp > RE


1   package org.apache.regexp;
2   
3   /*
4    * ====================================================================
5    * 
6    * The Apache Software License, Version 1.1
7    *
8    * Copyright (c) 1999-2003 The Apache Software Foundation.  All rights 
9    * reserved.
10   *
11   * Redistribution and use in source and binary forms, with or without
12   * modification, are permitted provided that the following conditions
13   * are met:
14   *
15   * 1. Redistributions of source code must retain the above copyright
16   *    notice, this list of conditions and the following disclaimer. 
17   *
18   * 2. Redistributions in binary form must reproduce the above copyright
19   *    notice, this list of conditions and the following disclaimer in
20   *    the documentation and/or other materials provided with the
21   *    distribution.
22   *
23   * 3. The end-user documentation included with the redistribution, if
24   *    any, must include the following acknowlegement:  
25   *       "This product includes software developed by the 
26   *        Apache Software Foundation (http://www.apache.org/)."
27   *    Alternately, this acknowlegement may appear in the software itself,
28   *    if and wherever such third-party acknowlegements normally appear.
29   *
30   * 4. The names "The Jakarta Project", "Jakarta-Regexp", and "Apache Software
31   *    Foundation" must not be used to endorse or promote products derived
32   *    from this software without prior written permission. For written 
33   *    permission, please contact apache@apache.org.
34   *
35   * 5. Products derived from this software may not be called "Apache"
36   *    nor may "Apache" appear in their names without prior written
37   *    permission of the Apache Group.
38   *
39   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
40   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
41   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
42   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
43   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
45   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
46   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
47   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
48   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
49   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50   * SUCH DAMAGE.
51   * ====================================================================
52   *
53   * This software consists of voluntary contributions made by many
54   * individuals on behalf of the Apache Software Foundation.  For more
55   * information on the Apache Software Foundation, please see
56   * <http://www.apache.org/>.
57   *
58   */ 
59   
60  import java.io.Serializable  ;
61  import java.util.Vector  ;
62  
63  /**
64   * RE is an efficient, lightweight regular expression evaluator/matcher
65   * class. Regular expressions are pattern descriptions which enable
66   * sophisticated matching of strings.  In addition to being able to
67   * match a string against a pattern, you can also extract parts of the
68   * match.  This is especially useful in text parsing! Details on the
69   * syntax of regular expression patterns are given below.
70   *
71   * <p>
72   *
73   * To compile a regular expression (RE), you can simply construct an RE
74   * matcher object from the string specification of the pattern, like this:
75   *
76   * <pre>
77   *
78   *  RE r = new RE("a*b");
79   *
80   * </pre>
81   *
82   * <p>
83   *
84   * Once you have done this, you can call either of the RE.match methods to
85   * perform matching on a String.  For example:
86   *
87   * <pre>
88   *
89   *  boolean matched = r.match("aaaab");
90   *
91   * </pre>
92   *
93   * will cause the boolean matched to be set to true because the
94   * pattern "a*b" matches the string "aaaab".
95   *
96   * <p>
97   * If you were interested in the <i>number</i> of a's which matched the
98   * first part of our example expression, you could change the expression to
99   * "(a*)b".  Then when you compiled the expression and matched it against
100  * something like "xaaaab", you would get results like this:
101  *
102  * <pre>
103  *
104  *  RE r = new RE("(a*)b");                  // Compile expression
105  *  boolean matched = r.match("xaaaab");     // Match against "xaaaab"
106  *
107  * <br>
108  *
109  *  String wholeExpr = r.getParen(0);        // wholeExpr will be 'aaaab'
110  *  String insideParens = r.getParen(1);     // insideParens will be 'aaaa'
111  *
112  * <br>
113  *
114  *  int startWholeExpr = r.getParenStart(0);   // startWholeExpr will be index 1
115  *  int endWholeExpr = r.getParenEnd(0);       // endWholeExpr will be index 6
116  *  int lenWholeExpr = r.getParenLength(0);    // lenWholeExpr will be 5
117  *
118  * <br>
119  *
120  *  int startInside = r.getParenStart(1);    // startInside will be index 1
121  *  int endInside = r.getParenEnd(1);        // endInside will be index 5
122  *  int lenInside = r.getParenLength(1);     // lenInside will be 4
123  *
124  * </pre>
125  *
126  * You can also refer to the contents of a parenthesized expression
127  * within a regular expression itself.  This is called a
128  * 'backreference'.  The first backreference in a regular expression is
129  * denoted by \1, the second by \2 and so on.  So the expression:
130  *
131  * <pre>
132  *
133  *  ([0-9]+)=\1
134  *
135  * </pre>
136  *
137  * will match any string of the form n=n (like 0=0 or 2=2).
138  *
139  * <p>
140  *
141  * The full regular expression syntax accepted by RE is described here:
142  *
143  * <pre>
144  *
145  * <br>
146  *
147  *  <b><font face=times roman>Characters</font></b>
148  *
149  * <br>
150  *
151  *    <i>unicodeChar</i>   Matches any identical unicode character
152  *    \                    Used to quote a meta-character (like '*')
153  *    \\                   Matches a single '\' character
154  *    \0nnn                Matches a given octal character
155  *    \xhh                 Matches a given 8-bit hexadecimal character
156  *    \\uhhhh              Matches a given 16-bit hexadecimal character
157  *    \t                   Matches an ASCII tab character
158  *    \n                   Matches an ASCII newline character
159  *    \r                   Matches an ASCII return character
160  *    \f                   Matches an ASCII form feed character
161  *
162  * <br>
163  *
164  *  <b><font face=times roman>Character Classes</font></b>
165  *
166  * <br>
167  *
168  *    [abc]                Simple character class
169  *    [a-zA-Z]             Character class with ranges
170  *    [^abc]               Negated character class
171  *
172  * <br>
173  *
174  *  <b><font face=times roman>Standard POSIX Character Classes</font></b>
175  *
176  * <br>
177  *
178  *    [:alnum:]            Alphanumeric characters.
179  *    [:alpha:]            Alphabetic characters.
180  *    [:blank:]            Space and tab characters.
181  *    [:cntrl:]            Control characters.
182  *    [:digit:]            Numeric characters.
183  *    [:graph:]            Characters that are printable and are also visible.
184  *                         (A space is printable, but not visible, while an 
185  *                         `a' is both.)
186  *    [:lower:]            Lower-case alphabetic characters.
187  *    [:print:]            Printable characters (characters that are not 
188  *                         control characters.)
189  *    [:punct:]            Punctuation characters (characters that are not letter,
190  *                         digits, control characters, or space characters).
191  *    [:space:]            Space characters (such as space, tab, and formfeed, 
192  *                         to name a few).
193  *    [:upper:]            Upper-case alphabetic characters.
194  *    [:xdigit:]           Characters that are hexadecimal digits.
195  *
196  * <br>
197  *
198  *  <b><font face=times roman>Non-standard POSIX-style Character 
199  *                            Classes</font></b>
200  *
201  * <br>
202  *
203  *    [:javastart:]        Start of a Java identifier
204  *    [:javapart:]         Part of a Java identifier
205  *
206  * <br>
207  *
208  *  <b><font face=times roman>Predefined Classes</font></b>
209  *
210  * <br>
211  *
212  *    .         Matches any character other than newline
213  *    \w        Matches a "word" character (alphanumeric plus "_")
214  *    \W        Matches a non-word character
215  *    \s        Matches a whitespace character
216  *    \S        Matches a non-whitespace character
217  *    \d        Matches a digit character
218  *    \D        Matches a non-digit character
219  *
220  * <br>
221  *
222  *  <b><font face=times roman>Boundary Matchers</font></b>
223  *
224  * <br>
225  *
226  *    ^         Matches only at the beginning of a line
227  *    $         Matches only at the end of a line
228  *    \b        Matches only at a word boundary
229  *    \B        Matches only at a non-word boundary
230  *
231  * <br>
232  *
233  *  <b><font face=times roman>Greedy Closures</font></b>
234  *
235  * <br>
236  *
237  *    A*        Matches A 0 or more times (greedy)
238  *    A+        Matches A 1 or more times (greedy)
239  *    A?        Matches A 1 or 0 times (greedy)
240  *    A{n}      Matches A exactly n times (greedy)
241  *    A{n,}     Matches A at least n times (greedy)
242  *    A{n,m}    Matches A at least n but not more than m times (greedy)
243  *
244  * <br>
245  *
246  *  <b><font face=times roman>Reluctant Closures</font></b>
247  *
248  * <br>
249  *
250  *    A*?       Matches A 0 or more times (reluctant)
251  *    A+?       Matches A 1 or more times (reluctant)
252  *    A??       Matches A 0 or 1 times (reluctant)
253  *
254  * <br>
255  *
256  *  <b><font face=times roman>Logical Operators</font></b>
257  *
258  * <br>
259  *
260  *    AB        Matches A followed by B
261  *    A|B       Matches either A or B
262  *    (A)       Used for subexpression grouping
263  *   (?:A)      Used for subexpression clustering (just like grouping but 
264  *              no backrefs)
265  *
266  * <br>
267  *
268  *  <b><font face=times roman>Backreferences</font></b>
269  *
270  * <br>
271  *
272  *    \1    Backreference to 1st parenthesized subexpression
273  *    \2    Backreference to 2nd parenthesized subexpression
274  *    \3    Backreference to 3rd parenthesized subexpression
275  *    \4    Backreference to 4th parenthesized subexpression
276  *    \5    Backreference to 5th parenthesized subexpression
277  *    \6    Backreference to 6th parenthesized subexpression
278  *    \7    Backreference to 7th parenthesized subexpression
279  *    \8    Backreference to 8th parenthesized subexpression
280  *    \9    Backreference to 9th parenthesized subexpression
281  *
282  * <br>
283  *
284  * </pre>
285  *
286  * <p>
287  *
288  * All closure operators (+, *, ?, {m,n}) are greedy by default, meaning
289  * that they match as many elements of the string as possible without
290  * causing the overall match to fail.  If you want a closure to be
291  * reluctant (non-greedy), you can simply follow it with a '?'.  A
292  * reluctant closure will match as few elements of the string as
293  * possible when finding matches.  {m,n} closures don't currently
294  * support reluctancy.
295  *
296  * <p>
297  *
298  * RE runs programs compiled by the RECompiler class.  But the RE
299  * matcher class does not include the actual regular expression compiler
300  * for reasons of efficiency.  In fact, if you want to pre-compile one
301  * or more regular expressions, the 'recompile' class can be invoked
302  * from the command line to produce compiled output like this:
303  *
304  * <pre>
305  *
306  *    // Pre-compiled regular expression "a*b"
307  *    char[] re1Instructions =
308  *    {
309  *        0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041,
310  *        0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047,
311  *        0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000,
312  *        0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000,
313  *        0x0000,
314  *    };
315  *
316  *    <br>
317  *
318  *    REProgram re1 = new REProgram(re1Instructions);
319  *
320  * </pre>
321  *
322  * You can then construct a regular expression matcher (RE) object from
323  * the pre-compiled expression re1 and thus avoid the overhead of
324  * compiling the expression at runtime. If you require more dynamic
325  * regular expressions, you can construct a single RECompiler object and
326  * re-use it to compile each expression. * Similarly, you can change the
327  * program run by a given matcher object at any time. * However, RE and
328  * RECompiler are not threadsafe (for efficiency reasons, and because
329  * requiring thread safety in this class is deemed to be a rare
330  * requirement), so you will need to construct a separate compiler or
331  * matcher object for each thread (unless you do thread synchronization
332  * yourself).
333  *
334  * </pre>
335  * <br><p><br>
336  *
337  * <font color=red>
338  * <i>ISSUES:</i>
339  *
340  * <ul>
341  *  <li>com.weusours.util.re is not currently compatible with all
342  *      standard POSIX regcomp flags</li>
343  *  <li>com.weusours.util.re does not support POSIX equivalence classes
344  *      ([=foo=] syntax) (I18N/locale issue)</li>
345  *  <li>com.weusours.util.re does not support nested POSIX character
346  *      classes (definitely should, but not completely trivial)</li>
347  *  <li>com.weusours.util.re Does not support POSIX character collation
348  *      concepts ([.foo.] syntax) (I18N/locale issue)</li>
349  *  <li>Should there be different matching styles (simple, POSIX, Perl etc?)</li>
350  *  <li>Should RE support character iterators (for backwards RE matching!)?</li>
351  *  <li>Should RE support reluctant {m,n} closures (does anyone care)?</li>
352  *  <li>Not *all* possibilities are considered for greediness when backreferences
353  *      are involved (as POSIX suggests should be the case).  The POSIX RE
354  *      "(ac*)c*d[ac]*\1", when matched against "acdacaa" should yield a match
355  *      of acdacaa where \1 is "a".  This is not the case in this RE package,
356  *      and actually Perl doesn't go to this extent either!  Until someone
357  *      actually complains about this, I'm not sure it's worth "fixing".
358  *      If it ever is fixed, test #137 in RETest.txt should be updated.</li>
359  * </ul>
360  *
361  * </font>
362  *
363  * @see recompile
364  * @see RECompiler
365  *
366  * @author <a HREF="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
367  * @author <a HREF="mailto:ts@sch-fer.de">Tobias Sch&auml;fer</a>
368  * @version $Id: RE.java,v 1.13 2003/06/02 02:18:41 vgritsenko Exp $
369  */
370 public class RE implements Serializable  
371 {
372     /**
373      * Specifies normal, case-sensitive matching behaviour.
374      */
375     public static final int MATCH_NORMAL          = 0x0000;
376 
377     /**
378      * Flag to indicate that matching should be case-independent (folded)
379      */
380     public static final int MATCH_CASEINDEPENDENT = 0x0001;
381 
382     /**
383      * Newlines should match as BOL/EOL (^ and $)
384      */
385     public static final int MATCH_MULTILINE       = 0x0002;
386 
387     /**
388      * Consider all input a single body of text - newlines are matched by .
389      */
390     public static final int MATCH_SINGLELINE      = 0x0004;
391 
392     /************************************************
393      *                                              *
394      * The format of a node in a program is:        *
395      *                                              *
396      * [ OPCODE ] [ OPDATA ] [ OPNEXT ] [ OPERAND ] *
397      *                                              *
398      * char OPCODE - instruction                    *
399      * char OPDATA - modifying data                 *
400      * char OPNEXT - next node (relative offset)    *
401      *                                              *
402      ************************************************/
403 
404                  //   Opcode              Char       Opdata/Operand  Meaning
405                  //   ----------          ---------- --------------- --------------------------------------------------
406     static final char OP_END              = 'E';  //                 end of program
407     static final char OP_BOL              = '^';  //                 match only if at beginning of line
408     static final char OP_EOL              = '$';  //                 match only if at end of line
409     static final char OP_ANY              = '.';  //                 match any single character except newline
410     static final char OP_ANYOF            = '[';  // count/ranges    match any char in the list of ranges
411     static final char OP_BRANCH           = '|';  // node            match this alternative or the next one
412     static final char OP_ATOM             = 'A';  // length/string   length of string followed by string itself
413     static final char OP_STAR             = '*';  // node            kleene closure
414     static final char OP_PLUS             = '+';  // node            positive closure
415     static final char OP_MAYBE            = '?';  // node            optional closure
416     static final char OP_ESCAPE           = '\\'; // escape          special escape code char class (escape is E_* code)
417     static final char OP_OPEN             = '(';  // number          nth opening paren
418     static final char OP_OPEN_CLUSTER     = '<';  //                 opening cluster
419     static final char OP_CLOSE            = ')';  // number          nth closing paren
420     static final char OP_CLOSE_CLUSTER    = '>';  //                 closing cluster
421     static final char OP_BACKREF          = '#';  // number          reference nth already matched parenthesized string
422     static final char OP_GOTO             = 'G';  //                 nothing but a (back-)pointer
423     static final char OP_NOTHING          = 'N';  //                 match null string such as in '(a|)'
424     static final char OP_RELUCTANTSTAR    = '8';  // none/expr       reluctant '*' (mnemonic for char is unshifted '*')
425     static final char OP_RELUCTANTPLUS    = '=';  // none/expr       reluctant '+' (mnemonic for char is unshifted '+')
426     static final char OP_RELUCTANTMAYBE   = '/';  // none/expr       reluctant '?' (mnemonic for char is unshifted '?')
427     static final char OP_POSIXCLASS       = 'P';  // classid         one of the posix character classes
428 
429     // Escape codes
430     static final char E_ALNUM             = 'w';  // Alphanumeric
431     static final char E_NALNUM            = 'W';  // Non-alphanumeric
432     static final char E_BOUND             = 'b';  // Word boundary
433     static final char E_NBOUND            = 'B';  // Non-word boundary
434     static final char E_SPACE             = 's';  // Whitespace
435     static final char E_NSPACE            = 'S';  // Non-whitespace
436     static final char E_DIGIT             = 'd';  // Digit
437     static final char E_NDIGIT            = 'D';  // Non-digit
438 
439     // Posix character classes
440     static final char POSIX_CLASS_ALNUM   = 'w';  // Alphanumerics
441     static final char POSIX_CLASS_ALPHA   = 'a';  // Alphabetics
442     static final char POSIX_CLASS_BLANK   = 'b';  // Blanks
443     static final char POSIX_CLASS_CNTRL   = 'c';  // Control characters
444     static final char POSIX_CLASS_DIGIT   = 'd';  // Digits
445     static final char POSIX_CLASS_GRAPH   = 'g';  // Graphic characters
446     static final char POSIX_CLASS_LOWER   = 'l';  // Lowercase characters
447     static final char POSIX_CLASS_PRINT   = 'p';  // Printable characters
448     static final char POSIX_CLASS_PUNCT   = '!';  // Punctuation
449     static final char POSIX_CLASS_SPACE   = 's';  // Spaces
450     static final char POSIX_CLASS_UPPER   = 'u';  // Uppercase characters
451     static final char POSIX_CLASS_XDIGIT  = 'x';  // Hexadecimal digits
452     static final char POSIX_CLASS_JSTART  = 'j';  // Java identifier start
453     static final char POSIX_CLASS_JPART   = 'k';  // Java identifier part
454 
455     // Limits
456     static final int maxNode  = 65536;            // Maximum number of nodes in a program
457     static final int MAX_PAREN = 16;              // Number of paren pairs (only 9 can be backrefs)
458 
459     // Node layout constants
460     static final int offsetOpcode = 0;            // Opcode offset (first character)
461     static final int offsetOpdata = 1;            // Opdata offset (second char)
462     static final int offsetNext   = 2;            // Next index offset (third char)
463     static final int nodeSize     = 3;            // Node size (in chars)
464 
465     /** Line Separator */
466     static final String   NEWLINE = System.getProperty("line.separator");
467 
468     // State of current program
469     REProgram program;                            // Compiled regular expression 'program'
470     transient CharacterIterator search;           // The string being matched against
471     int matchFlags;                               // Match behaviour flags
472     int maxParen = MAX_PAREN;
473 
474     // Parenthesized subexpressions
475     transient int parenCount;                     // Number of subexpressions matched (num open parens + 1)
476     transient int start0;                         // Cache of start[0]
477     transient int end0;                           // Cache of start[0]
478     transient int start1;                         // Cache of start[1]
479     transient int end1;                           // Cache of start[1]
480     transient int start2;                         // Cache of start[2]
481     transient int end2;                           // Cache of start[2]
482     transient int[] startn;                       // Lazy-alloced array of sub-expression starts
483     transient int[] endn;                         // Lazy-alloced array of sub-expression ends
484 
485     // Backreferences
486     transient int[] startBackref;                 // Lazy-alloced array of backref starts
487     transient int[] endBackref;                   // Lazy-alloced array of backref ends
488 
489     /**
490      * Constructs a regular expression matcher from a String by compiling it
491      * using a new instance of RECompiler.  If you will be compiling many
492      * expressions, you may prefer to use a single RECompiler object instead.
493      * @param pattern The regular expression pattern to compile.
494      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
495      * @see RECompiler
496      * @see recompile
497      */
498     public RE(String   pattern) throws RESyntaxException
499     {
500         this(pattern, MATCH_NORMAL);
501     }
502 
503     /**
504      * Constructs a regular expression matcher from a String by compiling it
505      * using a new instance of RECompiler.  If you will be compiling many
506      * expressions, you may prefer to use a single RECompiler object instead.
507      * @param pattern The regular expression pattern to compile.
508      * @param matchFlags The matching style
509      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
510      * @see RECompiler
511      * @see recompile
512      */
513     public RE(String   pattern, int matchFlags) throws RESyntaxException
514     {
515         this(new RECompiler().compile(pattern));
516         setMatchFlags(matchFlags);
517     }
518 
519     /**
520      * Construct a matcher for a pre-compiled regular expression from program
521      * (bytecode) data.  Permits special flags to be passed in to modify matching
522      * behaviour.
523      * @param program Compiled regular expression program (see RECompiler and/or recompile)
524      * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
525      *
526      * <pre>
527      *
528      *   MATCH_NORMAL              // Normal (case-sensitive) matching
529      *   MATCH_CASEINDEPENDENT     // Case folded comparisons
530      *   MATCH_MULTILINE           // Newline matches as BOL/EOL
531      *
532      * </pre>
533      *
534      * @see RECompiler
535      * @see REProgram
536      * @see recompile
537      */
538     public RE(REProgram program, int matchFlags)
539     {
540         setProgram(program);
541         setMatchFlags(matchFlags);
542     }
543 
544     /**
545      * Construct a matcher for a pre-compiled regular expression from program
546      * (bytecode) data.
547      * @param program Compiled regular expression program
548      * @see RECompiler
549      * @see recompile
550      */
551     public RE(REProgram program)
552     {
553         this(program, MATCH_NORMAL);
554     }
555 
556     /**
557      * Constructs a regular expression matcher with no initial program.
558      * This is likely to be an uncommon practice, but is still supported.
559      */
560     public RE()
561     {
562         this((REProgram)null, MATCH_NORMAL);
563     }
564 
565     /**
566      * Converts a 'simplified' regular expression to a full regular expression
567      * @param pattern The pattern to convert
568      * @return The full regular expression
569      */
570     public static String   simplePatternToFullRegularExpression(String   pattern)
571     {
572         StringBuffer   buf = new StringBuffer  ();
573         for (int i = 0; i < pattern.length(); i++)
574         {
575             char c = pattern.charAt(i);
576             switch (c)
577             {
578                 case '*':
579                     buf.append(".*");
580                     break;
581 
582                 case '.':
583                 case '[':
584                 case ']':
585                 case '\\':
586                 case '+':
587                 case '?':
588                 case '{':
589                 case '}':
590                 case '$':
591                 case '^':
592                 case '|':
593                 case '(':
594                 case ')':
595                     buf.append('\\');
596                 default:
597                     buf.append(c);
598                     break;
599             }
600         }
601         return buf.toString();
602     }
603 
604     /**
605      * Sets match behaviour flags which alter the way RE does matching.
606      * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
607      *
608      * <pre>
609      *
610      *   MATCH_NORMAL              // Normal (case-sensitive) matching
611      *   MATCH_CASEINDEPENDENT     // Case folded comparisons
612      *   MATCH_MULTILINE           // Newline matches as BOL/EOL
613      *
614      * </pre>
615      *
616      */
617     public void setMatchFlags(int matchFlags)
618     {
619         this.matchFlags = matchFlags;
620     }
621 
622     /**
623      * Returns the current match behaviour flags.
624      * @return Current match behaviour flags (RE.MATCH_*).
625      *
626      * <pre>
627      *
628      *   MATCH_NORMAL              // Normal (case-sensitive) matching
629      *   MATCH_CASEINDEPENDENT     // Case folded comparisons
630      *   MATCH_MULTILINE           // Newline matches as BOL/EOL
631      *
632      * </pre>
633      *
634      * @see #setMatchFlags
635      *
636      */
637     public int getMatchFlags()
638     {
639         return matchFlags;
640     }
641 
642     /**
643      * Sets the current regular expression program used by this matcher object.
644      * @param program Regular expression program compiled by RECompiler.
645      * @see RECompiler
646      * @see REProgram
647      * @see recompile
648      */
649     public void setProgram(REProgram program)
650     {
651         this.program = program;
652         if (program != null && program.maxParens != -1) {
653             this.maxParen = program.maxParens;
654         } else {
655             this.maxParen = MAX_PAREN;
656         }
657     }
658 
659     /**
660      * Returns the current regular expression program in use by this matcher object.
661      * @return Regular expression program
662      * @see #setProgram
663      */
664     public REProgram getProgram()
665     {
666         return program;
667     }
668 
669     /**
670      * Returns the number of parenthesized subexpressions available after a successful match.
671      * @return Number of available parenthesized subexpressions
672      */
673     public int getParenCount()
674     {
675         return parenCount;
676     }
677 
678     /**
679      * Gets the contents of a parenthesized subexpression after a successful match.
680      * @param which Nesting level of subexpression
681      * @return String
682      */
683     public String   getParen(int which)
684     {
685         int start;
686         if (which < parenCount && (start = getParenStart(which)) >= 0)
687         {
688             return search.substring(start, getParenEnd(which));
689         }
690         return null;
691     }
692 
693     /**
694      * Returns the start index of a given paren level.
695      * @param which Nesting level of subexpression
696      * @return String index 
697      */
698     public final int getParenStart(int which)
699     {
700         if (which < parenCount)
701         {
702             switch (which)
703             {
704                 case 0:
705                     return start0;
706                     
707                 case 1:
708                     return start1;
709                     
710                 case 2:
711                     return start2;
712                     
713                 default:
714                     if (startn == null)
715                     {
716                         allocParens();
717                     }
718                     return startn[which];
719             }
720         }
721         return -1;
722     }
723 
724     /**
725      * Returns the end index of a given paren level.
726      * @param which Nesting level of subexpression
727      * @return String index 
728      */
729     public final int getParenEnd(int which)
730     {
731         if (which < parenCount)
732         {
733             switch (which)
734             {
735                 case 0:
736                     return end0;
737                     
738                 case 1:
739                     return end1;
740                     
741                 case 2:
742                     return end2;
743                     
744                 default:
745                     if (endn == null)
746                     {
747                         allocParens();
748                     }
749                     return endn[which];
750             }
751         }
752         return -1;
753     }
754 
755     /**
756      * Returns the length of a given paren level.
757      * @param which Nesting level of subexpression
758      * @return Number of characters in the parenthesized subexpression
759      */
760     public final int getParenLength(int which)
761     {
762         if (which < parenCount)
763         {
764             return getParenEnd(which) - getParenStart(which);
765         }
766         return -1;
767     }
768 
769     /**
770      * Sets the start of a paren level
771      * @param which Which paren level
772      * @param i Index in input array
773      */
774     protected final void setParenStart(int which, int i)
775     {
776         if (which < parenCount)
777         {
778             switch (which)
779             {
780                 case 0:
781                     start0 = i;
782                     break;
783                     
784                 case 1:
785                     start1 = i;
786                     break;
787                     
788                 case 2:
789                     start2 = i;
790                     break;
791                     
792                 default:
793                     if (startn == null)
794                     {
795                         allocParens();
796                     }
797                     startn[which] = i;
798                     break;
799             }
800         }
801     }
802 
803     /**
804      * Sets the end of a paren level
805      * @param which Which paren level
806      * @param i Index in input array
807      */
808     protected final void setParenEnd(int which, int i)
809     {
810         if (which < parenCount)
811         {
812             switch (which)
813             {
814                 case 0:
815                     end0 = i;
816                     break;
817                     
818                 case 1:
819                     end1 = i;
820                     break;
821                     
822                 case 2:
823                     end2 = i;
824                     break;
825                     
826                 default:
827                     if (endn == null)
828                     {
829                         allocParens();
830                     }
831                     endn[which] = i;
832                     break;
833             }
834         }
835     }
836 
837     /**
838      * Throws an Error representing an internal error condition probably resulting
839      * from a bug in the regular expression compiler (or possibly data corruption).
840      * In practice, this should be very rare.
841      * @param s Error description
842      */
843     protected void internalError(String   s) throws Error  
844     {
845         throw new Error  ("RE internal error: " + s);
846     }
847 
848     /**
849      * Performs lazy allocation of subexpression arrays
850      */
851     private final void allocParens()
852     {
853         // Allocate arrays for subexpressions
854         startn = new int[maxParen];
855         endn = new int[maxParen];
856 
857         // Set sub-expression pointers to invalid values
858         for (int i = 0; i < maxParen; i++)
859         {
860             startn[i] = -1;
861             endn[i] = -1;
862         }
863     }
864 
865     /**
866      * Try to match a string against a subset of nodes in the program
867      * @param firstNode Node to start at in program
868      * @param lastNode Last valid node (used for matching a subexpression without
869      * matching the rest of the program as well).
870      * @param idxStart Starting position in character array
871      * @return Final input array index if match succeeded.  -1 if not.
872      */
873     protected int matchNodes(int firstNode, int lastNode, int idxStart)
874     {
875         // Our current place in the string
876         int idx = idxStart;
877 
878         // Loop while node is valid
879         int next, opcode, opdata;
880         int idxNew;
881         char[] instruction = program.instruction;
882         for (int node = firstNode; node < lastNode; )
883         {
884             opcode = instruction[node + offsetOpcode];
885             next   = node + (short)instruction[node + offsetNext];
886             opdata = instruction[node + offsetOpdata];
887 
888             switch (opcode)
889             {
890                 case OP_RELUCTANTMAYBE:
891                     {
892                         int once = 0;
893                         do
894                         {
895                             // Try to match the rest without using the reluctant subexpr
896                             if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
897                             {
898                                 return idxNew;
899                             }
900                         }
901                         while ((once++ == 0) && (idx = matchNodes(node + nodeSize, next, idx)) != -1);
902                         return -1;
903                     }
904 
905                 case OP_RELUCTANTPLUS:
906                     while ((idx = matchNodes(node + nodeSize, next, idx)) != -1)
907                     {
908                         // Try to match the rest without using the reluctant subexpr
909                         if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
910                         {
911                             return idxNew;
912                         }
913                     }
914                     return -1;
915 
916                 case OP_RELUCTANTSTAR:
917                     do
918                     {
919                         // Try to match the rest without using the reluctant subexpr
920                         if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
921                         {
922                             return idxNew;
923                         }
924                     }
925                     while ((idx = matchNodes(node + nodeSize, next, idx)) != -1);
926                     return -1;
927 
928                 case OP_OPEN:
929 
930                     // Match subexpression
931                     if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
932                     {
933                         startBackref[opdata] = idx;
934                     }
935                     if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
936                     {
937                         // Increase valid paren count
938                         if ((opdata + 1) > parenCount)
939                         {
940                             parenCount = opdata + 1;
941                         }
942 
943                         // Don't set paren if already set later on
944                         if (getParenStart(opdata) == -1)
945                         {
946                             setParenStart(opdata, idx);
947                         }
948                     }
949                     return idxNew;
950 
951                 case OP_CLOSE:
952 
953                     // Done matching subexpression
954                     if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
955                     {
956                         endBackref[opdata] = idx;
957                     }
958                     if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
959                     {
960                         // Increase valid paren count
961                         if ((opdata + 1) > parenCount)
962                         {
963                             parenCount = opdata + 1;
964                         }
965 
966                         // Don't set paren if already set later on
967                         if (getParenEnd(opdata) == -1)
968                         {
969                             setParenEnd(opdata, idx);
970                         }
971                     }
972                     return idxNew;
973 
974                 case OP_OPEN_CLUSTER:
975                 case OP_CLOSE_CLUSTER:
976                     // starting or ending the matching of a subexpression which has no backref.
977                     return matchNodes( next, maxNode, idx );
978 
979                 case OP_BACKREF:
980                     {
981                         // Get the start and end of the backref
982                         int s = startBackref[opdata];
983                         int e = endBackref[opdata];
984 
985                         // We don't know the backref yet
986                         if (s == -1 || e == -1)
987                         {
988                             return -1;
989                         }
990 
991                         // The backref is empty size
992                         if (s == e)
993                         {
994                             break;
995                         }
996 
997                         // Get the length of the backref
998                         int l = e - s;
999 
1000                        // If there's not enough input left, give up.
1001                        if (search.isEnd(idx + l - 1))
1002                        {
1003                            return -1;
1004                        }
1005
1006                        // Case fold the backref?
1007                        if ((matchFlags & MATCH_CASEINDEPENDENT) != 0)
1008                        {
1009                            // Compare backref to input, case-folding as we go
1010                            for (int i = 0; i < l; i++)
1011                            {
1012                                if (Character.toLowerCase(search.charAt(idx++)) != Character.toLowerCase(search.charAt(s + i)))
1013                                {
1014                                    return -1;
1015                                }
1016                            }
1017                        }
1018                        else
1019                        {
1020                            // Compare backref to input
1021                            for (int i = 0; i < l; i++)
1022                            {
1023                                if (search.charAt(idx++) != search.charAt(s + i))
1024                                {
1025                                    return -1;
1026                                }
1027                            }
1028                        }
1029                    }
1030                    break;
1031
1032                case OP_BOL:
1033
1034                    // Fail if we're not at the start of the string
1035                    if (idx != 0)
1036                    {
1037                        // If we're multiline matching, we could still be at the start of a line
1038                        if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
1039                        {
1040                            // If not at start of line, give up
1041                            if (idx <= 0 || !isNewline(idx - 1)) {
1042                                return -1;
1043                            } else {
1044                                break;
1045                            }
1046                        }
1047                        return -1;
1048                    }
1049                    break;
1050
1051                case OP_EOL:
1052
1053                    // If we're not at the end of string
1054                    if (!search.isEnd(0) && !search.isEnd(idx))
1055                    {
1056                        // If we're multi-line matching
1057                        if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
1058                        {
1059                            // Give up if we're not at the end of a line
1060                            if (! isNewline(idx)) {
1061                                return -1;
1062                            } else {
1063                                break;
1064                            }
1065                        }
1066                        return -1;
1067                    }
1068                    break;
1069
1070                case OP_ESCAPE:
1071
1072                    // Which escape?
1073                    switch (opdata)
1074                    {
1075                        // Word boundary match
1076                        case E_NBOUND:
1077                        case E_BOUND:
1078                            {
1079                                char cLast = ((idx == 0) ? '\n' : search.charAt(idx - 1));
1080                                char cNext = ((search.isEnd(idx)) ? '\n' : search.charAt(idx));
1081                                if ((Character.isLetterOrDigit(cLast) == Character.isLetterOrDigit(cNext)) == (opdata == E_BOUND))
1082                                {
1083                                    return -1;
1084                                }
1085                            }
1086                            break;
1087
1088                        // Alpha-numeric, digit, space, javaLetter, javaLetterOrDigit
1089                        case E_ALNUM:
1090                        case E_NALNUM:
1091                        case E_DIGIT:
1092                        case E_NDIGIT:
1093                        case E_SPACE:
1094                        case E_NSPACE:
1095
1096                            // Give up if out of input
1097                            if (search.isEnd(idx))
1098                            {
1099                                return -1;
1100                            }
1101
1102                            char c = search.charAt(idx);
1103
1104                            // Switch on escape
1105                            switch (opdata)
1106                            {
1107                                case E_ALNUM:
1108                                case E_NALNUM:
1109                                    if (!((Character.isLetterOrDigit(c) || c == '_') == (opdata == E_ALNUM)))
1110                                    {
1111                                        return -1;
1112                                    }
1113                                    break;
1114
1115                                case E_DIGIT:
1116                                case E_NDIGIT:
1117                                    if (!(Character.isDigit(c) == (opdata == E_DIGIT)))
1118                                    {
1119                                        return -1;
1120                                    }
1121                                    break;
1122
1123                                case E_SPACE:
1124                                case E_NSPACE:
1125                                    if (!(Character.isWhitespace(c) == (opdata == E_SPACE)))
1126                                    {
1127                                        return -1;
1128                                    }
1129                                    break;
1130                            }
1131                            idx++;
1132                            break;
1133
1134                        default:
1135                            internalError("Unrecognized escape '" + opdata + "'");
1136                    }
1137                    break;
1138
1139                case OP_ANY:
1140
1141                    if((matchFlags & MATCH_SINGLELINE) == MATCH_SINGLELINE) {
1142                        // Match anything
1143                        if(search.isEnd(idx))
1144                        {
1145                            return -1;
1146                        }
1147                        idx++;
1148                        break;
1149                    }
1150                    else
1151                    {
1152                        // Match anything but a newline
1153                        if (search.isEnd(idx) || search.charAt(idx++) == '\n')
1154                        {
1155                            return -1;
1156                        }
1157                        break;
1158                    }
1159
1160                case OP_ATOM:
1161                    {
1162                        // Match an atom value
1163                        if (search.isEnd(idx))
1164                        {
1165                            return -1;
1166                        }
1167
1168                        // Get length of atom and starting index
1169                        int lenAtom = opdata;
1170                        int startAtom = node + nodeSize;
1171
1172                        // Give up if not enough input remains to have a match
1173                        if (search.isEnd(lenAtom + idx - 1))
1174                        {
1175                            return -1;
1176                        }
1177
1178                        // Match atom differently depending on casefolding flag
1179                        if ((matchFlags & MATCH_CASEINDEPENDENT) != 0)
1180                        {
1181                            for (int i = 0; i < lenAtom; i++)
1182                            {
1183                                if (Character.toLowerCase(search.charAt(idx++)) != Character.toLowerCase(instruction[startAtom + i]))
1184                                {
1185                                    return -1;
1186                                }
1187                            }
1188                        }
1189                        else
1190                        {
1191                            for (int i = 0; i < lenAtom; i++)
1192                            {
1193                                if (search.charAt(idx++) != instruction[startAtom + i])
1194                                {
1195                                    return -1;
1196                                }
1197                            }
1198                        }
1199                    }
1200                    break;
1201
1202                case OP_POSIXCLASS:
1203                    {
1204                        // Out of input?
1205                        if (search.isEnd(idx))
1206                        {
1207                            return -1;
1208                        }
1209                        
1210                        switch (opdata)
1211                        {
1212                            case POSIX_CLASS_ALNUM:
1213                                if (!Character.isLetterOrDigit(search.charAt(idx)))
1214                                {
1215                                    return -1;
1216                                }
1217                                break;
1218                                
1219                            case POSIX_CLASS_ALPHA:
1220                                if (!Character.isLetter(search.charAt(idx)))
1221                                {
1222                                    return -1;
1223                                }
1224                                break;
1225                                
1226                            case POSIX_CLASS_DIGIT:
1227                                if (!Character.isDigit(search.charAt(idx)))
1228                                {
1229                                    return -1;
1230                                }
1231                                break;
1232                                
1233                            case POSIX_CLASS_BLANK: // JWL - bugbug: is this right??
1234                                if (!Character.isSpaceChar(search.charAt(idx)))
1235                                {
1236                                    return -1;
1237                                }
1238                                break;
1239                                
1240                            case POSIX_CLASS_SPACE:
1241                                if (!Character.isWhitespace(search.charAt(idx)))
1242                                {
1243                                    return -1;
1244                                }
1245                                break;
1246                                
1247                            case POSIX_CLASS_CNTRL:
1248                                if (Character.getType(search.charAt(idx)) != Character.CONTROL)
1249                                {
1250                                    return -1;
1251                                }
1252                                break;
1253                                
1254                            case POSIX_CLASS_GRAPH: // JWL - bugbug???
1255                                switch (Character.getType(search.charAt(idx)))
1256                                {
1257                                    case Character.MATH_SYMBOL:
1258                                    case Character.CURRENCY_SYMBOL:
1259                                    case Character.MODIFIER_SYMBOL:
1260                                    case Character.OTHER_SYMBOL:
1261                                        break;
1262                                        
1263                                    default:
1264                                        return -1;
1265                                }
1266                                break;
1267                                
1268                            case POSIX_CLASS_LOWER:
1269                                if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER)
1270                                {
1271                                    return -1;
1272                                }
1273                                break;
1274                                
1275                            case POSIX_CLASS_UPPER:
1276                                if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER)
1277                                {
1278                                    return -1;
1279                                }
1280                                break;
1281                                
1282                            case POSIX_CLASS_PRINT:
1283                                if (Character.getType(search.charAt(idx)) == Character.CONTROL)
1284                                {
1285                                    return -1;
1286                                }
1287                                break;
1288                                
1289                            case POSIX_CLASS_PUNCT:
1290                            {
1291                                int type = Character.getType(search.charAt(idx));
1292                                switch(type)
1293                                {
1294                                    case Character.DASH_PUNCTUATION:
1295                                    case Character.START_PUNCTUATION:
1296                                    case Character.END_PUNCTUATION:
1297                                    case Character.CONNECTOR_PUNCTUATION:
1298                                    case Character.OTHER_PUNCTUATION:
1299                                        break;
1300                                        
1301                                    default:
1302                                        return -1;
1303                                }
1304                            }
1305                            break;
1306
1307                            case POSIX_CLASS_XDIGIT: // JWL - bugbug??
1308                            {
1309                                boolean isXDigit = ((search.charAt(idx) >= '0' && search.charAt(idx) <= '9') ||
1310                                                    (search.charAt(idx) >= 'a' && search.charAt(idx) <= 'f') ||
1311                                                    (search.charAt(idx) >= 'A' && search.charAt(idx) <= 'F'));
1312                                if (!isXDigit)
1313                                {
1314                                    return -1;
1315                                }
1316                            }
1317                            break;
1318                            
1319                            case POSIX_CLASS_JSTART:
1320                                if (!Character.isJavaIdentifierStart(search.charAt(idx)))
1321                                {
1322                                    return -1;
1323                                }
1324                                break;
1325                                
1326                            case POSIX_CLASS_JPART:
1327                                if (!Character.isJavaIdentifierPart(search.charAt(idx)))
1328                                {
1329                                    return -1;
1330                                }
1331                                break;
1332
1333                            default:
1334                                internalError("Bad posix class");
1335                                break;
1336                        }
1337                    
1338                        // Matched.
1339                        idx++;
1340                    }
1341                    break;
1342
1343                case OP_ANYOF:
1344                    {
1345                        // Out of input?
1346                        if (search.isEnd(idx))
1347                        {
1348                            return -1;
1349                        }
1350
1351                        // Get character to match against character class and maybe casefold
1352                        char c = search.charAt(idx);
1353                        boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
1354                        if (caseFold)
1355                        {
1356                            c = Character.toLowerCase(c);
1357                        }
1358
1359                        // Loop through character class checking our match character
1360                        int idxRange = node + nodeSize;
1361                        int idxEnd = idxRange + (opdata * 2);
1362                        boolean match = false;
1363                        for (int i = idxRange; i < idxEnd; )
1364                        {
1365                            // Get start, end and match characters
1366                            char s = instruction[i++];
1367                            char e = instruction[i++];
1368
1369                            // Fold ends of range and match character
1370                            if (caseFold)
1371                            {
1372                                s = Character.toLowerCase(s);
1373                                e = Character.toLowerCase(e);
1374                            }
1375
1376                            // If the match character is in range, break out
1377                            if (c >= s && c <= e)
1378                            {
1379                                match = true;
1380                                break;
1381                            }
1382                        }
1383
1384                        // Fail if we didn't match the character class
1385                        if (!match)
1386                        {
1387                            return -1;
1388                        }
1389                        idx++;
1390                    }
1391                    break;
1392
1393                case OP_BRANCH:
1394                {
1395                    // Check for choices
1396                    if (instruction[next + offsetOpcode] != OP_BRANCH)
1397                    {
1398                        // If there aren't any other choices, just evaluate this branch.
1399                        node += nodeSize;
1400                        continue;
1401                    }
1402
1403                    // Try all available branches
1404                    short nextBranch;
1405                    do
1406                    {
1407                        // Try matching the branch against the string
1408                        if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1)
1409                        {
1410                            return idxNew;
1411                        }
1412                        
1413                        // Go to next branch (if any)
1414                        nextBranch = (short)instruction[node + offsetNext];
1415                        node += nextBranch;
1416                    }
1417                    while (nextBranch != 0 && (instruction[node + offsetOpcode] == OP_BRANCH));
1418
1419                    // Failed to match any branch!
1420                    return -1;
1421                }
1422
1423                case OP_NOTHING:
1424                case OP_GOTO:
1425
1426                    // Just advance to the next node without doing anything
1427                    break;
1428
1429                case OP_END:
1430
1431                    // Match has succeeded!
1432                    setParenEnd(0, idx);
1433                    return idx;
1434
1435                default:
1436
1437                    // Corrupt program
1438                    internalError("Invalid opcode '" + opcode + "'");
1439            }
1440
1441            // Advance to the next node in the program
1442            node = next;
1443        }
1444
1445        // We "should" never end up here
1446        internalError("Corrupt program");
1447        return -1;
1448    }
1449
1450    /**
1451     * Match the current regular expression program against the current
1452     * input string, starting at index i of the input string.  This method
1453     * is only meant for internal use.
1454     * @param i The input string index to start matching at
1455     * @return True if the input matched the expression
1456     */
1457    protected boolean matchAt(int i)
1458    {
1459        // Initialize start pointer, paren cache and paren count
1460        start0 = -1;
1461        end0   = -1;
1462        start1 = -1;
1463        end1   = -1;
1464        start2 = -1;
1465        end2   = -1;
1466        startn = null;
1467        endn   = null;
1468        parenCount = 1;
1469        setParenStart(0, i);
1470
1471        // Allocate backref arrays (unless optimizations indicate otherwise)
1472        if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
1473        {
1474            startBackref = new int[maxParen];
1475            endBackref = new int[maxParen];
1476        }
1477
1478        // Match against string
1479        int idx;
1480        if ((idx = matchNodes(0, maxNode, i)) != -1)
1481        {
1482            setParenEnd(0, idx);
1483            return true;
1484        }
1485
1486        // Didn't match
1487        parenCount = 0;
1488        return false;
1489    }
1490
1491    /**
1492     * Matches the current regular expression program against a character array,
1493     * starting at a given index.
1494     * @param search String to match against
1495     * @param i Index to start searching at
1496     * @return True if string matched
1497     */
1498    public boolean match(String   search, int i) 
1499    {
1500        return match(new StringCharacterIterator(search), i);
1501    }
1502
1503    /**
1504     * Matches the current regular expression program against a character array,
1505     * starting at a given index.
1506     * @param search String to match against
1507     * @param i Index to start searching at
1508     * @return True if string matched
1509     */
1510    public boolean match(CharacterIterator search, int i)
1511    {
1512        // There is no compiled program to search with!
1513        if (program == null)
1514        {
1515            // This should be uncommon enough to be an error case rather
1516            // than an exception (which would have to be handled everywhere)
1517            internalError("No RE program to run!");
1518        }
1519
1520        // Save string to search
1521        this.search = search;
1522
1523        // Can we optimize the search by looking for a prefix string?
1524        if (program.prefix == null)
1525        {
1526            // Unprefixed matching must try for a match at each character
1527            for ( ;! search.isEnd(i - 1); i++)
1528            {
1529                // Try a match at index i
1530                if (matchAt(i))
1531                {
1532                    return true;
1533                }
1534            }
1535            return false;
1536        }
1537        else
1538        {
1539            // Prefix-anchored matching is possible
1540            boolean caseIndependent = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
1541            char[] prefix = program.prefix;
1542            for ( ;! search.isEnd(i + prefix.length - 1); i++)
1543            {
1544                // If the first character of the prefix matches
1545                boolean match = false;
1546                if (caseIndependent)
1547                    match = Character.toLowerCase(search.charAt(i)) == Character.toLowerCase(prefix[0]);
1548                else
1549                    match = search.charAt(i) == prefix[0];
1550                if (match)
1551                {
1552                    // Save first character position
1553                    int firstChar = i++;
1554                    int k;
1555                    for (k = 1; k < prefix.length; )
1556                    {
1557                        // If there's a mismatch of any character in the prefix, give up
1558                        if (caseIndependent)
1559                            match = Character.toLowerCase(search.charAt(i++)) == Character.toLowerCase(prefix[k++]);
1560                        else
1561                            match = search.charAt(i++) == prefix[k++];
1562                        if (!match)
1563                        {
1564                            break;
1565                        }
1566                    }
1567
1568                    // See if the whole prefix string matched
1569                    if (k == prefix.length)
1570                    {
1571                        // We matched the full prefix at firstChar, so try it
1572                        if (matchAt(firstChar))
1573                        {
1574                            return true;
1575                        }
1576                    }
1577
1578                    // Match failed, reset i to continue the search
1579                    i = firstChar;
1580                }
1581            }
1582            return false;
1583        }
1584    }
1585
1586    /**
1587     * Matches the current regular expression program against a String.
1588     * @param search String to match against
1589     * @return True if string matched
1590     */
1591    public boolean match(String   search)
1592    {
1593        return match(search, 0);
1594    }
1595
1596    /**
1597     * Splits a string into an array of strings on regular expression boundaries.
1598     * This function works the same way as the Perl function of the same name.
1599     * Given a regular expression of "[ab]+" and a string to split of
1600     * "xyzzyababbayyzabbbab123", the result would be the array of Strings
1601     * "[xyzzy, yyz, 123]".
1602     * @param s String to split on this regular exression
1603     * @return Array of strings
1604     */
1605    public String  [] split(String   s)
1606    {
1607        // Create new vector
1608        Vector   v = new Vector  ();
1609
1610        // Start at position 0 and search the whole string
1611        int pos = 0;
1612        int len = s.length();
1613
1614        // Try a match at each position
1615        while (pos < len && match(s, pos))
1616        {
1617            // Get start of match
1618            int start = getParenStart(0);
1619
1620            // Get end of match
1621            int newpos = getParenEnd(0);
1622
1623            // Check if no progress was made
1624            if (newpos == pos)
1625            {
1626                v.addElement(s.substring(pos, start + 1));
1627                newpos++;
1628            }
1629            else
1630            {
1631                v.addElement(s.substring(pos, start));
1632            }
1633
1634            // Move to new position
1635            pos = newpos;
1636        }
1637
1638        // Push remainder if it's not empty
1639        String   remainder = s.substring(pos);
1640        if (remainder.length() != 0)
1641        {
1642            v.addElement(remainder);
1643        }
1644
1645        // Return vector as an array of strings
1646        String  [] ret = new String  [v.size()];
1647        v.copyInto(ret);
1648        return ret;
1649    }
1650
1651    /**
1652     * Flag bit that indicates that subst should replace all occurrences of this
1653     * regular expression.
1654     */
1655    public static final int REPLACE_ALL            = 0x0000;
1656
1657    /**
1658     * Flag bit that indicates that subst should only replace the first occurrence
1659     * of this regular expression.
1660     */
1661    public static final int REPLACE_FIRSTONLY      = 0x0001;
1662
1663    /**
1664     * Flag bit that indicates that subst should replace backreferences
1665     */
1666    public static final int REPLACE_BACKREFERENCES = 0x0002;
1667
1668    /**
1669     * Substitutes a string for this regular expression in another string.
1670     * This method works like the Perl function of the same name.
1671     * Given a regular expression of "a*b", a String to substituteIn of
1672     * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
1673     * resulting String returned by subst would be "-foo-garply-wacky-".
1674     *
1675     * @param substituteIn String to substitute within
1676     * @param substitution String to substitute for all matches of this regular expression.
1677     * @return The string substituteIn with zero or more occurrences of the current
1678     * regular expression replaced with the substitution String (if this regular
1679     * expression object doesn't match at any position, the original String is returned
1680     * unchanged).
1681     */
1682    public String   subst(String   substituteIn, String   substitution)
1683    {
1684        return subst(substituteIn, substitution, REPLACE_ALL);
1685    }
1686
1687    /**
1688     * Substitutes a string for this regular expression in another string.
1689     * This method works like the Perl function of the same name.
1690     * Given a regular expression of "a*b", a String to substituteIn of
1691     * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
1692     * resulting String returned by subst would be "-foo-garply-wacky-".
1693     * <p>
1694     * It is also possible to reference the contents of a parenthesized expression
1695     * with $0, $1, ... $9. A regular expression of "http://[\\.\\w\\-\\?/~_@&=%]+",
1696     * a String to substituteIn of "visit us: http://www.apache.org!" and the
1697     * substitution String "&lt;a HREF=\"$0\"&gt;$0&lt;/a&gt;", the resulting String
1698     * returned by subst would be 
1699     * "visit us: &lt;a HREF=\"http://www.apache.org\"&gt;http://www.apache.org&lt;/a&gt;!".
1700     * <p>
1701     * <i>Note:</i> $0 represents the whole match.
1702     *
1703     * @param substituteIn String to substitute within
1704     * @param substitution String to substitute for matches of this regular expression
1705     * @param flags One or more bitwise flags from REPLACE_*.  If the REPLACE_FIRSTONLY
1706     * flag bit is set, only the first occurrence of this regular expression is replaced.
1707     * If the bit is not set (REPLACE_ALL), all occurrences of this pattern will be
1708     * replaced. If the flag REPLACE_BACKREFERENCES is set, all backreferences will
1709     * be processed.
1710     * @return The string substituteIn with zero or more occurrences of the current
1711     * regular expression replaced with the substitution String (if this regular
1712     * expression object doesn't match at any position, the original String is returned
1713     * unchanged).
1714     */
1715    public String   subst(String   substituteIn, String   substitution, int flags)
1716    {
1717        // String to return
1718        StringBuffer   ret = new StringBuffer  ();
1719
1720        // Start at position 0 and search the whole string
1721        int pos = 0;
1722        int len = substituteIn.length();
1723
1724        // Try a match at each position
1725        while (pos < len && match(substituteIn, pos))
1726        {
1727            // Append string before match
1728            ret.append(substituteIn.substring(pos, getParenStart(0)));
1729
1730            if ((flags & REPLACE_BACKREFERENCES) != 0)
1731            {
1732                // Process backreferences
1733                int lCurrentPosition = 0;
1734                int lLastPosition = 0;
1735                int lLength = substitution.length();
1736
1737                while ((lCurrentPosition = substitution.indexOf("$", lCurrentPosition)) >= 0)
1738                {
1739                    if ((lCurrentPosition == 0 || substitution.charAt(lCurrentPosition - 1) != '\\')
1740                        && lCurrentPosition+1 < lLength)
1741                    {
1742                        char c = substitution.charAt(lCurrentPosition + 1);
1743                        if (c >= '0' && c <= '9')
1744                        {
1745                            // Append everything between the last and the current $ sign
1746                            ret.append(substitution.substring(lLastPosition+2, lCurrentPosition));
1747
1748                            // Append the parenthesized expression
1749                            // Note: if a parenthesized expression of the requested
1750                            // index is not available "null" is added to the string
1751                            ret.append(getParen(c - '0'));
1752                            lLastPosition = lCurrentPosition;
1753                        }
1754                    }
1755
1756                    // Move forward, skipping past match
1757                    lCurrentPosition++;
1758                }
1759
1760                // Append everything after the last $ sign
1761                ret.append(substitution.substring(lLastPosition+2,lLength));
1762            }
1763            else
1764            {
1765                // Append substitution without processing backreferences
1766                ret.append(substitution);
1767            }
1768
1769            // Move forward, skipping past match
1770            int newpos = getParenEnd(0);
1771
1772            // We always want to make progress! 
1773            if (newpos == pos)
1774            {
1775                newpos++;
1776            }
1777
1778            // Try new position
1779            pos = newpos;
1780
1781            // Break out if we're only supposed to replace one occurrence
1782            if ((flags & REPLACE_FIRSTONLY) != 0)
1783            {
1784                break;
1785            }
1786        }
1787
1788        // If there's remaining input, append it
1789        if (pos < len)
1790        {
1791            ret.append(substituteIn.substring(pos));
1792        }
1793
1794        // Return string buffer as string 
1795        return ret.toString();
1796    }  
1797
1798    /**
1799     * Returns an array of Strings, whose toString representation matches a regular
1800     * expression. This method works like the Perl function of the same name.  Given
1801     * a regular expression of "a*b" and an array of String objects of [foo, aab, zzz,
1802     * aaaab], the array of Strings returned by grep would be [aab, aaaab].
1803     * @param search Array of Objects to search 
1804     * @return Array of Strings whose toString() value matches this regular expression.
1805     */
1806    public String  [] grep(Object  [] search)
1807    {
1808        // Create new vector to hold return items
1809        Vector   v = new Vector  ();
1810
1811        // Traverse array of objects
1812        for (int i = 0; i < search.length; i++)
1813        {
1814            // Get next object as a string
1815            String   s = search[i].toString();
1816
1817            // If it matches this regexp, add it to the list
1818            if (match(s))
1819            {
1820                v.addElement(s);
1821            }
1822        }
1823
1824        // Return vector as an array of strings
1825        String  [] ret = new String  [v.size()];
1826        v.copyInto(ret);
1827        return ret;
1828    }
1829
1830    /** @return true if at the i-th position in the 'search' a newline ends */
1831    private boolean isNewline(int i) {
1832
1833        if (i < NEWLINE.length() - 1) {
1834            return false;
1835        }
1836
1837        if (search.charAt(i) == '\n') {
1838            return true;
1839        }
1840
1841        for (int j = NEWLINE.length() - 1; j >= 0; j--, i--) {
1842            if (NEWLINE.charAt(j) != search.charAt(i)) {
1843                return false;
1844            }
1845        }
1846        return true;
1847    }
1848}
1849
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags