KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > regexp > RE


1 package org.apache.regexp;
2
3 /*
4  * ====================================================================
5  *
6  * The Apache Software License, Version 1.1
7  *
8  * Copyright (c) 1999-2003 The Apache Software Foundation. All rights
9  * reserved.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  *
15  * 1. Redistributions of source code must retain the above copyright
16  * notice, this list of conditions and the following disclaimer.
17  *
18  * 2. Redistributions in binary form must reproduce the above copyright
19  * notice, this list of conditions and the following disclaimer in
20  * the documentation and/or other materials provided with the
21  * distribution.
22  *
23  * 3. The end-user documentation included with the redistribution, if
24  * any, must include the following acknowlegement:
25  * "This product includes software developed by the
26  * Apache Software Foundation (http://www.apache.org/)."
27  * Alternately, this acknowlegement may appear in the software itself,
28  * if and wherever such third-party acknowlegements normally appear.
29  *
30  * 4. The names "The Jakarta Project", "Jakarta-Regexp", and "Apache Software
31  * Foundation" must not be used to endorse or promote products derived
32  * from this software without prior written permission. For written
33  * permission, please contact apache@apache.org.
34  *
35  * 5. Products derived from this software may not be called "Apache"
36  * nor may "Apache" appear in their names without prior written
37  * permission of the Apache Group.
38  *
39  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
40  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
41  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
42  * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
43  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
45  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
46  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
47  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
48  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
49  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50  * SUCH DAMAGE.
51  * ====================================================================
52  *
53  * This software consists of voluntary contributions made by many
54  * individuals on behalf of the Apache Software Foundation. For more
55  * information on the Apache Software Foundation, please see
56  * <http://www.apache.org/>.
57  *
58  */

59  
60 import java.io.Serializable JavaDoc;
61 import java.util.Vector JavaDoc;
62
63 /**
64  * RE is an efficient, lightweight regular expression evaluator/matcher
65  * class. Regular expressions are pattern descriptions which enable
66  * sophisticated matching of strings. In addition to being able to
67  * match a string against a pattern, you can also extract parts of the
68  * match. This is especially useful in text parsing! Details on the
69  * syntax of regular expression patterns are given below.
70  *
71  * <p>
72  *
73  * To compile a regular expression (RE), you can simply construct an RE
74  * matcher object from the string specification of the pattern, like this:
75  *
76  * <pre>
77  *
78  * RE r = new RE("a*b");
79  *
80  * </pre>
81  *
82  * <p>
83  *
84  * Once you have done this, you can call either of the RE.match methods to
85  * perform matching on a String. For example:
86  *
87  * <pre>
88  *
89  * boolean matched = r.match("aaaab");
90  *
91  * </pre>
92  *
93  * will cause the boolean matched to be set to true because the
94  * pattern "a*b" matches the string "aaaab".
95  *
96  * <p>
97  * If you were interested in the <i>number</i> of a's which matched the
98  * first part of our example expression, you could change the expression to
99  * "(a*)b". Then when you compiled the expression and matched it against
100  * something like "xaaaab", you would get results like this:
101  *
102  * <pre>
103  *
104  * RE r = new RE("(a*)b"); // Compile expression
105  * boolean matched = r.match("xaaaab"); // Match against "xaaaab"
106  *
107  * <br>
108  *
109  * String wholeExpr = r.getParen(0); // wholeExpr will be 'aaaab'
110  * String insideParens = r.getParen(1); // insideParens will be 'aaaa'
111  *
112  * <br>
113  *
114  * int startWholeExpr = r.getParenStart(0); // startWholeExpr will be index 1
115  * int endWholeExpr = r.getParenEnd(0); // endWholeExpr will be index 6
116  * int lenWholeExpr = r.getParenLength(0); // lenWholeExpr will be 5
117  *
118  * <br>
119  *
120  * int startInside = r.getParenStart(1); // startInside will be index 1
121  * int endInside = r.getParenEnd(1); // endInside will be index 5
122  * int lenInside = r.getParenLength(1); // lenInside will be 4
123  *
124  * </pre>
125  *
126  * You can also refer to the contents of a parenthesized expression
127  * within a regular expression itself. This is called a
128  * 'backreference'. The first backreference in a regular expression is
129  * denoted by \1, the second by \2 and so on. So the expression:
130  *
131  * <pre>
132  *
133  * ([0-9]+)=\1
134  *
135  * </pre>
136  *
137  * will match any string of the form n=n (like 0=0 or 2=2).
138  *
139  * <p>
140  *
141  * The full regular expression syntax accepted by RE is described here:
142  *
143  * <pre>
144  *
145  * <br>
146  *
147  * <b><font face=times roman>Characters</font></b>
148  *
149  * <br>
150  *
151  * <i>unicodeChar</i> Matches any identical unicode character
152  * \ Used to quote a meta-character (like '*')
153  * \\ Matches a single '\' character
154  * \0nnn Matches a given octal character
155  * \xhh Matches a given 8-bit hexadecimal character
156  * \\uhhhh Matches a given 16-bit hexadecimal character
157  * \t Matches an ASCII tab character
158  * \n Matches an ASCII newline character
159  * \r Matches an ASCII return character
160  * \f Matches an ASCII form feed character
161  *
162  * <br>
163  *
164  * <b><font face=times roman>Character Classes</font></b>
165  *
166  * <br>
167  *
168  * [abc] Simple character class
169  * [a-zA-Z] Character class with ranges
170  * [^abc] Negated character class
171  *
172  * <br>
173  *
174  * <b><font face=times roman>Standard POSIX Character Classes</font></b>
175  *
176  * <br>
177  *
178  * [:alnum:] Alphanumeric characters.
179  * [:alpha:] Alphabetic characters.
180  * [:blank:] Space and tab characters.
181  * [:cntrl:] Control characters.
182  * [:digit:] Numeric characters.
183  * [:graph:] Characters that are printable and are also visible.
184  * (A space is printable, but not visible, while an
185  * `a' is both.)
186  * [:lower:] Lower-case alphabetic characters.
187  * [:print:] Printable characters (characters that are not
188  * control characters.)
189  * [:punct:] Punctuation characters (characters that are not letter,
190  * digits, control characters, or space characters).
191  * [:space:] Space characters (such as space, tab, and formfeed,
192  * to name a few).
193  * [:upper:] Upper-case alphabetic characters.
194  * [:xdigit:] Characters that are hexadecimal digits.
195  *
196  * <br>
197  *
198  * <b><font face=times roman>Non-standard POSIX-style Character
199  * Classes</font></b>
200  *
201  * <br>
202  *
203  * [:javastart:] Start of a Java identifier
204  * [:javapart:] Part of a Java identifier
205  *
206  * <br>
207  *
208  * <b><font face=times roman>Predefined Classes</font></b>
209  *
210  * <br>
211  *
212  * . Matches any character other than newline
213  * \w Matches a "word" character (alphanumeric plus "_")
214  * \W Matches a non-word character
215  * \s Matches a whitespace character
216  * \S Matches a non-whitespace character
217  * \d Matches a digit character
218  * \D Matches a non-digit character
219  *
220  * <br>
221  *
222  * <b><font face=times roman>Boundary Matchers</font></b>
223  *
224  * <br>
225  *
226  * ^ Matches only at the beginning of a line
227  * $ Matches only at the end of a line
228  * \b Matches only at a word boundary
229  * \B Matches only at a non-word boundary
230  *
231  * <br>
232  *
233  * <b><font face=times roman>Greedy Closures</font></b>
234  *
235  * <br>
236  *
237  * A* Matches A 0 or more times (greedy)
238  * A+ Matches A 1 or more times (greedy)
239  * A? Matches A 1 or 0 times (greedy)
240  * A{n} Matches A exactly n times (greedy)
241  * A{n,} Matches A at least n times (greedy)
242  * A{n,m} Matches A at least n but not more than m times (greedy)
243  *
244  * <br>
245  *
246  * <b><font face=times roman>Reluctant Closures</font></b>
247  *
248  * <br>
249  *
250  * A*? Matches A 0 or more times (reluctant)
251  * A+? Matches A 1 or more times (reluctant)
252  * A?? Matches A 0 or 1 times (reluctant)
253  *
254  * <br>
255  *
256  * <b><font face=times roman>Logical Operators</font></b>
257  *
258  * <br>
259  *
260  * AB Matches A followed by B
261  * A|B Matches either A or B
262  * (A) Used for subexpression grouping
263  * (?:A) Used for subexpression clustering (just like grouping but
264  * no backrefs)
265  *
266  * <br>
267  *
268  * <b><font face=times roman>Backreferences</font></b>
269  *
270  * <br>
271  *
272  * \1 Backreference to 1st parenthesized subexpression
273  * \2 Backreference to 2nd parenthesized subexpression
274  * \3 Backreference to 3rd parenthesized subexpression
275  * \4 Backreference to 4th parenthesized subexpression
276  * \5 Backreference to 5th parenthesized subexpression
277  * \6 Backreference to 6th parenthesized subexpression
278  * \7 Backreference to 7th parenthesized subexpression
279  * \8 Backreference to 8th parenthesized subexpression
280  * \9 Backreference to 9th parenthesized subexpression
281  *
282  * <br>
283  *
284  * </pre>
285  *
286  * <p>
287  *
288  * All closure operators (+, *, ?, {m,n}) are greedy by default, meaning
289  * that they match as many elements of the string as possible without
290  * causing the overall match to fail. If you want a closure to be
291  * reluctant (non-greedy), you can simply follow it with a '?'. A
292  * reluctant closure will match as few elements of the string as
293  * possible when finding matches. {m,n} closures don't currently
294  * support reluctancy.
295  *
296  * <p>
297  *
298  * RE runs programs compiled by the RECompiler class. But the RE
299  * matcher class does not include the actual regular expression compiler
300  * for reasons of efficiency. In fact, if you want to pre-compile one
301  * or more regular expressions, the 'recompile' class can be invoked
302  * from the command line to produce compiled output like this:
303  *
304  * <pre>
305  *
306  * // Pre-compiled regular expression "a*b"
307  * char[] re1Instructions =
308  * {
309  * 0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041,
310  * 0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047,
311  * 0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000,
312  * 0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000,
313  * 0x0000,
314  * };
315  *
316  * <br>
317  *
318  * REProgram re1 = new REProgram(re1Instructions);
319  *
320  * </pre>
321  *
322  * You can then construct a regular expression matcher (RE) object from
323  * the pre-compiled expression re1 and thus avoid the overhead of
324  * compiling the expression at runtime. If you require more dynamic
325  * regular expressions, you can construct a single RECompiler object and
326  * re-use it to compile each expression. * Similarly, you can change the
327  * program run by a given matcher object at any time. * However, RE and
328  * RECompiler are not threadsafe (for efficiency reasons, and because
329  * requiring thread safety in this class is deemed to be a rare
330  * requirement), so you will need to construct a separate compiler or
331  * matcher object for each thread (unless you do thread synchronization
332  * yourself).
333  *
334  * </pre>
335  * <br><p><br>
336  *
337  * <font color=red>
338  * <i>ISSUES:</i>
339  *
340  * <ul>
341  * <li>com.weusours.util.re is not currently compatible with all
342  * standard POSIX regcomp flags</li>
343  * <li>com.weusours.util.re does not support POSIX equivalence classes
344  * ([=foo=] syntax) (I18N/locale issue)</li>
345  * <li>com.weusours.util.re does not support nested POSIX character
346  * classes (definitely should, but not completely trivial)</li>
347  * <li>com.weusours.util.re Does not support POSIX character collation
348  * concepts ([.foo.] syntax) (I18N/locale issue)</li>
349  * <li>Should there be different matching styles (simple, POSIX, Perl etc?)</li>
350  * <li>Should RE support character iterators (for backwards RE matching!)?</li>
351  * <li>Should RE support reluctant {m,n} closures (does anyone care)?</li>
352  * <li>Not *all* possibilities are considered for greediness when backreferences
353  * are involved (as POSIX suggests should be the case). The POSIX RE
354  * "(ac*)c*d[ac]*\1", when matched against "acdacaa" should yield a match
355  * of acdacaa where \1 is "a". This is not the case in this RE package,
356  * and actually Perl doesn't go to this extent either! Until someone
357  * actually complains about this, I'm not sure it's worth "fixing".
358  * If it ever is fixed, test #137 in RETest.txt should be updated.</li>
359  * </ul>
360  *
361  * </font>
362  *
363  * @see recompile
364  * @see RECompiler
365  *
366  * @author <a HREF="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
367  * @author <a HREF="mailto:ts@sch-fer.de">Tobias Sch&auml;fer</a>
368  * @version $Id: RE.java,v 1.13 2003/06/02 02:18:41 vgritsenko Exp $
369  */

370 public class RE implements Serializable JavaDoc
371 {
372     /**
373      * Specifies normal, case-sensitive matching behaviour.
374      */

375     public static final int MATCH_NORMAL = 0x0000;
376
377     /**
378      * Flag to indicate that matching should be case-independent (folded)
379      */

380     public static final int MATCH_CASEINDEPENDENT = 0x0001;
381
382     /**
383      * Newlines should match as BOL/EOL (^ and $)
384      */

385     public static final int MATCH_MULTILINE = 0x0002;
386
387     /**
388      * Consider all input a single body of text - newlines are matched by .
389      */

390     public static final int MATCH_SINGLELINE = 0x0004;
391
392     /************************************************
393      * *
394      * The format of a node in a program is: *
395      * *
396      * [ OPCODE ] [ OPDATA ] [ OPNEXT ] [ OPERAND ] *
397      * *
398      * char OPCODE - instruction *
399      * char OPDATA - modifying data *
400      * char OPNEXT - next node (relative offset) *
401      * *
402      ************************************************/

403
404                  // Opcode Char Opdata/Operand Meaning
405
// ---------- ---------- --------------- --------------------------------------------------
406
static final char OP_END = 'E'; // end of program
407
static final char OP_BOL = '^'; // match only if at beginning of line
408
static final char OP_EOL = '$'; // match only if at end of line
409
static final char OP_ANY = '.'; // match any single character except newline
410
static final char OP_ANYOF = '['; // count/ranges match any char in the list of ranges
411
static final char OP_BRANCH = '|'; // node match this alternative or the next one
412
static final char OP_ATOM = 'A'; // length/string length of string followed by string itself
413
static final char OP_STAR = '*'; // node kleene closure
414
static final char OP_PLUS = '+'; // node positive closure
415
static final char OP_MAYBE = '?'; // node optional closure
416
static final char OP_ESCAPE = '\\'; // escape special escape code char class (escape is E_* code)
417
static final char OP_OPEN = '('; // number nth opening paren
418
static final char OP_OPEN_CLUSTER = '<'; // opening cluster
419
static final char OP_CLOSE = ')'; // number nth closing paren
420
static final char OP_CLOSE_CLUSTER = '>'; // closing cluster
421
static final char OP_BACKREF = '#'; // number reference nth already matched parenthesized string
422
static final char OP_GOTO = 'G'; // nothing but a (back-)pointer
423
static final char OP_NOTHING = 'N'; // match null string such as in '(a|)'
424
static final char OP_RELUCTANTSTAR = '8'; // none/expr reluctant '*' (mnemonic for char is unshifted '*')
425
static final char OP_RELUCTANTPLUS = '='; // none/expr reluctant '+' (mnemonic for char is unshifted '+')
426
static final char OP_RELUCTANTMAYBE = '/'; // none/expr reluctant '?' (mnemonic for char is unshifted '?')
427
static final char OP_POSIXCLASS = 'P'; // classid one of the posix character classes
428

429     // Escape codes
430
static final char E_ALNUM = 'w'; // Alphanumeric
431
static final char E_NALNUM = 'W'; // Non-alphanumeric
432
static final char E_BOUND = 'b'; // Word boundary
433
static final char E_NBOUND = 'B'; // Non-word boundary
434
static final char E_SPACE = 's'; // Whitespace
435
static final char E_NSPACE = 'S'; // Non-whitespace
436
static final char E_DIGIT = 'd'; // Digit
437
static final char E_NDIGIT = 'D'; // Non-digit
438

439     // Posix character classes
440
static final char POSIX_CLASS_ALNUM = 'w'; // Alphanumerics
441
static final char POSIX_CLASS_ALPHA = 'a'; // Alphabetics
442
static final char POSIX_CLASS_BLANK = 'b'; // Blanks
443
static final char POSIX_CLASS_CNTRL = 'c'; // Control characters
444
static final char POSIX_CLASS_DIGIT = 'd'; // Digits
445
static final char POSIX_CLASS_GRAPH = 'g'; // Graphic characters
446
static final char POSIX_CLASS_LOWER = 'l'; // Lowercase characters
447
static final char POSIX_CLASS_PRINT = 'p'; // Printable characters
448
static final char POSIX_CLASS_PUNCT = '!'; // Punctuation
449
static final char POSIX_CLASS_SPACE = 's'; // Spaces
450
static final char POSIX_CLASS_UPPER = 'u'; // Uppercase characters
451
static final char POSIX_CLASS_XDIGIT = 'x'; // Hexadecimal digits
452
static final char POSIX_CLASS_JSTART = 'j'; // Java identifier start
453
static final char POSIX_CLASS_JPART = 'k'; // Java identifier part
454

455     // Limits
456
static final int maxNode = 65536; // Maximum number of nodes in a program
457
static final int MAX_PAREN = 16; // Number of paren pairs (only 9 can be backrefs)
458

459     // Node layout constants
460
static final int offsetOpcode = 0; // Opcode offset (first character)
461
static final int offsetOpdata = 1; // Opdata offset (second char)
462
static final int offsetNext = 2; // Next index offset (third char)
463
static final int nodeSize = 3; // Node size (in chars)
464

465     /** Line Separator */
466     static final String JavaDoc NEWLINE = System.getProperty("line.separator");
467
468     // State of current program
469
REProgram program; // Compiled regular expression 'program'
470
transient CharacterIterator search; // The string being matched against
471
int matchFlags; // Match behaviour flags
472
int maxParen = MAX_PAREN;
473
474     // Parenthesized subexpressions
475
transient int parenCount; // Number of subexpressions matched (num open parens + 1)
476
transient int start0; // Cache of start[0]
477
transient int end0; // Cache of start[0]
478
transient int start1; // Cache of start[1]
479
transient int end1; // Cache of start[1]
480
transient int start2; // Cache of start[2]
481
transient int end2; // Cache of start[2]
482
transient int[] startn; // Lazy-alloced array of sub-expression starts
483
transient int[] endn; // Lazy-alloced array of sub-expression ends
484

485     // Backreferences
486
transient int[] startBackref; // Lazy-alloced array of backref starts
487
transient int[] endBackref; // Lazy-alloced array of backref ends
488

489     /**
490      * Constructs a regular expression matcher from a String by compiling it
491      * using a new instance of RECompiler. If you will be compiling many
492      * expressions, you may prefer to use a single RECompiler object instead.
493      * @param pattern The regular expression pattern to compile.
494      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
495      * @see RECompiler
496      * @see recompile
497      */

498     public RE(String JavaDoc pattern) throws RESyntaxException
499     {
500         this(pattern, MATCH_NORMAL);
501     }
502
503     /**
504      * Constructs a regular expression matcher from a String by compiling it
505      * using a new instance of RECompiler. If you will be compiling many
506      * expressions, you may prefer to use a single RECompiler object instead.
507      * @param pattern The regular expression pattern to compile.
508      * @param matchFlags The matching style
509      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
510      * @see RECompiler
511      * @see recompile
512      */

513     public RE(String JavaDoc pattern, int matchFlags) throws RESyntaxException
514     {
515         this(new RECompiler().compile(pattern));
516         setMatchFlags(matchFlags);
517     }
518
519     /**
520      * Construct a matcher for a pre-compiled regular expression from program
521      * (bytecode) data. Permits special flags to be passed in to modify matching
522      * behaviour.
523      * @param program Compiled regular expression program (see RECompiler and/or recompile)
524      * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
525      *
526      * <pre>
527      *
528      * MATCH_NORMAL // Normal (case-sensitive) matching
529      * MATCH_CASEINDEPENDENT // Case folded comparisons
530      * MATCH_MULTILINE // Newline matches as BOL/EOL
531      *
532      * </pre>
533      *
534      * @see RECompiler
535      * @see REProgram
536      * @see recompile
537      */

538     public RE(REProgram program, int matchFlags)
539     {
540         setProgram(program);
541         setMatchFlags(matchFlags);
542     }
543
544     /**
545      * Construct a matcher for a pre-compiled regular expression from program
546      * (bytecode) data.
547      * @param program Compiled regular expression program
548      * @see RECompiler
549      * @see recompile
550      */

551     public RE(REProgram program)
552     {
553         this(program, MATCH_NORMAL);
554     }
555
556     /**
557      * Constructs a regular expression matcher with no initial program.
558      * This is likely to be an uncommon practice, but is still supported.
559      */

560     public RE()
561     {
562         this((REProgram)null, MATCH_NORMAL);
563     }
564
565     /**
566      * Converts a 'simplified' regular expression to a full regular expression
567      * @param pattern The pattern to convert
568      * @return The full regular expression
569      */

570     public static String JavaDoc simplePatternToFullRegularExpression(String JavaDoc pattern)
571     {
572         StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
573         for (int i = 0; i < pattern.length(); i++)
574         {
575             char c = pattern.charAt(i);
576             switch (c)
577             {
578                 case '*':
579                     buf.append(".*");
580                     break;
581
582                 case '.':
583                 case '[':
584                 case ']':
585                 case '\\':
586                 case '+':
587                 case '?':
588                 case '{':
589                 case '}':
590                 case '$':
591                 case '^':
592                 case '|':
593                 case '(':
594                 case ')':
595                     buf.append('\\');
596                 default:
597                     buf.append(c);
598                     break;
599             }
600         }
601         return buf.toString();
602     }
603
604     /**
605      * Sets match behaviour flags which alter the way RE does matching.
606      * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
607      *
608      * <pre>
609      *
610      * MATCH_NORMAL // Normal (case-sensitive) matching
611      * MATCH_CASEINDEPENDENT // Case folded comparisons
612      * MATCH_MULTILINE // Newline matches as BOL/EOL
613      *
614      * </pre>
615      *
616      */

617     public void setMatchFlags(int matchFlags)
618     {
619         this.matchFlags = matchFlags;
620     }
621
622     /**
623      * Returns the current match behaviour flags.
624      * @return Current match behaviour flags (RE.MATCH_*).
625      *
626      * <pre>
627      *
628      * MATCH_NORMAL // Normal (case-sensitive) matching
629      * MATCH_CASEINDEPENDENT // Case folded comparisons
630      * MATCH_MULTILINE // Newline matches as BOL/EOL
631      *
632      * </pre>
633      *
634      * @see #setMatchFlags
635      *
636      */

637     public int getMatchFlags()
638     {
639         return matchFlags;
640     }
641
642     /**
643      * Sets the current regular expression program used by this matcher object.
644      * @param program Regular expression program compiled by RECompiler.
645      * @see RECompiler
646      * @see REProgram
647      * @see recompile
648      */

649     public void setProgram(REProgram program)
650     {
651         this.program = program;
652         if (program != null && program.maxParens != -1) {
653             this.maxParen = program.maxParens;
654         } else {
655             this.maxParen = MAX_PAREN;
656         }
657     }
658
659     /**
660      * Returns the current regular expression program in use by this matcher object.
661      * @return Regular expression program
662      * @see #setProgram
663      */

664     public REProgram getProgram()
665     {
666         return program;
667     }
668
669     /**
670      * Returns the number of parenthesized subexpressions available after a successful match.
671      * @return Number of available parenthesized subexpressions
672      */

673     public int getParenCount()
674     {
675         return parenCount;
676     }
677
678     /**
679      * Gets the contents of a parenthesized subexpression after a successful match.
680      * @param which Nesting level of subexpression
681      * @return String
682      */

683     public String JavaDoc getParen(int which)
684     {
685         int start;
686         if (which < parenCount && (start = getParenStart(which)) >= 0)
687         {
688             return search.substring(start, getParenEnd(which));
689         }
690         return null;
691     }
692
693     /**
694      * Returns the start index of a given paren level.
695      * @param which Nesting level of subexpression
696      * @return String index
697      */

698     public final int getParenStart(int which)
699     {
700         if (which < parenCount)
701         {
702             switch (which)
703             {
704                 case 0:
705                     return start0;
706                     
707                 case 1:
708                     return start1;
709                     
710                 case 2:
711                     return start2;
712                     
713                 default:
714                     if (startn == null)
715                     {
716                         allocParens();
717                     }
718                     return startn[which];
719             }
720         }
721         return -1;
722     }
723
724     /**
725      * Returns the end index of a given paren level.
726      * @param which Nesting level of subexpression
727      * @return String index
728      */

729     public final int getParenEnd(int which)
730     {
731         if (which < parenCount)
732         {
733             switch (which)
734             {
735                 case 0:
736                     return end0;
737                     
738                 case 1:
739                     return end1;
740                     
741                 case 2:
742                     return end2;
743                     
744                 default:
745                     if (endn == null)
746                     {
747                         allocParens();
748                     }
749                     return endn[which];
750             }
751         }
752         return -1;
753     }
754
755     /**
756      * Returns the length of a given paren level.
757      * @param which Nesting level of subexpression
758      * @return Number of characters in the parenthesized subexpression
759      */

760     public final int getParenLength(int which)
761     {
762         if (which < parenCount)
763         {
764             return getParenEnd(which) - getParenStart(which);
765         }
766         return -1;
767     }
768
769     /**
770      * Sets the start of a paren level
771      * @param which Which paren level
772      * @param i Index in input array
773      */

774     protected final void setParenStart(int which, int i)
775     {
776         if (which < parenCount)
777         {
778             switch (which)
779             {
780                 case 0:
781                     start0 = i;
782                     break;
783                     
784                 case 1:
785                     start1 = i;
786                     break;
787                     
788                 case 2:
789                     start2 = i;
790                     break;
791                     
792                 default:
793                     if (startn == null)
794                     {
795                         allocParens();
796                     }
797                     startn[which] = i;
798                     break;
799             }
800         }
801     }
802
803     /**
804      * Sets the end of a paren level
805      * @param which Which paren level
806      * @param i Index in input array
807      */

808     protected final void setParenEnd(int which, int i)
809     {
810         if (which < parenCount)
811         {
812             switch (which)
813             {
814                 case 0:
815                     end0 = i;
816                     break;
817                     
818                 case 1:
819                     end1 = i;
820                     break;
821                     
822                 case 2:
823                     end2 = i;
824                     break;
825                     
826                 default:
827                     if (endn == null)
828                     {
829                         allocParens();
830                     }
831                     endn[which] = i;
832                     break;
833             }
834         }
835     }
836
837     /**
838      * Throws an Error representing an internal error condition probably resulting
839      * from a bug in the regular expression compiler (or possibly data corruption).
840      * In practice, this should be very rare.
841      * @param s Error description
842      */

843     protected void internalError(String JavaDoc s) throws Error JavaDoc
844     {
845         throw new Error JavaDoc("RE internal error: " + s);
846     }
847
848     /**
849      * Performs lazy allocation of subexpression arrays
850      */

851     private final void allocParens()
852     {
853         // Allocate arrays for subexpressions
854
startn = new int[maxParen];
855         endn = new int[maxParen];
856
857         // Set sub-expression pointers to invalid values
858
for (int i = 0; i < maxParen; i++)
859         {
860             startn[i] = -1;
861             endn[i] = -1;
862         }
863     }
864
865     /**
866      * Try to match a string against a subset of nodes in the program
867      * @param firstNode Node to start at in program
868      * @param lastNode Last valid node (used for matching a subexpression without
869      * matching the rest of the program as well).
870      * @param idxStart Starting position in character array
871      * @return Final input array index if match succeeded. -1 if not.
872      */

873     protected int matchNodes(int firstNode, int lastNode, int idxStart)
874     {
875         // Our current place in the string
876
int idx = idxStart;
877
878         // Loop while node is valid
879
int next, opcode, opdata;
880         int idxNew;
881         char[] instruction = program.instruction;
882         for (int node = firstNode; node < lastNode; )
883         {
884             opcode = instruction[node + offsetOpcode];
885             next = node + (short)instruction[node + offsetNext];
886             opdata = instruction[node + offsetOpdata];
887
888             switch (opcode)
889             {
890                 case OP_RELUCTANTMAYBE:
891                     {
892                         int once = 0;
893                         do
894                         {
895                             // Try to match the rest without using the reluctant subexpr
896
if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
897                             {
898                                 return idxNew;
899                             }
900                         }
901                         while ((once++ == 0) && (idx = matchNodes(node + nodeSize, next, idx)) != -1);
902                         return -1;
903                     }
904
905                 case OP_RELUCTANTPLUS:
906                     while ((idx = matchNodes(node + nodeSize, next, idx)) != -1)
907                     {
908                         // Try to match the rest without using the reluctant subexpr
909
if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
910                         {
911                             return idxNew;
912                         }
913                     }
914                     return -1;
915
916                 case OP_RELUCTANTSTAR:
917                     do
918                     {
919                         // Try to match the rest without using the reluctant subexpr
920
if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
921                         {
922                             return idxNew;
923                         }
924                     }
925                     while ((idx = matchNodes(node + nodeSize, next, idx)) != -1);
926                     return -1;
927
928                 case OP_OPEN:
929
930                     // Match subexpression
931
if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
932                     {
933                         startBackref[opdata] = idx;
934                     }
935                     if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
936                     {
937                         // Increase valid paren count
938
if ((opdata + 1) > parenCount)
939                         {
940                             parenCount = opdata + 1;
941                         }
942
943                         // Don't set paren if already set later on
944
if (getParenStart(opdata) == -1)
945                         {
946                             setParenStart(opdata, idx);
947                         }
948                     }
949                     return idxNew;
950
951                 case OP_CLOSE:
952
953                     // Done matching subexpression
954
if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
955                     {
956                         endBackref[opdata] = idx;
957                     }
958                     if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
959                     {
960                         // Increase valid paren count
961
if ((opdata + 1) > parenCount)
962                         {
963                             parenCount = opdata + 1;
964                         }
965
966                         // Don't set paren if already set later on
967
if (getParenEnd(opdata) == -1)
968                         {
969                             setParenEnd(opdata, idx);
970                         }
971                     }
972                     return idxNew;
973
974                 case OP_OPEN_CLUSTER:
975                 case OP_CLOSE_CLUSTER:
976                     // starting or ending the matching of a subexpression which has no backref.
977
return matchNodes( next, maxNode, idx );
978
979                 case OP_BACKREF:
980                     {
981                         // Get the start and end of the backref
982
int s = startBackref[opdata];
983                         int e = endBackref[opdata];
984
985                         // We don't know the backref yet
986
if (s == -1 || e == -1)
987                         {
988                             return -1;
989                         }
990
991                         // The backref is empty size
992
if (s == e)
993                         {
994                             break;
995                         }
996
997                         // Get the length of the backref
998
int l = e - s;
999
1000                        // If there's not enough input left, give up.
1001
if (search.isEnd(idx + l - 1))
1002                        {
1003                            return -1;
1004                        }
1005
1006                        // Case fold the backref?
1007
if ((matchFlags & MATCH_CASEINDEPENDENT) != 0)
1008                        {
1009                            // Compare backref to input, case-folding as we go
1010
for (int i = 0; i < l; i++)
1011                            {
1012                                if (Character.toLowerCase(search.charAt(idx++)) != Character.toLowerCase(search.charAt(s + i)))
1013                                {
1014                                    return -1;
1015                                }
1016                            }
1017                        }
1018                        else
1019                        {
1020                            // Compare backref to input
1021
for (int i = 0; i < l; i++)
1022                            {
1023                                if (search.charAt(idx++) != search.charAt(s + i))
1024                                {
1025                                    return -1;
1026                                }
1027                            }
1028                        }
1029                    }
1030                    break;
1031
1032                case OP_BOL:
1033
1034                    // Fail if we're not at the start of the string
1035
if (idx != 0)
1036                    {
1037                        // If we're multiline matching, we could still be at the start of a line
1038
if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
1039                        {
1040                            // If not at start of line, give up
1041
if (idx <= 0 || !isNewline(idx - 1)) {
1042                                return -1;
1043                            } else {
1044                                break;
1045                            }
1046                        }
1047                        return -1;
1048                    }
1049                    break;
1050
1051                case OP_EOL:
1052
1053                    // If we're not at the end of string
1054
if (!search.isEnd(0) && !search.isEnd(idx))
1055                    {
1056                        // If we're multi-line matching
1057
if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
1058                        {
1059                            // Give up if we're not at the end of a line
1060
if (! isNewline(idx)) {
1061                                return -1;
1062                            } else {
1063                                break;
1064                            }
1065                        }
1066                        return -1;
1067                    }
1068                    break;
1069
1070                case OP_ESCAPE:
1071
1072                    // Which escape?
1073
switch (opdata)
1074                    {
1075                        // Word boundary match
1076
case E_NBOUND:
1077                        case E_BOUND:
1078                            {
1079                                char cLast = ((idx == 0) ? '\n' : search.charAt(idx - 1));
1080                                char cNext = ((search.isEnd(idx)) ? '\n' : search.charAt(idx));
1081                                if ((Character.isLetterOrDigit(cLast) == Character.isLetterOrDigit(cNext)) == (opdata == E_BOUND))
1082                                {
1083                                    return -1;
1084                                }
1085                            }
1086                            break;
1087
1088                        // Alpha-numeric, digit, space, javaLetter, javaLetterOrDigit
1089
case E_ALNUM:
1090                        case E_NALNUM:
1091                        case E_DIGIT:
1092                        case E_NDIGIT:
1093                        case E_SPACE:
1094                        case E_NSPACE:
1095
1096                            // Give up if out of input
1097
if (search.isEnd(idx))
1098                            {
1099                                return -1;
1100                            }
1101
1102                            char c = search.charAt(idx);
1103
1104                            // Switch on escape
1105
switch (opdata)
1106                            {
1107                                case E_ALNUM:
1108                                case E_NALNUM:
1109                                    if (!((Character.isLetterOrDigit(c) || c == '_') == (opdata == E_ALNUM)))
1110                                    {
1111                                        return -1;
1112                                    }
1113                                    break;
1114
1115                                case E_DIGIT:
1116                                case E_NDIGIT:
1117                                    if (!(Character.isDigit(c) == (opdata == E_DIGIT)))
1118                                    {
1119                                        return -1;
1120                                    }
1121                                    break;
1122
1123                                case E_SPACE:
1124                                case E_NSPACE:
1125                                    if (!(Character.isWhitespace(c) == (opdata == E_SPACE)))
1126                                    {
1127                                        return -1;
1128                                    }
1129                                    break;
1130                            }
1131                            idx++;
1132                            break;
1133
1134                        default:
1135                            internalError("Unrecognized escape '" + opdata + "'");
1136                    }
1137                    break;
1138
1139                case OP_ANY:
1140
1141                    if((matchFlags & MATCH_SINGLELINE) == MATCH_SINGLELINE) {
1142                        // Match anything
1143
if(search.isEnd(idx))
1144                        {
1145                            return -1;
1146                        }
1147                        idx++;
1148                        break;
1149                    }
1150                    else
1151                    {
1152                        // Match anything but a newline
1153
if (search.isEnd(idx) || search.charAt(idx++) == '\n')
1154                        {
1155                            return -1;
1156                        }
1157                        break;
1158                    }
1159
1160                case OP_ATOM:
1161                    {
1162                        // Match an atom value
1163
if (search.isEnd(idx))
1164                        {
1165                            return -1;
1166                        }
1167
1168                        // Get length of atom and starting index
1169
int lenAtom = opdata;
1170                        int startAtom = node + nodeSize;
1171
1172                        // Give up if not enough input remains to have a match
1173
if (search.isEnd(lenAtom + idx - 1))
1174                        {
1175                            return -1;
1176                        }
1177
1178                        // Match atom differently depending on casefolding flag
1179
if ((matchFlags & MATCH_CASEINDEPENDENT) != 0)
1180                        {
1181                            for (int i = 0; i < lenAtom; i++)
1182                            {
1183                                if (Character.toLowerCase(search.charAt(idx++)) != Character.toLowerCase(instruction[startAtom + i]))
1184                                {
1185                                    return -1;
1186                                }
1187                            }
1188                        }
1189                        else
1190                        {
1191                            for (int i = 0; i < lenAtom; i++)
1192                            {
1193                                if (search.charAt(idx++) != instruction[startAtom + i])
1194                                {
1195                                    return -1;
1196                                }
1197                            }
1198                        }
1199                    }
1200                    break;
1201
1202                case OP_POSIXCLASS:
1203                    {
1204                        // Out of input?
1205
if (search.isEnd(idx))
1206                        {
1207                            return -1;
1208                        }
1209                        
1210                        switch (opdata)
1211                        {
1212                            case POSIX_CLASS_ALNUM:
1213                                if (!Character.isLetterOrDigit(search.charAt(idx)))
1214                                {
1215                                    return -1;
1216                                }
1217                                break;
1218                                
1219                            case POSIX_CLASS_ALPHA:
1220                                if (!Character.isLetter(search.charAt(idx)))
1221                                {
1222                                    return -1;
1223                                }
1224                                break;
1225                                
1226                            case POSIX_CLASS_DIGIT:
1227                                if (!Character.isDigit(search.charAt(idx)))
1228                                {
1229                                    return -1;
1230                                }
1231                                break;
1232                                
1233                            case POSIX_CLASS_BLANK: // JWL - bugbug: is this right??
1234
if (!Character.isSpaceChar(search.charAt(idx)))
1235                                {
1236                                    return -1;
1237                                }
1238                                break;
1239                                
1240                            case POSIX_CLASS_SPACE:
1241                                if (!Character.isWhitespace(search.charAt(idx)))
1242                                {
1243                                    return -1;
1244                                }
1245                                break;
1246                                
1247                            case POSIX_CLASS_CNTRL:
1248                                if (Character.getType(search.charAt(idx)) != Character.CONTROL)
1249                                {
1250                                    return -1;
1251                                }
1252                                break;
1253                                
1254                            case POSIX_CLASS_GRAPH: // JWL - bugbug???
1255
switch (Character.getType(search.charAt(idx)))
1256                                {
1257                                    case Character.MATH_SYMBOL:
1258                                    case Character.CURRENCY_SYMBOL:
1259                                    case Character.MODIFIER_SYMBOL:
1260                                    case Character.OTHER_SYMBOL:
1261                                        break;
1262                                        
1263                                    default:
1264                                        return -1;
1265                                }
1266                                break;
1267                                
1268                            case POSIX_CLASS_LOWER:
1269                                if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER)
1270                                {
1271                                    return -1;
1272                                }
1273                                break;
1274                                
1275                            case POSIX_CLASS_UPPER:
1276                                if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER)
1277                                {
1278                                    return -1;
1279                                }
1280                                break;
1281                                
1282                            case POSIX_CLASS_PRINT:
1283                                if (Character.getType(search.charAt(idx)) == Character.CONTROL)
1284                                {
1285                                    return -1;
1286                                }
1287                                break;
1288                                
1289                            case POSIX_CLASS_PUNCT:
1290                            {
1291                                int type = Character.getType(search.charAt(idx));
1292                                switch(type)
1293                                {
1294                                    case Character.DASH_PUNCTUATION:
1295                                    case Character.START_PUNCTUATION:
1296                                    case Character.END_PUNCTUATION:
1297                                    case Character.CONNECTOR_PUNCTUATION:
1298                                    case Character.OTHER_PUNCTUATION:
1299                                        break;
1300                                        
1301                                    default:
1302                                        return -1;
1303                                }
1304                            }
1305                            break;
1306
1307                            case POSIX_CLASS_XDIGIT: // JWL - bugbug??
1308
{
1309                                boolean isXDigit = ((search.charAt(idx) >= '0' && search.charAt(idx) <= '9') ||
1310                                                    (search.charAt(idx) >= 'a' && search.charAt(idx) <= 'f') ||
1311                                                    (search.charAt(idx) >= 'A' && search.charAt(idx) <= 'F'));
1312                                if (!isXDigit)
1313                                {
1314                                    return -1;
1315                                }
1316                            }
1317                            break;
1318                            
1319                            case POSIX_CLASS_JSTART:
1320                                if (!Character.isJavaIdentifierStart(search.charAt(idx)))
1321                                {
1322                                    return -1;
1323                                }
1324                                break;
1325                                
1326                            case POSIX_CLASS_JPART:
1327                                if (!Character.isJavaIdentifierPart(search.charAt(idx)))
1328                                {
1329                                    return -1;
1330                                }
1331                                break;
1332
1333                            default:
1334                                internalError("Bad posix class");
1335                                break;
1336                        }
1337                    
1338                        // Matched.
1339
idx++;
1340                    }
1341                    break;
1342
1343                case OP_ANYOF:
1344                    {
1345                        // Out of input?
1346
if (search.isEnd(idx))
1347                        {
1348                            return -1;
1349                        }
1350
1351                        // Get character to match against character class and maybe casefold
1352
char c = search.charAt(idx);
1353                        boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
1354                        if (caseFold)
1355                        {
1356                            c = Character.toLowerCase(c);
1357                        }
1358
1359                        // Loop through character class checking our match character
1360
int idxRange = node + nodeSize;
1361                        int idxEnd = idxRange + (opdata * 2);
1362                        boolean match = false;
1363                        for (int i = idxRange; i < idxEnd; )
1364                        {
1365                            // Get start, end and match characters
1366
char s = instruction[i++];
1367                            char e = instruction[i++];
1368
1369                            // Fold ends of range and match character
1370
if (caseFold)
1371                            {
1372                                s = Character.toLowerCase(s);
1373                                e = Character.toLowerCase(e);
1374                            }
1375
1376                            // If the match character is in range, break out
1377
if (c >= s && c <= e)
1378                            {
1379                                match = true;
1380                                break;
1381                            }
1382                        }
1383
1384                        // Fail if we didn't match the character class
1385
if (!match)
1386                        {
1387                            return -1;
1388                        }
1389                        idx++;
1390                    }
1391                    break;
1392
1393                case OP_BRANCH:
1394                {
1395                    // Check for choices
1396
if (instruction[next + offsetOpcode] != OP_BRANCH)
1397                    {
1398                        // If there aren't any other choices, just evaluate this branch.
1399
node += nodeSize;
1400                        continue;
1401                    }
1402
1403                    // Try all available branches
1404
short nextBranch;
1405                    do
1406                    {
1407                        // Try matching the branch against the string
1408
if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1)
1409                        {
1410                            return idxNew;
1411                        }
1412                        
1413                        // Go to next branch (if any)
1414
nextBranch = (short)instruction[node + offsetNext];
1415                        node += nextBranch;
1416                    }
1417                    while (nextBranch != 0 && (instruction[node + offsetOpcode] == OP_BRANCH));
1418
1419                    // Failed to match any branch!
1420
return -1;
1421                }
1422
1423                case OP_NOTHING:
1424                case OP_GOTO:
1425
1426                    // Just advance to the next node without doing anything
1427
break;
1428
1429                case OP_END:
1430
1431                    // Match has succeeded!
1432
setParenEnd(0, idx);
1433                    return idx;
1434
1435                default:
1436
1437                    // Corrupt program
1438
internalError("Invalid opcode '" + opcode + "'");
1439            }
1440
1441            // Advance to the next node in the program
1442
node = next;
1443        }
1444
1445        // We "should" never end up here
1446
internalError("Corrupt program");
1447        return -1;
1448    }
1449
1450    /**
1451     * Match the current regular expression program against the current
1452     * input string, starting at index i of the input string. This method
1453     * is only meant for internal use.
1454     * @param i The input string index to start matching at
1455     * @return True if the input matched the expression
1456     */

1457    protected boolean matchAt(int i)
1458    {
1459        // Initialize start pointer, paren cache and paren count
1460
start0 = -1;
1461        end0 = -1;
1462        start1 = -1;
1463        end1 = -1;
1464        start2 = -1;
1465        end2 = -1;
1466        startn = null;
1467        endn = null;
1468        parenCount = 1;
1469        setParenStart(0, i);
1470
1471        // Allocate backref arrays (unless optimizations indicate otherwise)
1472
if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
1473        {
1474            startBackref = new int[maxParen];
1475            endBackref = new int[maxParen];
1476        }
1477
1478        // Match against string
1479
int idx;
1480        if ((idx = matchNodes(0, maxNode, i)) != -1)
1481        {
1482            setParenEnd(0, idx);
1483            return true;
1484        }
1485
1486        // Didn't match
1487
parenCount = 0;
1488        return false;
1489    }
1490
1491    /**
1492     * Matches the current regular expression program against a character array,
1493     * starting at a given index.
1494     * @param search String to match against
1495     * @param i Index to start searching at
1496     * @return True if string matched
1497     */

1498    public boolean match(String JavaDoc search, int i)
1499    {
1500        return match(new StringCharacterIterator(search), i);
1501    }
1502
1503    /**
1504     * Matches the current regular expression program against a character array,
1505     * starting at a given index.
1506     * @param search String to match against
1507     * @param i Index to start searching at
1508     * @return True if string matched
1509     */

1510    public boolean match(CharacterIterator search, int i)
1511    {
1512        // There is no compiled program to search with!
1513
if (program == null)
1514        {
1515            // This should be uncommon enough to be an error case rather
1516
// than an exception (which would have to be handled everywhere)
1517
internalError("No RE program to run!");
1518        }
1519
1520        // Save string to search
1521
this.search = search;
1522
1523        // Can we optimize the search by looking for a prefix string?
1524
if (program.prefix == null)
1525        {
1526            // Unprefixed matching must try for a match at each character
1527
for ( ;! search.isEnd(i - 1); i++)
1528            {
1529                // Try a match at index i
1530
if (matchAt(i))
1531                {
1532                    return true;
1533                }
1534            }
1535            return false;
1536        }
1537        else
1538        {
1539            // Prefix-anchored matching is possible
1540
boolean caseIndependent = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
1541            char[] prefix = program.prefix;
1542            for ( ;! search.isEnd(i + prefix.length - 1); i++)
1543            {
1544                // If the first character of the prefix matches
1545
boolean match = false;
1546                if (caseIndependent)
1547                    match = Character.toLowerCase(search.charAt(i)) == Character.toLowerCase(prefix[0]);
1548                else
1549                    match = search.charAt(i) == prefix[0];
1550                if (match)
1551                {
1552                    // Save first character position
1553
int firstChar = i++;
1554                    int k;
1555                    for (k = 1; k < prefix.length; )
1556                    {
1557                        // If there's a mismatch of any character in the prefix, give up
1558
if (caseIndependent)
1559                            match = Character.toLowerCase(search.charAt(i++)) == Character.toLowerCase(prefix[k++]);
1560                        else
1561                            match = search.charAt(i++) == prefix[k++];
1562                        if (!match)
1563                        {
1564                            break;
1565                        }
1566                    }
1567
1568                    // See if the whole prefix string matched
1569
if (k == prefix.length)
1570                    {
1571                        // We matched the full prefix at firstChar, so try it
1572
if (matchAt(firstChar))
1573                        {
1574                            return true;
1575                        }
1576                    }
1577
1578                    // Match failed, reset i to continue the search
1579
i = firstChar;
1580                }
1581            }
1582            return false;
1583        }
1584    }
1585
1586    /**
1587     * Matches the current regular expression program against a String.
1588     * @param search String to match against
1589     * @return True if string matched
1590     */

1591    public boolean match(String JavaDoc search)
1592    {
1593        return match(search, 0);
1594    }
1595
1596    /**
1597     * Splits a string into an array of strings on regular expression boundaries.
1598     * This function works the same way as the Perl function of the same name.
1599     * Given a regular expression of "[ab]+" and a string to split of
1600     * "xyzzyababbayyzabbbab123", the result would be the array of Strings
1601     * "[xyzzy, yyz, 123]".
1602     * @param s String to split on this regular exression
1603     * @return Array of strings
1604     */

1605    public String JavaDoc[] split(String JavaDoc s)
1606    {
1607        // Create new vector
1608
Vector JavaDoc v = new Vector JavaDoc();
1609
1610        // Start at position 0 and search the whole string
1611
int pos = 0;
1612        int len = s.length();
1613
1614        // Try a match at each position
1615
while (pos < len && match(s, pos))
1616        {
1617            // Get start of match
1618
int start = getParenStart(0);
1619
1620            // Get end of match
1621
int newpos = getParenEnd(0);
1622
1623            // Check if no progress was made
1624
if (newpos == pos)
1625            {
1626                v.addElement(s.substring(pos, start + 1));
1627                newpos++;
1628            }
1629            else
1630            {
1631                v.addElement(s.substring(pos, start));
1632            }
1633
1634            // Move to new position
1635
pos = newpos;
1636        }
1637
1638        // Push remainder if it's not empty
1639
String JavaDoc remainder = s.substring(pos);
1640        if (remainder.length() != 0)
1641        {
1642            v.addElement(remainder);
1643        }
1644
1645        // Return vector as an array of strings
1646
String JavaDoc[] ret = new String JavaDoc[v.size()];
1647        v.copyInto(ret);
1648        return ret;
1649    }
1650
1651    /**
1652     * Flag bit that indicates that subst should replace all occurrences of this
1653     * regular expression.
1654     */

1655    public static final int REPLACE_ALL = 0x0000;
1656
1657    /**
1658     * Flag bit that indicates that subst should only replace the first occurrence
1659     * of this regular expression.
1660     */

1661    public static final int REPLACE_FIRSTONLY = 0x0001;
1662
1663    /**
1664     * Flag bit that indicates that subst should replace backreferences
1665     */

1666    public static final int REPLACE_BACKREFERENCES = 0x0002;
1667
1668    /**
1669     * Substitutes a string for this regular expression in another string.
1670     * This method works like the Perl function of the same name.
1671     * Given a regular expression of "a*b", a String to substituteIn of
1672     * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
1673     * resulting String returned by subst would be "-foo-garply-wacky-".
1674     *
1675     * @param substituteIn String to substitute within
1676     * @param substitution String to substitute for all matches of this regular expression.
1677     * @return The string substituteIn with zero or more occurrences of the current
1678     * regular expression replaced with the substitution String (if this regular
1679     * expression object doesn't match at any position, the original String is returned
1680     * unchanged).
1681     */

1682    public String JavaDoc subst(String JavaDoc substituteIn, String JavaDoc substitution)
1683    {
1684        return subst(substituteIn, substitution, REPLACE_ALL);
1685    }
1686
1687    /**
1688     * Substitutes a string for this regular expression in another string.
1689     * This method works like the Perl function of the same name.
1690     * Given a regular expression of "a*b", a String to substituteIn of
1691     * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
1692     * resulting String returned by subst would be "-foo-garply-wacky-".
1693     * <p>
1694     * It is also possible to reference the contents of a parenthesized expression
1695     * with $0, $1, ... $9. A regular expression of "http://[\\.\\w\\-\\?/~_@&=%]+",
1696     * a String to substituteIn of "visit us: http://www.apache.org!" and the
1697     * substitution String "&lt;a HREF=\"$0\"&gt;$0&lt;/a&gt;", the resulting String
1698     * returned by subst would be
1699     * "visit us: &lt;a HREF=\"http://www.apache.org\"&gt;http://www.apache.org&lt;/a&gt;!".
1700     * <p>
1701     * <i>Note:</i> $0 represents the whole match.
1702     *
1703     * @param substituteIn String to substitute within
1704     * @param substitution String to substitute for matches of this regular expression
1705     * @param flags One or more bitwise flags from REPLACE_*. If the REPLACE_FIRSTONLY
1706     * flag bit is set, only the first occurrence of this regular expression is replaced.
1707     * If the bit is not set (REPLACE_ALL), all occurrences of this pattern will be
1708     * replaced. If the flag REPLACE_BACKREFERENCES is set, all backreferences will
1709     * be processed.
1710     * @return The string substituteIn with zero or more occurrences of the current
1711     * regular expression replaced with the substitution String (if this regular
1712     * expression object doesn't match at any position, the original String is returned
1713     * unchanged).
1714     */

1715    public String JavaDoc subst(String JavaDoc substituteIn, String JavaDoc substitution, int flags)
1716    {
1717        // String to return
1718
StringBuffer JavaDoc ret = new StringBuffer JavaDoc();
1719
1720        // Start at position 0 and search the whole string
1721
int pos = 0;
1722        int len = substituteIn.length();
1723
1724        // Try a match at each position
1725
while (pos < len && match(substituteIn, pos))
1726        {
1727            // Append string before match
1728
ret.append(substituteIn.substring(pos, getParenStart(0)));
1729
1730            if ((flags & REPLACE_BACKREFERENCES) != 0)
1731            {
1732                // Process backreferences
1733
int lCurrentPosition = 0;
1734                int lLastPosition = 0;
1735                int lLength = substitution.length();
1736
1737                while ((lCurrentPosition = substitution.indexOf("$", lCurrentPosition)) >= 0)
1738                {
1739                    if ((lCurrentPosition == 0 || substitution.charAt(lCurrentPosition - 1) != '\\')
1740                        && lCurrentPosition+1 < lLength)
1741                    {
1742                        char c = substitution.charAt(lCurrentPosition + 1);
1743                        if (c >= '0' && c <= '9')
1744                        {
1745                            // Append everything between the last and the current $ sign
1746
ret.append(substitution.substring(lLastPosition+2, lCurrentPosition));
1747
1748                            // Append the parenthesized expression
1749
// Note: if a parenthesized expression of the requested
1750
// index is not available "null" is added to the string
1751
ret.append(getParen(c - '0'));
1752                            lLastPosition = lCurrentPosition;
1753                        }
1754                    }
1755
1756                    // Move forward, skipping past match
1757
lCurrentPosition++;
1758                }
1759
1760                // Append everything after the last $ sign
1761
ret.append(substitution.substring(lLastPosition+2,lLength));
1762            }
1763            else
1764            {
1765                // Append substitution without processing backreferences
1766
ret.append(substitution);
1767            }
1768
1769            // Move forward, skipping past match
1770
int newpos = getParenEnd(0);
1771
1772            // We always want to make progress!
1773
if (newpos == pos)
1774            {
1775                newpos++;
1776            }
1777
1778            // Try new position
1779
pos = newpos;
1780
1781            // Break out if we're only supposed to replace one occurrence
1782
if ((flags & REPLACE_FIRSTONLY) != 0)
1783            {
1784                break;
1785            }
1786        }
1787
1788        // If there's remaining input, append it
1789
if (pos < len)
1790        {
1791            ret.append(substituteIn.substring(pos));
1792        }
1793
1794        // Return string buffer as string
1795
return ret.toString();
1796    }
1797
1798    /**
1799     * Returns an array of Strings, whose toString representation matches a regular
1800     * expression. This method works like the Perl function of the same name. Given
1801     * a regular expression of "a*b" and an array of String objects of [foo, aab, zzz,
1802     * aaaab], the array of Strings returned by grep would be [aab, aaaab].
1803     * @param search Array of Objects to search
1804     * @return Array of Strings whose toString() value matches this regular expression.
1805     */

1806    public String JavaDoc[] grep(Object JavaDoc[] search)
1807    {
1808        // Create new vector to hold return items
1809
Vector JavaDoc v = new Vector JavaDoc();
1810
1811        // Traverse array of objects
1812
for (int i = 0; i < search.length; i++)
1813        {
1814            // Get next object as a string
1815
String JavaDoc s = search[i].toString();
1816
1817            // If it matches this regexp, add it to the list
1818
if (match(s))
1819            {
1820                v.addElement(s);
1821            }
1822        }
1823
1824        // Return vector as an array of strings
1825
String JavaDoc[] ret = new String JavaDoc[v.size()];
1826        v.copyInto(ret);
1827        return ret;
1828    }
1829
1830    /** @return true if at the i-th position in the 'search' a newline ends */
1831    private boolean isNewline(int i) {
1832
1833        if (i < NEWLINE.length() - 1) {
1834            return false;
1835        }
1836
1837        if (search.charAt(i) == '\n') {
1838            return true;
1839        }
1840
1841        for (int j = NEWLINE.length() - 1; j >= 0; j--, i--) {
1842            if (NEWLINE.charAt(j) != search.charAt(i)) {
1843                return false;
1844            }
1845        }
1846        return true;
1847    }
1848}
1849
Popular Tags