Perl5Util


1   package org.apache.oro.text.perl;
2   
3   /* ====================================================================
4    * The Apache Software License, Version 1.1
5    *
6    * Copyright (c) 2000 The Apache Software Foundation.  All rights
7    * reserved.
8    *
9    * Redistribution and use in source and binary forms, with or without
10   * modification, are permitted provided that the following conditions
11   * are met:
12   *
13   * 1. Redistributions of source code must retain the above copyright
14   *    notice, this list of conditions and the following disclaimer.
15   *
16   * 2. Redistributions in binary form must reproduce the above copyright
17   *    notice, this list of conditions and the following disclaimer in
18   *    the documentation and/or other materials provided with the
19   *    distribution.
20   *
21   * 3. The end-user documentation included with the redistribution,
22   *    if any, must include the following acknowledgment:
23   *       "This product includes software developed by the
24   *        Apache Software Foundation (http://www.apache.org/)."
25   *    Alternately, this acknowledgment may appear in the software itself,
26   *    if and wherever such third-party acknowledgments normally appear.
27   *
28   * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro" 
29   *    must not be used to endorse or promote products derived from this
30   *    software without prior written permission. For written
31   *    permission, please contact apache@apache.org.
32   *
33   * 5. Products derived from this software may not be called "Apache" 
34   *    or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their 
35   *    name, without prior written permission of the Apache Software Foundation.
36   *
37   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
38   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
39   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
40   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
41   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
44   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
46   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
47   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48   * SUCH DAMAGE.
49   * ====================================================================
50   *
51   * This software consists of voluntary contributions made by many
52   * individuals on behalf of the Apache Software Foundation.  For more
53   * information on the Apache Software Foundation, please see
54   * <http://www.apache.org/>.
55   *
56   * Portions of this software are based upon software originally written 
57   * by Daniel F. Savarese. We appreciate his contributions.
58   */
59  
60  import java.util.*;
61  import org.apache.oro.text.regex.*;
62  import org.apache.oro.text.*;
63  import org.apache.oro.util.*;
64  
65  /**
66   * This is a utility class implementing the 3 most common Perl5 operations
67   * involving regular expressions:
68   * <ul>
69   * <li> [m]/pattern/[i][m][s][x],
70   * <li> s/pattern/replacement/[g][i][m][o][s][x],
71   * <li> and split().
72   * </ul>
73   * As with Perl, any non-alphanumeric character can be used in lieu of
74   * the slashes.
75   *  <p>
76   * The objective of the class is to minimize the amount of code a Java
77   * programmer using OROMatcher<font size="-2"><sup>TM</sup></font>
78   * has to write to achieve the same results as Perl by 
79   * transparently handling regular expression compilation, caching, and
80   * matching.  A second objective is to use the same Perl pattern matching
81   * syntax to ease the task of Perl programmers transitioning to Java
82   * (this also reduces the number of parameters to a method).
83   * All the state affecting methods are synchronized to avoid
84   * the maintenance of explicit locks in multithreaded programs.  This
85   * philosophy differs from the
86   * OROMatcher<font size="-2"><sup>TM</sup></font> package, where
87   * you are expected to either maintain explicit locks, or more preferably
88   * create separate compiler and matcher instances for each thread.
89   * <p>
90   * To use this class, first create an instance using the default constructor
91   * or initialize the instance with a PatternCache of your choosing using
92   * the alternate constructor.  The default cache used by Perl5Util is a
93   * PatternCacheLRU of capacity GenericPatternCache.DEFAULT_CAPACITY.  You may
94   * want to create a cache with a different capacity, a different
95   * cache replacement policy, or even devise your own PatternCache
96   * implementation.  The PatternCacheLRU is probably the best general purpose
97   * pattern cache, but your specific application may be better served by
98   * a different cache replacement policy.  You should remember that you can
99   * front-load a cache with all the patterns you will be using before
100  * initializing a Perl5Util instance, or you can just let Perl5Util
101  * fill the cache as you use it.
102  * <p>
103  * You might use the class as follows:
104  * <pre>
105  * Perl5Util util = new Perl5Util();
106  * String line;
107  * DataInputStream input;
108  * PrintStream output;
109  * 
110  * // Initialization of input and output omitted
111  * while((line = input.readLine()) != null) {
112  *     // First find the line with the string we want to substitute because
113  *     // it is cheaper than blindly substituting each line.
114  *     if(util.match("/HREF=\"description1.html\"/", line)) {
115  *        line = util.substitute("s/description1\\.html/about1.html/", line);
116  *     }
117  *    output.println(line);
118  * }
119  * </pre>
120  * <p>
121  * A couple of things to remember when using this class are that the
122  * {@link #match match()} methods have the same meaning as
123  * contains() in OROMatcher<font size="-2"><sup>TM</sup></font>
124  * and <code>=~ m/pattern/</code> in Perl.  The methods are named match
125  * to more closely associate them with Perl and to differentiate them
126  * from matches() in OROMatcher<font size="-2"><sup>TM</sup></font>.
127  * A further thing to keep in mind is that the
128  * {@link MalformedPerl5PatternException} class is derived from
129  * RuntimeException which means you DON'T have to catch it.  The reasoning
130  * behind this is that you will detect your regular expression mistakes
131  * as you write and debug your program when a MalformedPerl5PatternException
132  * is thrown during a test run.  However, we STRONGLY recommend that you
133  * ALWAYS catch MalformedPerl5PatternException whenever you deal with a
134  * DYNAMICALLY created pattern.  Relying on a fatal
135  * MalformedPerl5PatternException being thrown to detect errors while
136  * debugging is only useful for dealing with static patterns, that is, actual
137  * pregenerated strings present in your program.  Patterns created from user
138  * input or some other dynamic method CANNOT be relied upon to be correct
139  * and MUST be handled by catching MalformedPerl5PatternException for your
140  * programs to be robust.
141  * <p>
142  * Finally, as a convenience Perl5Util implements 
143  * the org.apache.oro.text.regex.MatchResult interface found in the
144  * OROMatcher<font size="-2"><sup>TM</sup></font> package.  The methods
145  * are merely wrappers which call the corresponding method of the last
146  * MatchResult found (which can be accessed with
147  * {@link #getMatch()}) by a match or substitution
148  * (or even a split, but this isn't particularly useful).
149 
150  @author <a HREF="dfs@savarese.org">Daniel F. Savarese</a>
151  @version $Id: Perl5Util.java,v 1.1.1.1 2000/07/23 23:08:50 jon Exp $
152 
153  * @see MalformedPerl5PatternException
154  * @see org.apache.oro.text.PatternCache
155  * @see org.apache.oro.text.PatternCacheLRU
156  * @see org.apache.oro.text.regex.MatchResult
157  */
158 public final class Perl5Util implements MatchResult {
159   /** The regular expression to use to parse match expression. */
160   private static final String   __matchExpression = "m?(\\W)(.*)\\1([imsx]*)";
161 
162   /** The pattern cache to compile and store patterns */
163   private PatternCache __patternCache;
164   /** The hashtable to cache higher-level expressions */
165   private Cache __expressionCache;
166   /** The pattern matcher to perform matching operations. */
167   private Perl5Matcher __matcher = new Perl5Matcher();
168   /** The compiled match expression parsing regular expression. */
169   private Pattern __matchPattern;
170   /** The last match from a successful call to a matching method. */
171   private MatchResult __lastMatch;
172 
173   /**
174    * Keeps track of the original input (for postMatch() and preMatch())
175    * methods.  This will be discarded if the preMatch() and postMatch()
176    * methods are moved into the MatchResult interface.
177    */
178   private Object   __originalInput;
179 
180   /**
181    * Keeps track of the begin and end offsets of the original input for
182    * the postMatch() and preMatch() methods.
183    */
184   private int __inputBeginOffset, __inputEndOffset;
185 
186   /** Used for default return value of post and pre Match() */
187   private static final String   __nullString = "";
188 
189   /**
190    * A constant passed to the {@link #split split()} methods indicating
191    * that all occurrences of a pattern should be used to split a string. 
192    */
193   public static final int SPLIT_ALL = Util.SPLIT_ALL;
194 
195   /**
196    * A secondary constructor for Perl5Util.  It initializes the Perl5Matcher
197    * used by the class to perform matching operations, but requires the
198    * programmer to provide a PatternCache instance for the class
199    * to use to compile and store regular expressions.  You would want to
200    * use this constructor if you want to change the capacity or policy 
201    * of the cache used.  Example uses might be:
202    * <pre>
203    * // We know we're going to use close to 50 expressions a whole lot, so
204    * // we create a cache of the proper size.
205    * util = new Perl5Util(new PatternCacheLRU(50));
206    * </pre>
207    * or
208    * <pre>
209    * // We're only going to use a few expressions and know that second-chance
210    * // fifo is best suited to the order in which we are using the patterns.
211    * util = new Perl5Util(new PatternCacheFIFO2(10));
212    * </pre>
213    */
214   public Perl5Util(PatternCache cache) {
215     __patternCache = cache;
216     __expressionCache = new CacheLRU(cache.capacity());
217     __compilePatterns();
218   }
219 
220   /**
221    * Default constructor for Perl5Util.  This initializes the Perl5Matcher
222    * used by the class to perform matching operations and creates a
223    * default PatternCacheLRU instance to use to compile and cache regular
224    * expressions.  The size of this cache is 
225    * GenericPatternCache.DEFAULT_CAPACITY.
226    */
227   public Perl5Util() {
228     this(new PatternCacheLRU());
229   }
230 
231   /**
232    * Compiles the patterns (currently only the match expression) used to
233    * parse Perl5 expressions.  Right now it initializes __matchPattern.
234    */
235   private void __compilePatterns() {
236     Perl5Compiler compiler = new Perl5Compiler();
237 
238     try {
239       __matchPattern = 
240     compiler.compile(__matchExpression, Perl5Compiler.SINGLELINE_MASK);
241     } catch(MalformedPatternException e) {
242       // This should only happen during debugging.
243       //e.printStackTrace();
244       throw new RuntimeException  (e.getMessage());
245     }
246   }
247 
248   /**
249    * Parses a match expression and returns a compiled pattern.
250    * First checks the expression cache and if the pattern is not found,
251    * then parses the expression and fetches a compiled pattern from the
252    * pattern cache.  Otherwise, just uses the pattern found in the
253    * expression cache.  __matchPattern is used to parse the expression.
254    * <p>
255    * @param pattern  The Perl5 match expression to parse.
256    * @exception MalformedPerl5PatternException If there is an error parsing
257    *            the expression.
258    */
259   private Pattern __parseMatchExpression(String   pattern)
260        throws MalformedPerl5PatternException 
261   {
262     int index, compileOptions;
263     String   options, regex;
264     MatchResult result;
265     Object   obj;
266     Pattern ret;
267 
268     obj = __expressionCache.getElement(pattern);
269 
270     // Must catch ClassCastException because someone might incorrectly 
271     // pass an s/// expression.  try block is cheaper than checking
272     // instanceof
273     try {
274       if(obj != null)
275     return (Pattern)obj;
276     } catch(ClassCastException   e) {
277       // Fall through and parse expression
278     }
279 
280     if(!__matcher.matches(pattern, __matchPattern))
281       throw new
282     MalformedPerl5PatternException("Invalid expression: " +
283                        pattern);
284 
285     result = __matcher.getMatch();
286 
287     regex = result.group(2);
288     compileOptions = Perl5Compiler.DEFAULT_MASK;
289 
290     options = result.group(3);
291 
292     if(options != null) {
293       index = options.length();
294 
295       while(index-- > 0) {
296     switch(options.charAt(index)) {
297     case 'i' :
298       compileOptions |= Perl5Compiler.CASE_INSENSITIVE_MASK;
299       break;
300     case 'm' : compileOptions |= Perl5Compiler.MULTILINE_MASK; break;
301     case 's' : compileOptions |= Perl5Compiler.SINGLELINE_MASK; break;
302     case 'x' : compileOptions |= Perl5Compiler.EXTENDED_MASK; break;
303     default  :
304       throw new
305         MalformedPerl5PatternException("Invalid options: " + options);
306     }
307       }
308     }
309 
310     ret = __patternCache.getPattern(regex, compileOptions);
311     __expressionCache.addElement(pattern, ret);
312 
313     return ret;
314   }
315 
316   /**
317    * Searches for the first pattern match somewhere in a character array
318    * taking a pattern specified in Perl5 native format:
319    * <blockquote><pre>
320    * [m]/pattern/[i][m][s][x]
321    * </pre></blockquote>
322    * The <code>m</code> prefix is optional and the meaning of the optional
323    * trailing options are:
324    * <dl compact> 
325    * <dt> i <dd> case insensitive match
326    * <dt> m <dd> treat the input as consisting of multiple lines
327    * <dt> s <dd> treat the input as consisting of a single line
328    * <dt> x <dd> enable extended expression syntax incorporating whitespace
329    *             and comments
330    * </dl>
331    * As with Perl, any non-alphanumeric character can be used in lieu of
332    * the slashes.
333    * <p>
334    * If the input contains the pattern, the org.apache.oro.text.regex.MatchResult
335    * can be obtained by calling {@link #getMatch()}.
336    * However, Perl5Util implements the MatchResult interface as a wrapper
337    * around the last MatchResult found, so you can call its methods to
338    * access match information.
339    * <p>
340    * @param pattern  The pattern to search for.
341    * @param input    The char[] input to search.
342    * @return True if the input contains the pattern, false otherwise.
343    * @exception MalformedPerl5PatternException  If there is an error in
344    *            the pattern.  You are not forced to catch this exception
345    *            because it is derived from RuntimeException.
346    */
347   public synchronized boolean match(String   pattern, char[] input) 
348        throws MalformedPerl5PatternException
349   {
350     boolean result;
351     __parseMatchExpression(pattern);
352 
353     result = __matcher.contains(input, __parseMatchExpression(pattern));
354              
355     if(result) {
356       __lastMatch        = __matcher.getMatch();
357       __originalInput    = input;
358       __inputBeginOffset = 0;
359       __inputEndOffset   = input.length;
360     }
361 
362     return result;
363   }
364 
365 
366   /**
367    * Searches for the first pattern match in a String taking
368    * a pattern specified in Perl5 native format:
369    * <blockquote><pre>
370    * [m]/pattern/[i][m][s][x]
371    * </pre></blockquote>
372    * The <code>m</code> prefix is optional and the meaning of the optional
373    * trailing options are:
374    * <dl compact> 
375    * <dt> i <dd> case insensitive match
376    * <dt> m <dd> treat the input as consisting of multiple lines
377    * <dt> s <dd> treat the input as consisting of a single line
378    * <dt> x <dd> enable extended expression syntax incorporating whitespace
379    *             and comments
380    * </dl>
381    * As with Perl, any non-alphanumeric character can be used in lieu of
382    * the slashes.
383    * <p>
384    * If the input contains the pattern, the org.apache.oro.text.regex.MatchResult
385    * can be obtained by calling {@link #getMatch()}.
386    * However, Perl5Util implements the MatchResult interface as a wrapper
387    * around the last MatchResult found, so you can call its methods to
388    * access match information.
389    * <p>
390    * @param pattern  The pattern to search for.
391    * @param input    The String input to search.
392    * @return True if the input contains the pattern, false otherwise.
393    * @exception MalformedPerl5PatternException  If there is an error in
394    *            the pattern.  You are not forced to catch this exception
395    *            because it is derived from RuntimeException.
396    */
397   public synchronized boolean match(String   pattern, String   input)
398        throws MalformedPerl5PatternException
399   {
400     return match(pattern, input.toCharArray());
401   }
402 
403 
404   /**
405    * Searches for the next pattern match somewhere in a
406    * org.apache.oro.text.regex.PatternMatcherInput instance, taking
407    * a pattern specified in Perl5 native format:
408    * <blockquote><pre>
409    * [m]/pattern/[i][m][s][x]
410    * </pre></blockquote>
411    * The <code>m</code> prefix is optional and the meaning of the optional
412    * trailing options are:
413    * <dl compact> 
414    * <dt> i <dd> case insensitive match
415    * <dt> m <dd> treat the input as consisting of multiple lines
416    * <dt> s <dd> treat the input as consisting of a single line
417    * <dt> x <dd> enable extended expression syntax incorporating whitespace
418    *             and comments
419    * </dl>
420    * As with Perl, any non-alphanumeric character can be used in lieu of
421    * the slashes.
422    * <p>
423    * If the input contains the pattern, the org.apache.oro.text.regex.MatchResult
424    * can be obtained by calling {@link #getMatch()}.
425    * However, Perl5Util implements the MatchResult interface as a wrapper
426    * around the last MatchResult found, so you can call its methods to
427    * access match information.
428    * After the call to this method, the PatternMatcherInput current offset
429    * is advanced to the end of the match, so you can use it to repeatedly
430    * search for expressions in the entire input using a while loop as
431    * explained in the OROMatcher<font size="-2"><sup>TM</sup></font> package.
432    * <p>
433    * @param pattern  The pattern to search for.
434    * @param input    The PatternMatcherInput to search.
435    * @return True if the input contains the pattern, false otherwise.
436    * @exception MalformedPerl5PatternException  If there is an error in
437    *            the pattern.  You are not forced to catch this exception
438    *            because it is derived from RuntimeException.
439    */
440   public synchronized boolean match(String   pattern, PatternMatcherInput input)
441        throws MalformedPerl5PatternException
442   {
443     boolean result;
444 
445     result = __matcher.contains(input, __parseMatchExpression(pattern));
446 
447     if(result) {
448       __lastMatch     = __matcher.getMatch();
449       __originalInput = input.getInput();
450       __inputBeginOffset = input.getBeginOffset();
451       __inputEndOffset   = input.getEndOffset();
452     }
453 
454     return result;
455   }
456 
457 
458   /**
459    * Returns the last match found by a call to a match(), substitute(), or
460    * split() method.  This method is only intended to retrieve the match
461    * found by the most recent call to a match() method.  This method should
462    * be used when you want to save MatchResult instances.  Otherwise, for
463    * simply accessing match information, it is more convenient to use the
464    * Perl5Util methods implementing the MatchResult interface.
465    * <p>
466    * @return The org.apache.oro.text.regex.MatchResult instance containing the
467    *         last match found.
468    */
469   public synchronized MatchResult getMatch() {
470     return __lastMatch;
471   }
472 
473 
474   /**
475    * Substitutes a pattern in a given input with a replacement string.
476    * The substitution expression is specified in Perl5 native format:
477    * <blockquote><pre>
478    * s/pattern/replacement/[g][i][m][o][s][x]
479    * </pre></blockquote>
480    * The <code>s</code> prefix is mandatory and the meaning of the optional
481    * trailing options are:
482    * <dl compact> 
483    * <dt> g <dd> Substitute all occurrences of pattern with replacement.
484    *             The default is to replace only the first occurrence.
485    * <dt> i <dd> perform a case insensitive match
486    * <dt> m <dd> treat the input as consisting of multiple lines
487    * <dt> o <dd> If variable interpolation is used, only evaluate the
488    *             interpolation once (the first time).  This is equivalent
489    *             to using a numInterpolations argument of 1 in the 
490    *             OROMatcher<font size="-2"><sup>TM</sup></font> 
491    *             Util.substitute() method.  The default is to compute
492    *             each interpolation independently.  See the
493    *             OROMatcher<font size="-2"><sup>TM</sup></font>
494    *             Util.substitute() method for more details on variable
495    *             interpolation in substitutions.
496    * <dt> s <dd> treat the input as consisting of a single line
497    * <dt> x <dd> enable extended expression syntax incorporating whitespace
498    *             and comments
499    * </dl>
500    * As with Perl, any non-alphanumeric character can be used in lieu of
501    * the slashes.  This is helpful to avoid backslashing.  For example,
502    * using slashes you would have to do:
503    * <blockquote><pre>
504    * result = util.substitute("s/foo\\/bar/goo\\/\\/baz/", input);
505    * </pre></blockquote>
506    * when you could more easily write:
507    * <blockquote><pre>
508    * result = util.substitute("s#foo/bar#goo//baz#", input);
509    * </pre></blockquote>
510    * where the hashmarks are used instead of slashes.
511    * <p>
512    * There is a special case of backslashing that you need to pay attention
513    * to.  As demonstrated above, to denote a delimiter in the substituted
514    * string it must be backslashed.  However, this can be a problem
515    * when you want to denote a backslash at the end of the substituted
516    * string.  As of PerlTools 1.3, a new means of handling this
517    * situation has been implemented.
518    * In previous versions, the behavior was that
519    * <blockquote>
520    * "... a double backslash (quadrupled in the Java String) always
521    * represents two backslashes unless the second backslash is followed
522    * by the delimiter, in which case it represents a single backslash."
523    * </blockquote>
524    * <p>
525    * The new behavior is that a backslash is always a backslash
526    * in the substitution portion of the expression unless it is used to
527    * escape a delimiter.  A backslash is considered to escape a delimiter
528    * if an even number of contiguous backslashes precede the backslash
529    * and the delimiter following the backslash is not the FINAL delimiter
530    * in the expression.  Therefore, backslashes preceding final delimiters
531    * are never considered to escape the delimiter.  The following, which
532    * used to be an invalid expression and require a special-case extra
533    * backslash, will now replace all instances of / with \:
534    * <blockquote><pre>
535    * result = util.substitute("s#/#\\#g", input);
536    * </pre></blockquote>
537    * <p>
538    * @param expression The substitution expression.
539    * @param input      The input.
540    * @return           The input after substitutions have been performed.
541    * @exception MalformedPerl5PatternException  If there is an error in
542    *            the expression.  You are not forced to catch this exception
543    *            because it is derived from RuntimeException.
544    */
545   // Expression parsing will have to be moved into a separate method if
546   // there are going to be variations of this method.
547   public synchronized String   substitute(String   expression, String   input)
548        throws MalformedPerl5PatternException 
549   {
550     boolean backslash, finalDelimiter;
551     int index, compileOptions, numSubstitutions, numInterpolations;
552     int firstOffset, secondOffset, thirdOffset;
553     String   result;
554     StringBuffer   replacement;
555     Pattern compiledPattern;
556     char exp[], delimiter;
557     ParsedSubstitutionEntry entry;
558     Perl5Substitution substitution;
559     Object   obj;
560 
561     obj = __expressionCache.getElement(expression);
562 
563   __nullTest:
564     if(obj != null) {
565       // Must catch ClassCastException because someone might incorrectly 
566       // pass an m// expression.  try block is cheaper than checking
567       // instanceof.  We want to go ahead with parsing just in case so
568       // we break.
569       try {
570     entry = (ParsedSubstitutionEntry)obj;
571       } catch(ClassCastException   e) {
572     break __nullTest;
573       }
574 
575       result = Util.substitute(__matcher, entry._pattern, entry._substitution,
576                    input, entry._numSubstitutions);
577 
578       __lastMatch = __matcher.getMatch();
579 
580       return result;
581     }
582 
583     exp = expression.toCharArray();
584 
585     // Make sure basic conditions for a valid substitution expression hold.
586     if(exp.length < 4 || exp[0] != 's' || Character.isLetterOrDigit(exp[1])
587        || exp[1] == '-')
588       throw new
589     MalformedPerl5PatternException("Invalid expression: " + expression);
590     delimiter    = exp[1];
591     firstOffset  = 2;
592     secondOffset = thirdOffset = -1;
593     backslash    = false;
594 
595     // Parse pattern
596     for(index = firstOffset; index < exp.length; index++) {
597       if(exp[index] == '\\')
598     backslash = !backslash;
599       else if(exp[index] == delimiter && !backslash) {
600     secondOffset = index;
601     break;
602       } else if(backslash) 
603     backslash = !backslash;
604     }
605 
606     if(secondOffset == -1 || secondOffset == exp.length - 1)
607       throw new
608     MalformedPerl5PatternException("Invalid expression: " + expression);
609 
610     // Parse replacement string
611 
612     backslash = false;
613     finalDelimiter = true;
614     replacement = new StringBuffer  (exp.length - secondOffset);
615     for(index = secondOffset + 1; index < exp.length; index++) {
616       if(exp[index] == '\\') {
617     backslash = !backslash;
618 
619     // 05/05/99 dfs
620     // We unbackslash backslashed delimiters in the replacement string
621     // only if we're on an odd backslash and there is another occurrence
622     // of a delimiter later in the string.
623     if(backslash && index + 1 < exp.length && exp[index + 1] == delimiter
624       && expression.lastIndexOf(delimiter, exp.length - 1) != (index + 1))
625     {
626       finalDelimiter = false;
627       continue;
628     }
629       } else if(exp[index] == delimiter && finalDelimiter) {
630     thirdOffset = index;
631     break;
632       } else {
633     backslash      = false;
634     finalDelimiter = true;
635       }
636 
637       replacement.append(exp[index]);
638     }
639 
640     if(thirdOffset == -1)
641       throw new
642     MalformedPerl5PatternException("Invalid expression: " + expression);
643 
644     compileOptions    = Perl5Compiler.DEFAULT_MASK;
645     numSubstitutions  = 1;
646 
647     // Single quotes cause no interpolations to be performed in replacement
648     if(delimiter != '\'')
649       numInterpolations = Perl5Substitution.INTERPOLATE_ALL;
650     else
651       numInterpolations = Perl5Substitution.INTERPOLATE_NONE;
652 
653     // Parse options
654     for(index = thirdOffset + 1; index < exp.length; index++) {
655       switch(exp[index]) {
656       case 'i' :
657     compileOptions |= Perl5Compiler.CASE_INSENSITIVE_MASK;
658     break;
659       case 'm' : compileOptions |= Perl5Compiler.MULTILINE_MASK; break;
660       case 's' : compileOptions |= Perl5Compiler.SINGLELINE_MASK; break;
661       case 'x' : compileOptions |= Perl5Compiler.EXTENDED_MASK; break;
662       case 'g' : numSubstitutions = Util.SUBSTITUTE_ALL; break;
663       case 'o' : numInterpolations = 1; break;
664       default  :
665     throw new
666       MalformedPerl5PatternException("Invalid option: " + exp[index]);
667       }
668     }
669 
670     compiledPattern =
671       __patternCache.getPattern(new String  (exp, firstOffset,
672                        secondOffset - firstOffset),
673                 compileOptions);
674     substitution =
675       new Perl5Substitution(replacement.toString(), numInterpolations);
676     entry = new ParsedSubstitutionEntry(compiledPattern, substitution,
677                     numSubstitutions);
678     __expressionCache.addElement(expression, entry);
679 
680     result = Util.substitute(__matcher, compiledPattern, substitution,
681                  input, numSubstitutions);
682 
683     __lastMatch = __matcher.getMatch();
684 
685     return result;
686   }
687 
688 
689   /**
690    * Splits a String into strings contained in a Vector of size no greater
691    * than a specified limit.  The String is split using a regular expression
692    * as the delimiter.  The regular expression is a pattern specified
693    * in Perl5 native format:
694    * <blockquote><pre>
695    * [m]/pattern/[i][m][s][x]
696    * </pre></blockquote>
697    * The <code>m</code> prefix is optional and the meaning of the optional
698    * trailing options are:
699    * <dl compact> 
700    * <dt> i <dd> case insensitive match
701    * <dt> m <dd> treat the input as consisting of multiple lines
702    * <dt> s <dd> treat the input as consisting of a single line
703    * <dt> x <dd> enable extended expression syntax incorporating whitespace
704    *             and comments
705    * </dl>
706    * As with Perl, any non-alphanumeric character can be used in lieu of
707    * the slashes.
708    * <p>
709    * The limit parameter causes the string to be split on at most the first
710    * <b>limit - 1</b> number of pattern occurrences.
711    * <p>
712    * Of special note is that this split method performs EXACTLY the same
713    * as the Perl split() function.  In other words, if the split pattern
714    * contains parentheses, additional Vector elements are created from
715    * each of the matching subgroups in the pattern.  Using an example
716    * similar to the one from the Camel book:
717    * <blockquote><pre>
718    * split("/([,-])/", "8-12,15,18")
719    * </pre></blockquote>
720    * produces the Vector containing:
721    * <blockquote><pre>
722    * { "8", "-", "12", ",", "15", ",", "18" }
723    * </pre></blockquote>
724    * The Util.split() method in the
725    * OROMatcher<font size="-2"><sup>TM</sup></font> package does NOT
726    * implement this particular behavior because it is intended to
727    * be usable with Pattern instances other than Perl5Pattern.
728    * <p>
729    * @param pattern The regular expression to use as a split delimiter.
730    * @param input The String to split.
731    * @param limit The limit on the size of the returned <code>Vector</code>.
732    *   Values <= 0 produce the same behavior as the SPLIT_ALL constant which
733    *   causes the limit to be ignored and splits to be performed on all
734    *   occurrences of the pattern.  You should use the SPLIT_ALL constant
735    *   to achieve this behavior instead of relying on the default behavior
736    *   associated with non-positive limit values.
737    * @return A <code> Vector </code> containing the substrings of the input
738    *    that occur between the regular expression delimiter occurences. The
739    *    input will not be split into any more substrings than the specified 
740    *    limit. A way of thinking of this is that only the first
741    *    <b>limit - 1</b>
742    *    matches of the delimiting regular expression will be used to split the
743    *    input. 
744    * @exception MalformedPerl5PatternException  If there is an error in
745    *            the expression.  You are not forced to catch this exception
746    *            because it is derived from RuntimeException.
747    */
748   public synchronized Vector split(String   pattern, String   input, int limit)
749        throws MalformedPerl5PatternException 
750   {
751     int beginOffset, groups, index;
752     String   group;
753     Vector results = new Vector(20);
754     MatchResult currentResult = null;
755     PatternMatcherInput pinput;
756     Pattern compiledPattern;
757 
758     compiledPattern = __parseMatchExpression(pattern);
759 
760     pinput = new PatternMatcherInput(input);
761     beginOffset = 0;
762 
763     while(--limit != 0 && __matcher.contains(pinput, compiledPattern)) {
764       currentResult = __matcher.getMatch();
765 
766       results.addElement(input.substring(beginOffset,
767                                          currentResult.beginOffset(0)));
768       if((groups = currentResult.groups()) > 1) {
769     for(index = 1; index < groups; ++index) {
770       group = currentResult.group(index);
771       if(group != null && group.length() > 0)
772         results.addElement(group);
773     }
774       }
775 
776       beginOffset = currentResult.endOffset(0);
777     }
778 
779     results.addElement(input.substring(beginOffset, input.length()));
780 
781     // Just for the sake of completeness
782     __lastMatch = currentResult;
783 
784     return results;       
785   }
786 
787   /**
788    * This method is identical to calling:
789    * <blockquote><pre>
790    * split(pattern, input, SPLIT_ALL);
791    * </pre></blockquote>
792    */
793   public synchronized Vector split(String   pattern, String   input)
794        throws MalformedPerl5PatternException 
795   {
796     return split(pattern, input, SPLIT_ALL);
797   }
798 
799   /**
800    * Splits input in the default Perl manner, splitting on all whitespace.
801    * This method is identical to calling:
802    * <blockquote><pre>
803    * split("/\\s+/", input);
804    * </pre></blockquote>
805    */
806   public synchronized Vector split(String   input)
807        throws MalformedPerl5PatternException 
808   {
809     return split("/\\s+/", input);
810   }
811 
812   //
813   // MatchResult interface methods.
814   //
815 
816   /**
817    * Returns the length of the last match found.
818    * <p>
819    * @return The length of the last match found.
820    */
821   public synchronized int length() {
822     return __lastMatch.length();
823   }
824 
825   /**
826    * @return The number of groups contained in the last match found.
827    *         This number includes the 0th group.  In other words, the
828    *         result refers to the number of parenthesized subgroups plus
829    *         the entire match itself.          
830    */
831   public synchronized int groups() {
832     return __lastMatch.groups();
833   }
834 
835 
836   /**
837    * Returns the contents of the parenthesized subgroups of the last match
838    * found according to the behavior dictated by the MatchResult interface.
839    * <p>
840    * @param group The pattern subgroup to return.
841    * @return A string containing the indicated pattern subgroup.  Group
842    *         0 always refers to the entire match.  If a group was never
843    *         matched, it returns null.  This is not to be confused with
844    *         a group matching the null string, which will return a String
845    *         of length 0.
846    */                       
847   public synchronized String   group(int group) {
848     return __lastMatch.group(group);
849   }
850 
851   /**
852    * Returns the begin offset of the subgroup of the last match found 
853    * relative the beginning of the match.
854    * <p>
855    * @param group The pattern subgroup.
856    * @return The offset into group 0 of the first token in the indicated
857    *         pattern subgroup.  If a group was never matched or does
858    *         not exist, returns -1.  Be aware that a group that matches
859    *         the null string at the end of a match will have an offset
860    *         equal to the length of the string, so you shouldn't blindly
861    *         use the offset to index an array or String.
862    */                                                                 
863   public synchronized int begin(int group) {
864     return __lastMatch.begin(group);
865   }
866 
867 
868   /**
869    * Returns the end offset of the subgroup of the last match found 
870    * relative the beginning of the match.
871    * <p>
872    * @param group The pattern subgroup.
873    * @return Returns one plus the offset into group 0 of the last token in
874    *         the indicated pattern subgroup.  If a group was never matched
875    *         or does not exist, returns -1.  A group matching the null
876    *         string will return its start offset.
877    */
878   public synchronized int end(int group) {
879     return __lastMatch.end(group);
880   }
881 
882 
883   /**
884    * Returns an offset marking the beginning of the last pattern match
885    * found relative to the beginning of the input from which the match
886    * was extracted.
887    * <p>
888    * @param group The pattern subgroup.
889    * @return The offset of the first token in the indicated
890    *         pattern subgroup.  If a group was never matched or does
891    *         not exist, returns -1.          
892    */
893   public synchronized int beginOffset(int group) {
894     return __lastMatch.beginOffset(group);
895   }
896 
897   /**
898    * Returns an offset marking the end of the last pattern match found
899    * relative to the beginning of the input from which the match was
900    * extracted.
901    * <p>
902    * @param group The pattern subgroup.
903    * @return Returns one plus the offset of the last token in
904    *         the indicated pattern subgroup.  If a group was never matched
905    *         or does not exist, returns -1.  A group matching the null
906    *         string will return its start offset.
907    */                   
908   public synchronized int endOffset(int group) {
909     return __lastMatch.endOffset(group);
910   }
911 
912   /**
913    * Returns the same as group(0).
914    * <p>
915    * @return A string containing the entire match.
916    */  
917   public synchronized String   toString() {
918     return __lastMatch.toString();
919   }
920 
921 
922   /**
923    * Returns the part of the input preceding that last match found.
924    * <p>
925    * @return The part of the input following the last match found.
926    */
927   public synchronized String   preMatch() {
928     int begin;
929 
930     if(__originalInput == null)
931       return __nullString;
932 
933     begin = __lastMatch.beginOffset(0);
934 
935     if(begin <= 0)
936       return __nullString;
937 
938     if(__originalInput instanceof char[]) {
939       char[] input;
940 
941       input = (char[])__originalInput;
942 
943       // Just in case we make sure begin offset is in bounds.  It should
944       // be but we're paranoid.
945       if(begin > input.length)
946     begin = input.length;
947 
948       return new String  (input, __inputBeginOffset, begin);
949     } else if(__originalInput instanceof String  ) {
950       String   input;
951 
952       input = (String  )__originalInput;
953 
954       // Just in case we make sure begin offset is in bounds.  It should
955       // be but we're paranoid.
956       if(begin > input.length())
957     begin = input.length();
958 
959       return input.substring(__inputBeginOffset, begin);
960     }
961 
962     return __nullString;
963   }
964 
965 
966   /**
967    * Returns the part of the input following that last match found.
968    * <p>
969    * @return The part of the input following the last match found.
970    */
971   public synchronized String   postMatch() {
972     int end;
973 
974     if(__originalInput == null)
975       return __nullString;
976 
977     end = __lastMatch.endOffset(0);
978 
979     if(end < 0)
980       return __nullString;
981 
982     if(__originalInput instanceof char[]) {
983       char[] input;
984 
985       input = (char[])__originalInput;
986       // Just in case we make sure begin offset is in bounds.  It should
987       // be but we're paranoid.
988       if(end >= input.length)
989     return __nullString;
990 
991       return new String  (input, end, __inputEndOffset - end);
992     } else if(__originalInput instanceof String  ) {
993       String   input;
994 
995       input = (String  )__originalInput;
996 
997       // Just in case we make sure begin offset is in bounds.  It should
998       // be but we're paranoid.
999       if(end >= input.length())
1000    return __nullString;
1001
1002      return input.substring(end, __inputEndOffset);
1003    }
1004
1005    return __nullString;
1006  }
1007
1008
1009  /**
1010   * Returns the part of the input preceding that last match found as a
1011   * char array.  This method eliminates the extra
1012   * buffer copying caused by preMatch().toCharArray().
1013   * <p>
1014   * @return The part of the input following the last match found as a char[].
1015   *         If the result is of zero length, returns null instead of a zero
1016   *         length array.
1017   */
1018  public synchronized char[] preMatchCharArray() {
1019    int begin;
1020    char[] result = null;
1021
1022    if(__originalInput == null)
1023      return null;
1024
1025    begin = __lastMatch.beginOffset(0);
1026
1027    if(begin <= 0)
1028      return null;
1029
1030    if(__originalInput instanceof char[]) {
1031      char[] input;
1032
1033      input = (char[])__originalInput;
1034
1035      // Just in case we make sure begin offset is in bounds.  It should
1036      // be but we're paranoid.
1037      if(begin >= input.length)
1038    begin = input.length;
1039
1040      result = new char[begin - __inputBeginOffset];
1041      System.arraycopy(input, __inputBeginOffset, result, 0, result.length);
1042    } else if(__originalInput instanceof String  ) {
1043      String   input;
1044
1045      input = (String  )__originalInput;
1046
1047      // Just in case we make sure begin offset is in bounds.  It should
1048      // be but we're paranoid.
1049      if(begin >= input.length())
1050    begin = input.length();
1051
1052      result = new char[begin - __inputBeginOffset];
1053      input.getChars(__inputBeginOffset, begin, result, 0);
1054    }
1055
1056    return result;
1057  }
1058
1059
1060  /**
1061   * Returns the part of the input following that last match found as a char
1062   * array.  This method eliminates the extra buffer copying caused by
1063   * preMatch().toCharArray().
1064   * <p>
1065   * @return The part of the input following the last match found as a char[].
1066   *         If the result is of zero length, returns null instead of a zero
1067   *         length array.
1068   */
1069  public synchronized char[] postMatchCharArray() {
1070    int end;
1071    char[] result = null;
1072
1073    if(__originalInput == null)
1074      return null;
1075
1076    end = __lastMatch.endOffset(0);
1077
1078    if(end < 0)
1079      return null;
1080
1081    if(__originalInput instanceof char[]) {
1082      int length;
1083      char[] input;
1084
1085      input = (char[])__originalInput;
1086      // Just in case we make sure begin offset is in bounds.  It should
1087      // be but we're paranoid.
1088      if(end >= input.length)
1089    return null;
1090
1091      length = __inputEndOffset - end;
1092      result = new char[length];
1093      System.arraycopy(input, end, result, 0, length);
1094    } else if(__originalInput instanceof String  ) {
1095      String   input;
1096
1097      input = (String  )__originalInput;
1098
1099      // Just in case we make sure begin offset is in bounds.  It should
1100      // be but we're paranoid.
1101      if(end >= __inputEndOffset)
1102    return null;
1103
1104      result = new char[__inputEndOffset - end];
1105      input.getChars(end, __inputEndOffset, result, 0);
1106    }
1107
1108    return result;
1109  }
1110
1111}
1112
1113
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags