Perl5Compiler


1   package org.apache.oro.text.regex;
2   
3   /* ====================================================================
4    * The Apache Software License, Version 1.1
5    *
6    * Copyright (c) 2000 The Apache Software Foundation.  All rights
7    * reserved.
8    *
9    * Redistribution and use in source and binary forms, with or without
10   * modification, are permitted provided that the following conditions
11   * are met:
12   *
13   * 1. Redistributions of source code must retain the above copyright
14   *    notice, this list of conditions and the following disclaimer.
15   *
16   * 2. Redistributions in binary form must reproduce the above copyright
17   *    notice, this list of conditions and the following disclaimer in
18   *    the documentation and/or other materials provided with the
19   *    distribution.
20   *
21   * 3. The end-user documentation included with the redistribution,
22   *    if any, must include the following acknowledgment:
23   *       "This product includes software developed by the
24   *        Apache Software Foundation (http://www.apache.org/)."
25   *    Alternately, this acknowledgment may appear in the software itself,
26   *    if and wherever such third-party acknowledgments normally appear.
27   *
28   * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro" 
29   *    must not be used to endorse or promote products derived from this
30   *    software without prior written permission. For written
31   *    permission, please contact apache@apache.org.
32   *
33   * 5. Products derived from this software may not be called "Apache" 
34   *    or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their 
35   *    name, without prior written permission of the Apache Software Foundation.
36   *
37   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
38   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
39   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
40   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
41   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
44   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
46   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
47   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48   * SUCH DAMAGE.
49   * ====================================================================
50   *
51   * This software consists of voluntary contributions made by many
52   * individuals on behalf of the Apache Software Foundation.  For more
53   * information on the Apache Software Foundation, please see
54   * <http://www.apache.org/>.
55   *
56   * Portions of this software are based upon software originally written 
57   * by Daniel F. Savarese. We appreciate his contributions.
58   */
59  
60  /**
61   * The Perl5Compiler class is used to create compiled regular expressions
62   * conforming to the Perl5 regular expression syntax.  It generates
63   * Perl5Pattern instances upon compilation to be used in conjunction
64   * with a Perl5Matcher instance.  Please see the user's guide for more 
65   * information about Perl5 regular expressions.
66  
67   @author <a HREF="dfs@savarese.org">Daniel F. Savarese</a>
68   @version $Id: Perl5Compiler.java,v 1.1.1.1 2000/07/23 23:08:52 jon Exp $
69  
70   * @see PatternCompiler
71   * @see MalformedPatternException
72   * @see Perl5Pattern
73   * @see Perl5Matcher
74   */
75  
76  public final class Perl5Compiler implements PatternCompiler {
77    private static final int __WORSTCASE = 0, __NONNULL = 0x1, __SIMPLE = 0x2,
78                             __SPSTART = 0x4, __TRYAGAIN = 0x8;
      // The five values above are the bit flags the parsing methods report
      // through element 0 of their int[] retFlags argument
      // (e.g. retFlags[0] |= (__NONNULL | __SIMPLE)).
79  
      // Modifier bits stored in __modifierFlags[0].  The public *_MASK constants
      // are defined from __CASE_INSENSITIVE, __MULTILINE, __SINGLELINE,
      // __EXTENDED and __READ_ONLY; __GLOBAL and __KEEP are only set through
      // __setModifierFlag ('g' and 'o').
80    private static final char
81      __CASE_INSENSITIVE = 0x0001,
82      __GLOBAL           = 0x0002,
83      __KEEP             = 0x0004,
84      __MULTILINE        = 0x0008,
85      __SINGLELINE       = 0x0010,
86      __EXTENDED         = 0x0020,
87      __READ_ONLY        = 0x8000;
88  
      // NOTE(review): __META_CHARS is not referenced in the part of the file
      // reviewed here; confirm it is still used elsewhere.
89    private static final String   __META_CHARS = "^$.[()|?+*\\";
      // Lookup table for __parseHex: a character's index in this string, masked
      // with 15, is its hexadecimal value (lower-case digits occupy 0-15,
      // upper-case digits 16-31).
      // NOTE(review): the trailing 'x' sits at index 32, so __parseHex accepts
      // it as a digit with value 0 -- confirm that is intended.
90    private static final String   __HEX_DIGIT =
91    "0123456789abcdef0123456789ABCDEFx";
      // Cursor over the pattern text; all parsing methods read from it.
92    private CharStringPointer __input;
      // Set to true by __parseAtom when it emits a backreference (_REF) node.
93    private boolean __sawBackreference;
      // Single-element holder for the modifier bits listed above.
94    private char[] __modifierFlags = { 0 };
95  
96    // IMPORTANT: __numParentheses starts out equal to 1 during compilation.
97    // It is always one greater than the number of parentheses encountered
98    // so far in the regex.  That is because it refers to the number of groups
99    // to save, and the entire match is always saved (group 0)
100   private int __numParentheses, __programSize, __cost;
101 
102   // When doing the second pass and actually generating code, __programSize
103   // keeps track of the current offset.
      // While __program is null the emit methods store nothing and only advance
      // __programSize, so the same parsing code can be used to measure the
      // program before it is written.
104   private char[] __program;
105 
106   /**
107    * The default mask for the {@link #compile compile} methods.
108    * It is equal to 0.
109    * The default behavior is for a regular expression to be case sensitive
110    * and to not specify if it is multiline or singleline.  When MULTILINE_MASK
111    * and SINGLELINE_MASK are not defined, the <b>^</b>, <b>$</b>, and <b>.</b>
112    * metacharacters are
113    * interpreted according to the value of isMultiline() in Perl5Matcher.
114    * The default behavior of Perl5Matcher is to treat the Perl5Pattern
115    * as though MULTILINE_MASK were enabled.  If isMultiline() returns false,
116    * then the pattern is treated as though SINGLELINE_MASK were set.  However,
117    * compiling a pattern with the MULTILINE_MASK or SINGLELINE_MASK masks
118    * will ALWAYS override whatever behavior is specified by the setMultiline()
119    * in Perl5Matcher.
120    */
121   public static final int DEFAULT_MASK          = 0;
122 
123   /**
124    * A mask passed as an option to the {@link #compile compile} methods
125    * to indicate a compiled regular expression should be case insensitive.
126    */
127   public static final int CASE_INSENSITIVE_MASK = __CASE_INSENSITIVE;
128 
129   /**
130    * A mask passed as an option to the  {@link #compile compile} methods
131    * to indicate a compiled regular expression should treat input as having
132    * multiple lines.  This option affects the interpretation of
133    * the <b>^</b> and <b>$</b> metacharacters.  When this mask is used,
134    * the <b>^</b> metacharacter matches at the beginning of every line,
135    * and the <b>$</b> metacharacter matches at the end of every line.
136    * Additionally the <b> . </b> metacharacter will not match newlines when
137    * an expression is compiled with <b> MULTILINE_MASK </b>, which is its
138    * default behavior.
139    * The <b>SINGLELINE_MASK</b> and <b>MULTILINE_MASK</b> should not be
140    * used together.
141    */
142   public static final int MULTILINE_MASK        = __MULTILINE;
143 
144   /**
145    * A mask passed as an option to the {@link #compile compile} methods
146    * to indicate a compiled regular expression should treat input as being
147    * a single line.  This option affects the interpretation of
148    * the <b>^</b> and <b>$</b> metacharacters.  When this mask is used,
149    * the <b>^</b> metacharacter matches at the beginning of the input,
150    * and the <b>$</b> metacharacter matches at the end of the input.
151    * The <b>^</b> and <b>$</b> metacharacters will not match at the beginning
152    * and end of lines occurring between the beginning and end of the input.
153    * Additionally, the <b> . </b> metacharacter will match newlines when
154    * an expression is compiled with <b> SINGLELINE_MASK </b>, unlike its
155    * default behavior.
156    * The <b>SINGLELINE_MASK</b> and <b>MULTILINE_MASK</b> should not be
157    * used together.
158    */
159   public static final int SINGLELINE_MASK       = __SINGLELINE;
160 
161   /**
162    * A mask passed as an option to the {@link #compile compile} methods
163    * to indicate a compiled regular expression should be treated as a Perl5
164    * extended pattern (i.e., a pattern using the <b>/x</b> modifier).  This 
165    * option tells the compiler to ignore whitespace that is not backslashed or
166    * within a character class.  It also tells the compiler to treat the
167    * <b>#</b> character as a metacharacter introducing a comment as in
168    * Perl.  In other words, the <b>#</b> character will comment out any
169    * text in the regular expression between it and the next newline.
170    * The intent of this option is to allow you to divide your patterns
171    * into more readable parts.  It is provided to maintain compatibility
172    * with Perl5 regular expressions, although it will not often
173    * make sense to use it in Java.
174    */
175   public static final int EXTENDED_MASK         = __EXTENDED;
176 
177   /**
178    * A mask passed as an option to the {@link #compile compile} methods
179    * to indicate that the resulting Perl5Pattern should be treated as a
180    * read only data structure by Perl5Matcher, making it safe to share
181    * a single Perl5Pattern instance among multiple threads without needing
182    * synchronization.  Without this option, Perl5Matcher reserves the right
183    * to store heuristic or other information in Perl5Pattern that might
184    * accelerate future matches.  When you use this option, Perl5Matcher will
185    * not store or modify any information in a Perl5Pattern.  Use this option
186    * when you want to share a Perl5Pattern instance among multiple threads
187    * using different Perl5Matcher instances.
188    */
189   public static final int READ_ONLY_MASK        = __READ_ONLY;
190 
191   /**
192    * Given a character string, returns a Perl5 expression that interprets
193    * each character of the original string literally.  In other words, all
194    * special metacharacters are quoted/escaped.  This method is useful for
195    * converting user input meant for literal interpretation into a safe
196    * regular expression representing the literal input.
197    * <p>
198    * In effect, this method is the analog of the Perl5 quotemeta() builtin
199    * method.
200    * <p>
       * Concretely, every character for which OpCode._isWordCharacter returns
       * false is preceded by a backslash; all other characters are copied
       * unchanged.  An empty array yields an empty string.
       * <p>
201    * @param expression The expression to convert.
202    * @return A String containing a Perl5 regular expression corresponding to
203    *         a literal interpretation of the pattern.
204    */
205   public static final String   quotemeta(char[] expression) {
206     int ch;
207     StringBuffer   buffer;
208 
        // Worst case every character needs a backslash, doubling the length.
209     buffer = new StringBuffer  (2*expression.length);
210     for(ch = 0; ch < expression.length; ch++) {
          // Only the backslash append below is conditional (despite its
          // indentation); the character itself is always appended.
211       if(!OpCode._isWordCharacter(expression[ch]))
212     buffer.append('\\');
213       buffer.append(expression[ch]);
214     }
215 
216     return buffer.toString();
217   }
218 
219   /**
220    * Given a character string, returns a Perl5 expression that interprets
221    * each character of the original string literally.  In other words, all
222    * special metacharacters are quoted/escaped.  This method is useful for
223    * converting user input meant for literal interpretation into a safe
224    * regular expression representing the literal input.
225    * <p>
226    * In effect, this method is the analog of the Perl5 quotemeta() builtin
227    * method.
228    * <p>
       * This overload converts the String to a char array and delegates to
       * {@link #quotemeta(char[])}.
       * <p>
229    * @param expression The expression to convert.
230    * @return A String containing a Perl5 regular expression corresponding to
231    *         a literal interpretation of the pattern.
232    */
233   public static final String   quotemeta(String   expression) {
234     return quotemeta(expression.toCharArray());
235   }
236 
237   private static boolean __isSimpleRepetitionOp(char ch) {
        // True only for the single-character quantifiers '*', '+' and '?'.
238     return (ch == '*' || ch == '+' || ch == '?');
239   }
240 
241   private static boolean __isComplexRepetitionOp(char[] ch, int offset) {
        // True when a quantifier starts at ch[offset]: '*', '+', '?', or a '{'
        // that __parseRepetition accepts as a well-formed {n}, {n,} or {n,m}.
        // An offset outside the array is answered with false rather than an
        // exception.
242     if(offset < ch.length && offset >= 0)
243        return (ch[offset] == '*' || ch[offset] == '+' || ch[offset] == '?'
244            || (ch[offset] == '{' && __parseRepetition(ch, offset)));
245     return false;
246   }
247 
248   // Determines if {\d+,\d*} is the next part of the string.
      // The comma is optional, so {\d+} is accepted as well.  Only the shape is
      // tested; the numbers are not returned and the caller's offset is not
      // changed (offset is a local copy).  Returns false when str[offset] is
      // not '{', when no digit follows it, or when the closing '}' is missing.
      // The caller must pass an offset that is inside str.
249   private static boolean __parseRepetition(char[] str, int offset) {
250     if(str[offset] != '{')
251       return false;
252     ++offset;
253 
        // At least one digit is required after the opening brace.
254     if(offset >= str.length || !Character.isDigit(str[offset]))
255       return false;
256 
257     while(offset < str.length && Character.isDigit(str[offset]))
258       ++offset;
259 
        // Optional comma, followed by zero or more digits.
260     if(offset < str.length && str[offset] == ',')
261       ++offset;
262 
263     while(offset < str.length && Character.isDigit(str[offset]))
264       ++offset;
265 
266     if(offset >= str.length || str[offset] != '}')
267       return false;
268 
269     return true;
270   }
271 
272   private static int __parseHex(char[] str, int offset, int maxLength,
273                 int[] scanned)
274   {
        // Reads up to maxLength characters of str starting at offset, for as
        // long as each one is found in __HEX_DIGIT, and returns their value as
        // a hexadecimal number.  scanned[0] receives the number of characters
        // consumed (0 if the first character is not accepted, in which case the
        // return value is 0).
275     int val = 0, index;
276 
277     scanned[0] = 0;
278     while(offset < str.length && maxLength-- > 0 &&
279       (index = __HEX_DIGIT.indexOf(str[offset])) != -1) {
280       val <<= 4;
          // index & 15 folds the upper-case half of __HEX_DIGIT (indices 16-31)
          // onto the same 0-15 values as the lower-case half.
281       val |= (index & 15);
282       ++offset;
283       ++scanned[0];
284     }
285 
286     return val;
287   }
288 
289   private static int __parseOctal(char[] str, int offset, int maxLength,
290                  int[] scanned)
291   {
        // Reads up to maxLength characters of str starting at offset, for as
        // long as each one is in '0'..'7', and returns their value as an octal
        // number.  scanned[0] receives the number of characters consumed (0 if
        // the first character is not an octal digit, in which case the return
        // value is 0).
        // NOTE(review): the local `index` is declared but never used.
292     int val = 0, index;
293 
294     scanned[0] = 0;
295     while(offset < str.length && 
296       maxLength > 0 && str[offset] >= '0' && str[offset] <= '7') {
297       val <<= 3;
298       val |= (str[offset] - '0');
299       --maxLength;
300       ++offset;
301       ++scanned[0];
302     }
303 
304     return val;
305   }
306 
307   private static void __setModifierFlag(char[] flags, char ch) {
        // ORs the modifier bit named by the letter ch (one of i, g, o, m, s, x)
        // into flags[0].  Any other letter is silently ignored.
308     switch(ch) {
309     case 'i' : flags[0] |= __CASE_INSENSITIVE; return;
310     case 'g' : flags[0] |= __GLOBAL; return;
311     case 'o' : flags[0] |= __KEEP; return;
312     case 'm' : flags[0] |= __MULTILINE; return;
313     case 's' : flags[0] |= __SINGLELINE; return;
314     case 'x' : flags[0] |= __EXTENDED; return;
315     }
316   }
317 
318   // Emit a specific character code.
      // While __program is null nothing is stored; in either case
      // __programSize advances by one slot.
319   private void __emitCode(char code) {
320 
321     if(__program != null)
322       __program[__programSize] = code;
323 
324     ++__programSize;
325   }
326 
327 
328   // Emit an operator with no arguments.
329   // Return an offset into the __program array as a pointer to node.
      // A node occupies two slots: the operator followed by a next-pointer
      // slot initialised to OpCode._NULL_POINTER.  While __program is null
      // only __programSize is advanced (by 2).
330   private int __emitNode(char operator) {
331     int offset;
332 
        // The returned pointer is the position of the operator slot.
333     offset = __programSize;
334 
335     if(__program == null)
336       __programSize+=2;
337     else {
338       __program[__programSize++] = operator;
339       __program[__programSize++] = OpCode._NULL_POINTER;
340     }
341 
342     return offset;
343   }
344 
345 
346   // Emit an operator with arguments.
347   // Return an offset into the __program array as a pointer to node.
      // The node occupies three slots: the operator, a next-pointer slot
      // initialised to OpCode._NULL_POINTER, and the single argument.  While
      // __program is null only __programSize is advanced (by 3).
348   private int __emitArgNode(char operator, char arg) {
349     int offset;
350 
351     offset = __programSize;
352 
353     if(__program== null)
354       __programSize+=3;
355     else {
356       __program[__programSize++] = operator;
357       __program[__programSize++] = OpCode._NULL_POINTER;
358       __program[__programSize++] = arg;
359     }
360 
361     return offset;
362   }
363 
364 
365   // Insert an operator at a given offset.
      // Opens a gap at index `operand` by shifting everything from there to
      // the end of the program upwards, then writes the operator followed by
      // OpCode._NULL_POINTER in every remaining slot of the gap.  The gap is 2
      // slots wide, or 4 when the operator's type is OpCode._CURLY.  While
      // __program is null only __programSize is advanced.
366   private void __programInsertOperator(char operator, int operand) {
367     int src, dest, offset;
368 
        // Number of extra slots needed beyond operator + next pointer.
369     offset = (OpCode._opType[operator] == OpCode._CURLY ? 2 : 0);
370 
371 
372     if(__program== null) {
373       __programSize+=(2 + offset);
374       return;
375     }
376 
377     src = __programSize;
378     __programSize+=(2 + offset);
379     dest = __programSize;
380 
        // Copy from the top down so that no slot is overwritten before it has
        // been moved.
381     while(src > operand) {
382       --src;
383       --dest;
384       __program[dest] = __program[src];
385     }
386 
387     __program[operand++] = operator;
388     __program[operand++] = OpCode._NULL_POINTER;
389 
390     while(offset-- > 0)
391       __program[operand++] = OpCode._NULL_POINTER;
392 
393   }
394 
395 
396 
397   private void __programAddTail(int current, int value) {
        // Follows the chain of next pointers starting at node `current` (via
        // OpCode._getNext) to its last node and makes that node point at
        // `value`.  The pointer is stored in the slot after the operator as a
        // distance: value - scan, or scan - value when the last node is an
        // OpCode._BACK node.  Does nothing while __program is null or when
        // current is OpCode._NULL_OFFSET.
398     int scan, temp, offset;
399 
400     if(__program== null || current == OpCode._NULL_OFFSET)
401       return;
402 
403     scan = current;
404 
        // Walk to the node whose next pointer is still unset.
405     while(true) {
406       temp = OpCode._getNext(__program, scan);
407       if(temp == OpCode._NULL_OFFSET)
408     break;
409       scan = temp;
410     }
411 
412     if(__program[scan] == OpCode._BACK)
413       offset = scan - value;
414     else
415       offset = value - scan;
416 
        // The distance is narrowed to a char to fit the program array.
417     __program[scan + 1] = (char)offset;
418   }
419 
420 
421   private void __programAddOperatorTail(int current, int value) {
        // Like __programAddTail, but applied to the chain that starts at
        // OpCode._getNextOperator(current), and only when the node at `current`
        // has type OpCode._BRANCH.  Does nothing while __program is null or
        // when current is OpCode._NULL_OFFSET.
422     if(__program== null || current == OpCode._NULL_OFFSET ||
423        OpCode._opType[__program[current]] != OpCode._BRANCH)
424       return;
425     __programAddTail(OpCode._getNextOperator(current), value);
426   }
427 
428 
429   private char __getNextChar() {
        // Returns the value of __input._postIncrement() (presumably the
        // character at the current position, with the pointer then advanced --
        // confirm against CharStringPointer), after first moving the input
        // past anything that should be invisible to the parser:
        //   - (?#...) comments, always;
        //   - whitespace and #-to-end-of-line comments, when the __EXTENDED
        //     modifier is set.
430     char ret, value;
431 
432     ret = __input._postIncrement();
433 
        // Keep skipping until the input rests on something significant; each
        // `continue` re-examines the new current character.
434     while(true) {
435       value = __input._getValue();
436 
437       if(value == '(' && __input._getValueRelative(1) == '?' &&
438      __input._getValueRelative(2) == '#') {
439     // Skip comments
440     while(value != CharStringPointer._END_OF_STRING && value != ')')
441       value = __input._increment();
        // Step over the terminator the loop above stopped on.
442     __input._increment();
443     continue;
444       }
445 
446       if((__modifierFlags[0] & __EXTENDED) != 0) {
447     if(Character.isWhitespace(value)) {
448       __input._increment();
449       continue;
450     } else if(value == '#') {
          // Extended-mode comment: runs to the next newline or end of input.
451       while(value != CharStringPointer._END_OF_STRING && value != '\n')
452         value = __input._increment();
453       __input._increment();
454       continue;
455     }
456       }
457 
458       // System.err.println("next: " + ret + " last: " + __input._getValue()); // debug
459 
460 
461       return ret;
462     }
463 
464   }
465 
466 
467   private int __parseAlternation(int[] retFlags)
468     throws MalformedPatternException 
469   {
        // Emits a _BRANCH node and then calls __parseBranch repeatedly until
        // the input reaches its end, a '|' or a ')'.  Each node returned after
        // the first is linked to the previous one with __programAddTail (and
        // __cost is incremented).  If nothing was parsed, a _NOTHING node is
        // emitted so the branch is not empty.
        //
        // Returns the offset of the _BRANCH node, or OpCode._NULL_OFFSET as
        // soon as __parseBranch returns it.  retFlags[0] is reset to
        // __WORSTCASE on entry and is also the array handed to __parseBranch.
        //
        // NOTE(review): the local `flags` is never assigned anything other
        // than 0 (it is only masked with ~__TRYAGAIN), so the __TRYAGAIN test
        // and the two `retFlags[0] |= (flags & ...)` statements below have no
        // effect as written.  Confirm whether __parseBranch was meant to
        // receive a separate flags holder.
470     int chain, offset, latest;
471     int flags = 0;
472     char value;
473 
474     retFlags[0] = __WORSTCASE;
475 
476     offset = __emitNode(OpCode._BRANCH);
477 
478     chain  = OpCode._NULL_OFFSET;
479 
        // Step back one position and go through __getNextChar again, so that
        // its comment/whitespace skipping is applied at the current position
        // (presumably the purpose -- confirm).
480     if(__input._getOffset() == 0) {
481       __input._setOffset(-1);
482       __getNextChar();
483     } else {
484       __input._decrement();
485       __getNextChar();
486     }
487 
488     value = __input._getValue();
489 
490     while(value != CharStringPointer._END_OF_STRING &&
491       value != '|' && value != ')') {
492       flags &= ~__TRYAGAIN;
493       latest = __parseBranch(retFlags);
494 
495       if(latest == OpCode._NULL_OFFSET) {
496     if((flags & __TRYAGAIN) != 0){
497       value = __input._getValue();
498       continue;
499     }
500     return OpCode._NULL_OFFSET;
501       }
502 
503       retFlags[0] |= (flags & __NONNULL);
504 
          // chain is still unset only for the first node parsed.
505       if(chain == OpCode._NULL_OFFSET)
506     retFlags[0] |= (flags & __SPSTART);
507       else {
508     ++__cost;
509     __programAddTail(chain, latest);
510       }
511       chain = latest;
512       value = __input._getValue();
513     }
514 
515     // If loop was never entered.
516     if(chain == OpCode._NULL_OFFSET)
517       __emitNode(OpCode._NOTHING);
518 
519     return offset;
520   }
521 
522 
523   private int __parseAtom(int[] retFlags) throws MalformedPatternException {
        // Parses one atom at the current __input position and emits the
        // node(s) for it: an anchor, '.', a character class, a parenthesised
        // sub-expression, a backslash escape, a backreference, or a run of
        // literal characters (an _EXACTLY node).
        //
        // Returns the program offset of the emitted node.  Returns
        // OpCode._NULL_OFFSET when a nested __parseExpression call returned it
        // without requesting a retry, or when '|' / ')' is met after a retry
        // was requested (in that case __TRYAGAIN is set in retFlags[0]).
        //
        // retFlags[0] is reset to __WORSTCASE on entry and then OR-ed with
        // __NONNULL, __SIMPLE and/or __SPSTART as shown in each case.
        //
        // Throws MalformedPatternException for a misplaced '|' or ')', for a
        // quantifier with nothing before it, for a trailing backslash, and for
        // an invalid backreference.
524     boolean doDefault;
525     char value;
526     int offset, flags[] = { 0 };
527     
528     
529     retFlags[0] = __WORSTCASE;
530     doDefault = false;
531     offset = OpCode._NULL_OFFSET;
532 
        // First stage: dispatch on the current character.  Cases that fully
        // handle the atom emit their node and break out; cases that turn out
        // to be literal text set doDefault and leave the work to the second
        // stage below.  `continue tryAgain` restarts the dispatch.
533   tryAgain:
534     while(true) {
535 
536       value = __input._getValue();
537 
538       switch(value) {
539       case '^' :
540     __getNextChar();
541     // The order here is important in order to support /ms.
542     // /m takes precedence over /s for ^ and $, but not for .
543     if((__modifierFlags[0] & __MULTILINE) != 0)
544       offset = __emitNode(OpCode._MBOL);
545     else if((__modifierFlags[0] & __SINGLELINE) != 0)
546       offset = __emitNode(OpCode._SBOL);
547     else
548       offset = __emitNode(OpCode._BOL);
549     break tryAgain;
550 
551       case '$':
552     __getNextChar();
553     // The order here is important in order to support /ms.
554     // /m takes precedence over /s for ^ and $, but not for .
555     if((__modifierFlags[0] & __MULTILINE) != 0)
556       offset = __emitNode(OpCode._MEOL);
557     else if((__modifierFlags[0] & __SINGLELINE) != 0)
558       offset = __emitNode(OpCode._SEOL);
559     else
560       offset = __emitNode(OpCode._EOL);
561     break tryAgain;
562 
563       case '.':
564     __getNextChar();
565     // The order here is important in order to support /ms.
566     // /m takes precedence over /s for ^ and $, but not for .
567     if((__modifierFlags[0] & __SINGLELINE) != 0)
568       offset = __emitNode(OpCode._SANY);
569     else
570       offset = __emitNode(OpCode._ANY);
571     ++__cost;
572     retFlags[0] |= (__NONNULL | __SIMPLE);
573     break tryAgain;
574 
575       case '[':
        // Note the plain _increment() here: __getNextChar() is not used, so
        // its comment/whitespace skipping is not applied before the class
        // body is parsed.
576     __input._increment();
577     offset = __parseCharacterClass();
578     retFlags[0] |= (__NONNULL | __SIMPLE);
579     break tryAgain;
580 
581       case '(':
582     __getNextChar();
583     offset = __parseExpression(true, flags);
584     if(offset == OpCode._NULL_OFFSET) {
          // The nested parse produced no node; retry the dispatch only if it
          // asked for that through __TRYAGAIN.
585       if((flags[0] & __TRYAGAIN) != 0)
586         continue tryAgain;
587       return OpCode._NULL_OFFSET;
588     }
589     retFlags[0] |= (flags[0] & (__NONNULL | __SPSTART));
590     break tryAgain;
591 
592       case '|':
593       case ')':
        // Only reachable without error after a retry was requested by the
        // '(' case above; the request is passed on to the caller.
594     if((flags[0] & __TRYAGAIN) != 0) {
595       retFlags[0] |= __TRYAGAIN;
596       return OpCode._NULL_OFFSET;
597     }
598 
599     throw new MalformedPatternException("Error in expression at " +
600                    __input._toString(__input._getOffset()));
601     //break tryAgain;
602 
603       case '?':
604       case '+':
605       case '*':
606     throw new MalformedPatternException(
607                  "?+* follows nothing in expression");
608     //break tryAgain;
609 
610       case '\\':
        // Move onto the character after the backslash and dispatch on it.
611     value = __input._increment();
612 
613     switch(value) {
614     case 'A' :
615       offset = __emitNode(OpCode._SBOL);
616       retFlags[0] |= __SIMPLE;
617       __getNextChar();
618       break;
619     case 'G':
620       offset = __emitNode(OpCode._GBOL);
621       retFlags[0] |= __SIMPLE;
622       __getNextChar();
623       break;
624     case 'Z':
625       offset = __emitNode(OpCode._SEOL);
626       retFlags[0] |= __SIMPLE;
627       __getNextChar();
628       break;
629     case 'w':
630       offset = __emitNode(OpCode._ALNUM);
631       retFlags[0] |= (__NONNULL | __SIMPLE);
632       __getNextChar();
633       break;
634     case 'W':
635       offset = __emitNode(OpCode._NALNUM);
636       retFlags[0] |= (__NONNULL | __SIMPLE);
637       __getNextChar();
638       break;
639     case 'b':
640       offset = __emitNode(OpCode._BOUND);
641       retFlags[0] |= __SIMPLE;
642       __getNextChar();
643       break;
644     case 'B':
645       offset = __emitNode(OpCode._NBOUND);
646       retFlags[0] |= __SIMPLE;
647       __getNextChar();
648       break;
649     case 's':
650       offset = __emitNode(OpCode._SPACE);
651       retFlags[0] |= (__NONNULL | __SIMPLE);
652       __getNextChar();
653       break;
654     case 'S':
655       offset = __emitNode(OpCode._NSPACE);
656       retFlags[0] |= (__NONNULL | __SIMPLE);
657       __getNextChar();
658       break;
659     case 'd':
660       offset = __emitNode(OpCode._DIGIT);
661       retFlags[0] |= (__NONNULL | __SIMPLE);
662       __getNextChar();
663       break;
664     case 'D':
665       offset = __emitNode(OpCode._NDIGIT);
666       retFlags[0] |= (__NONNULL | __SIMPLE);
667       __getNextChar();
668       break;
          // Escapes that stand for a literal character are decoded by the
          // literal-run code in the second stage.
669     case 'n': case 'r': case 't': case 'f': case 'e': case 'a': case 'x':
670     case 'c': case '0':
671       doDefault = true;
672       break tryAgain;
673     case '1': case '2': case '3': case '4': case '5': case '6': case '7':
674     case '8': case '9':
675       int num;
676       StringBuffer   buffer = new StringBuffer  (10);
677 
          // Collect all consecutive digits, starting with the current one,
          // without moving the input pointer.
678       num = 0;
679       value = __input._getValueRelative(num);
680 
681       while(Character.isDigit(value)) {
682         buffer.append(value);
683         ++num;
684         value = __input._getValueRelative(num);
685       }
686 
687       try {
688         num = Integer.parseInt(buffer.toString());
689       } catch(NumberFormatException   e) {
690         throw new MalformedPatternException(
691        "Unexpected number format exception.  Please report this bug." +
692        "NumberFormatException message: " + e.getMessage());
693       }
694 
          // A multi-digit number that does not name an existing group is not
          // a backreference; it is handed to the second stage instead.
695       if(num > 9 && num >= __numParentheses) {
696         doDefault = true;
697         break tryAgain;
698       } else {
699         // A backreference may only occur AFTER its group
700         if(num >= __numParentheses)
701           throw new MalformedPatternException("Invalid backreference: \\" +
702                           num);
703         __sawBackreference = true;
704         offset = __emitArgNode(OpCode._REF, (char)num);
705         retFlags[0] |= __NONNULL;
706 
            // Move the input past the digits just consumed, then step back
            // one so that __getNextChar() performs the final advance.
707         value = __input._getValue();
708         while(Character.isDigit(value))
709           value = __input._increment();
710 
711         __input._decrement();
712         __getNextChar();
713       }
714       break;
715     case '\0':
716     case CharStringPointer._END_OF_STRING:
717       if(__input._isAtEnd())
718         throw new
719           MalformedPatternException("Trailing \\ in expression.");
720       // fall through to default
721     default:
722       doDefault = true;
723       break tryAgain;
724     }
725     break tryAgain;
726 
727       case '#':
728     // skip over comments
729     if((__modifierFlags[0] & __EXTENDED) != 0) {
730       while(!__input._isAtEnd() && __input._getValue() != '\n')
731         __input._increment();
732       if(!__input._isAtEnd())
733         continue tryAgain;
734     }
735     // fall through to default
736       default:
737     __input._increment();
738     doDefault = true;
739     break tryAgain;
740       }// end master switch
741     } // end tryAgain
742 
743 
        // Second stage: emit an _EXACTLY node holding a run of literal
        // characters.
744     if(doDefault) {
745       char ender;
746       int length, pOffset, maxOffset, lastOffset, numLength[];
747 
748       offset = __emitNode(OpCode._EXACTLY);
749       // Not sure that it's ok to use 0 to mark end.
750       //__emitCode((char)0);
          // Placeholder slot; the run length is written over the node's
          // operand slot once it is known (see the end of this block).
751       __emitCode((char)CharStringPointer._END_OF_STRING);
752 
          // pOffset starts one position before the current input offset: the
          // paths that set doDefault leave the input one past the first
          // character of the run (the default case has stepped over it, the
          // backslash case has stepped from the backslash onto the escaped
          // character).  A single node holds at most 127 characters.
753     forLoop:
754       for(length = 0, pOffset = __input._getOffset() - 1,
755         maxOffset = __input._getLength();
756       length < 127 && pOffset < maxOffset; ++length) {
757 
        // Remember where this character started so it can be given back if
        // a quantifier turns out to follow it.
758     lastOffset = pOffset;
759     value = __input._getValue(pOffset);
760 
761     switch(value) {
        // A metacharacter ends the literal run.
762     case '^': case '$': case '.': case '[': case '(': case ')':
763     case '|':
764       break forLoop;
765     case '\\':
766       value = __input._getValue(++pOffset);
767 
768       switch(value) {
          // Escapes that are atoms of their own end the run; pOffset is moved
          // back onto the backslash so the next atom starts there.
769       case 'A': case 'G': case 'Z': case 'w': case 'W': case 'b':
770       case 'B': case 's': case 'S': case 'd': case 'D':
771         --pOffset;
772         break forLoop;
773       case 'n':
774         ender = '\n';
775         ++pOffset;
776         break;
777       case 'r':
778         ender = '\r';
779         ++pOffset;
780         break;
781       case 't':
782         ender = '\t';
783         ++pOffset;
784         break;
785       case 'f':
786         ender = '\f';
787         ++pOffset;
788         break;
789       case 'e':
790         ender = '\033';
791         ++pOffset;
792         break;
793       case 'a':
794         ender = '\007';
795         ++pOffset;
796         break;
797       case 'x':
            // \x followed by up to two hex digits.
798         numLength = new int[1];
799         ender = (char)__parseHex(__input._array, ++pOffset, 2, numLength);
800         pOffset+=numLength[0];
801         break;
802       case 'c':
            // \cX control character: upper-case X, then flip bit 6.
803         ++pOffset;
804         ender = __input._getValue(pOffset++);
805         if(Character.isLowerCase(ender))
806           ender = Character.toUpperCase(ender);
807         ender ^= 64;
808         break;
809       case '0': case '1': case '2': case'3': case '4': case '5':
810       case '6': case '7': case '8': case '9':
            // Decide between an octal escape and a backreference: a leading
            // '0' is always octal; a multi-digit number is octal when it is
            // not smaller than __numParentheses.  Anything else ends the run
            // so the first stage can treat it as a backreference.
811         boolean doOctal = false;
812         value = __input._getValue(pOffset);
813 
814         if(value == '0')
815           doOctal = true;
816         value = __input._getValue(pOffset + 1);
817 
818         if(Character.isDigit(value)) {
819           int num;
820           StringBuffer   buffer = new StringBuffer  (10);
821 
822           num = pOffset;
823           value = __input._getValue(num);
824 
825           while(Character.isDigit(value)){
826         buffer.append(value);
827         ++num;
828         value = __input._getValue(num);
829           }
830 
831           try {
832         num = Integer.parseInt(buffer.toString());
833           } catch(NumberFormatException   e) {
834         throw new MalformedPatternException(
835          "Unexpected number format exception.  Please report this bug." +
836          "NumberFormatException message: " + e.getMessage());
837           }
838 
839           if(!doOctal)
840         doOctal = (num >= __numParentheses);
841         }
842 
843         if(doOctal) {
              // Up to three octal digits.
844           numLength = new int[1];
845           ender = (char)__parseOctal(__input._array, pOffset, 3, numLength);
846           pOffset+=numLength[0];
847         } else {
848           --pOffset;
849           break forLoop;
850         }
851         break;
852 
853       case CharStringPointer._END_OF_STRING:
854       case '\0':
855         if(pOffset >= maxOffset)
856           throw new
857         MalformedPatternException("Trailing \\ in expression.");
858         // fall through to default
859       default:
            // Any other escaped character stands for itself.
860         ender = __input._getValue(pOffset++);
861         break;
862       } // end backslash switch
863       break;
864 
865     case '#':
866       if((__modifierFlags[0] & __EXTENDED) != 0) {
867         while(pOffset < maxOffset && __input._getValue(pOffset) != '\n')
868           ++pOffset;
869       }
870       // fall through to whitespace handling
871     case ' ': case '\t': case '\n': case '\r': case '\f': case '\013':
872       if((__modifierFlags[0] & __EXTENDED) != 0) {
            // Skipped characters do not count towards the run length: the
            // decrement cancels the loop's ++length.
873         ++pOffset;
874         --length;
875         continue;
876       }
877       // fall through to default
878     default:
879       ender = __input._getValue(pOffset++);
880       break;
881 
882     }   // end master switch
883 
        // Case-insensitive patterns store upper-case literals in lower case.
884     if((__modifierFlags[0] & __CASE_INSENSITIVE) != 0 &&
885        Character.isUpperCase(ender))
886       ender = Character.toLowerCase(ender);
887 
        // A quantifier follows this character.  If the run already has
        // characters, give this one back so it becomes its own atom;
        // otherwise emit it as a one-character run.
888     if(pOffset < maxOffset && __isComplexRepetitionOp(__input._array, pOffset)) {
889       if(length > 0)
890         pOffset = lastOffset;
891       else {
892         ++length;
893         __emitCode(ender);
894       }
895       break;
896     }
897 
898     __emitCode(ender);
899 
900 
901       } // end for loop
902 
903 
          // Resynchronise the input with pOffset, going through __getNextChar
          // for the final step.
904       __input._setOffset(pOffset - 1);
905       __getNextChar();
906 
907       if(length < 0)
908     throw new MalformedPatternException(
909          "Unexpected compilation failure.  Please report this bug!");
910       if(length > 0)
911     retFlags[0] |= __NONNULL;
912       if(length == 1)
913     retFlags[0] |= __SIMPLE;
          // Record the run length in the node's operand slot (presumably the
          // placeholder slot emitted right after the node -- confirm against
          // OpCode._getOperand).
914       if(__program!= null)
915     __program[OpCode._getOperand(offset)] = (char)length;
916       //__emitCode('\0'); // debug
917       __emitCode(CharStringPointer._END_OF_STRING);
918     }
919 
920     return offset;
921   }
922 
923 
924   // Set the bits in a character class.  Only recognizes ascii.
925   private void __setCharacterClassBits(char[] bits, int offset, char deflt,
926                        char ch)
927   {
928     if(__program== null || ch >= 256)
929       return;
930     ch &= 0xffff;
931 
932     if(deflt == 0) {
933       bits[offset + (ch >> 4)] |= (1 << (ch & 0xf));
934     } else {
935       bits[offset + (ch >> 4)] &= ~(1 << (ch & 0xf));
936     }
937   }
938 
939 
940   private int __parseCharacterClass() throws MalformedPatternException {
941     boolean range = false, skipTest;
942     char clss, deflt, lastclss = Character.MAX_VALUE;
943     int offset, bits, numLength[] = { 0 };
944 
945     offset = __emitNode(OpCode._ANYOF);
946 
947     if(__input._getValue() == '^') {
948       ++__cost;
949       __input._increment();
950       deflt = 0;
951     } else {
952       deflt = 0xffff;
953     }
954 
955     bits = __programSize;
956     for(clss = 0; clss < 16; clss++)
957       __emitCode(deflt);
958 
959     clss = __input._getValue();
960 
961     if(clss == ']' || clss == '-')
962       skipTest = true;
963     else
964       skipTest = false;
965 
966     while((!__input._isAtEnd() && (clss = __input._getValue()) != ']')
967       || skipTest) {
968       // It sucks, but we have to make this assignment every time
969       skipTest = false;
970       __input._increment();
971       if(clss == '\\') {
972     clss = __input._postIncrement();
973 
974     switch(clss){
975     case 'w':
976       for(clss = 0; clss < 256; clss++)
977         if(OpCode._isWordCharacter(clss))
978           __setCharacterClassBits(__program, bits, deflt, clss);
979       lastclss = Character.MAX_VALUE;
980       continue;
981     case 'W':
982       for(clss = 0; clss < 256; clss++)
983         if(!OpCode._isWordCharacter(clss))
984           __setCharacterClassBits(__program, bits, deflt, clss);
985       lastclss = Character.MAX_VALUE;
986       continue;
987     case 's':
988       for(clss = 0; clss < 256; clss++)
989         if(Character.isWhitespace(clss))
990           __setCharacterClassBits(__program, bits, deflt, clss);
991       lastclss = Character.MAX_VALUE;
992       continue;
993     case 'S':
994       for(clss = 0; clss < 256; clss++)
995         if(!Character.isWhitespace(clss))
996           __setCharacterClassBits(__program, bits, deflt, clss);
997       lastclss = Character.MAX_VALUE;
998       continue;
999     case 'd':
1000      for(clss = '0'; clss <= '9'; clss++)
1001        __setCharacterClassBits(__program, bits, deflt, clss);
1002      lastclss = Character.MAX_VALUE;
1003      continue;
1004    case 'D':
1005      for(clss = 0; clss < '0'; clss++)
1006        __setCharacterClassBits(__program, bits, deflt, clss);
1007      for(clss = (char)('9' + 1); clss < 256; clss++)
1008        __setCharacterClassBits(__program, bits, deflt, clss);
1009      lastclss = Character.MAX_VALUE;
1010      continue;
1011    case 'n':
1012      clss = '\n';
1013      break;
1014    case 'r':
1015      clss = '\r';
1016      break;
1017    case 't':
1018      clss = '\t';
1019      break;
1020    case 'f':
1021      clss = '\f';
1022      break;
1023    case 'b':
1024      clss = '\b';
1025      break;
1026    case 'e':
1027      clss = '\033';
1028      break;
1029    case 'a':
1030      clss = '\007';
1031      break;
1032    case 'x':
1033      clss = (char)__parseHex(__input._array, __input._getOffset(), 2,
1034                  numLength);
1035      __input._increment(numLength[0]);
1036      break;
1037    case 'c':
1038      clss = __input._postIncrement();
1039      if(Character.isLowerCase(clss))
1040        clss = Character.toUpperCase(clss);
1041      clss ^= 64;
1042      break;
1043    case '0': case '1': case '2': case '3': case '4':
1044    case '5': case '6': case '7': case '8': case '9':
1045      clss = (char)__parseOctal(__input._array, __input._getOffset() - 1,
1046                    3, numLength);
1047      __input._increment(numLength[0] - 1);
1048      break;
1049    }
1050      }
1051
1052      if(range) {
1053    if(lastclss > clss)
1054      throw new MalformedPatternException(
1055             "Invalid [] range in expression.");
1056    range = false;
1057      } else {
1058    lastclss = clss;
1059
1060    if(__input._getValue() == '-' &&
1061       __input._getOffset() + 1 < __input._getLength() &&
1062       __input._getValueRelative(1) != ']') {
1063      __input._increment();
1064      range = true;
1065      continue;
1066    }
1067      }
1068
1069      while(lastclss <= clss) {
1070    __setCharacterClassBits(__program, bits, deflt, lastclss);
1071    if((__modifierFlags[0] & __CASE_INSENSITIVE) != 0 &&
1072       Character.isUpperCase(lastclss))
1073      __setCharacterClassBits(__program, bits, deflt,
1074                 Character.toLowerCase(lastclss));
1075
1076    ++lastclss;
1077      }
1078
1079      lastclss = clss;
1080    }
1081
1082    if(__input._getValue() != ']')
1083      throw new MalformedPatternException("Unmatched [] in expression.");
1084
1085    __getNextChar();
1086
1087    return offset;
1088  }
1089
1090
1091  private int __parseBranch(int[] retFlags) throws MalformedPatternException {
1092    boolean nestCheck = false, handleRepetition = false;
1093    int offset, next, min, max, flags[] = { 0 };
1094    char operator, value;
1095
1096    min = 0;
1097    max = Character.MAX_VALUE;
1098    offset = __parseAtom(flags);
1099
1100    if(offset == OpCode._NULL_OFFSET) {
1101      if((flags[0] & __TRYAGAIN) != 0)
1102    retFlags[0] |= __TRYAGAIN;
1103      return OpCode._NULL_OFFSET;
1104    }
1105
1106    operator = __input._getValue();
1107
1108    if(operator == '(' && __input._getValueRelative(1) == '?' &&
1109       __input._getValueRelative(2) == '#') {
1110      while(operator != CharStringPointer._END_OF_STRING && operator != ')')
1111    operator = __input._increment();
1112
1113      if(operator != CharStringPointer._END_OF_STRING) {
1114    __getNextChar();
1115    operator = __input._getValue();
1116      }
1117    }
1118
1119    if(operator == '{' &&
1120       __parseRepetition(__input._array, __input._getOffset())) {
1121      int maxOffset, pos;
1122
1123      next = __input._getOffset() + 1;
1124      pos = maxOffset = __input._getLength();
1125
1126      value = __input._getValue(next);
1127
1128      while(Character.isDigit(value) || value == ',') {
1129    if(value == ',') {
1130      if(pos != maxOffset)
1131        break;
1132      else
1133        pos = next;
1134    }
1135    ++next;
1136    value = __input._getValue(next);
1137      }
1138
1139      if(value == '}') {
1140    int num;
1141    StringBuffer   buffer = new StringBuffer  (10);
1142
1143    if(pos == maxOffset)
1144      pos = next;
1145    __input._increment();
1146
1147    num = __input._getOffset();
1148    value = __input._getValue(num);
1149
1150    while(Character.isDigit(value)) {
1151      buffer.append(value);
1152      ++num;
1153      value = __input._getValue(num);
1154    }
1155
1156    try {
1157      min = Integer.parseInt(buffer.toString());
1158    } catch(NumberFormatException   e) {
1159      throw new MalformedPatternException(
1160     "Unexpected number format exception.  Please report this bug." +
1161       "NumberFormatException message: " + e.getMessage());
1162    }
1163
1164    value = __input._getValue(pos);
1165    if(value == ',')
1166      ++pos;
1167    else
1168      pos = __input._getOffset();
1169
1170    num = pos;
1171    buffer = new StringBuffer  (10);
1172
1173    value = __input._getValue(num);
1174
1175    while(Character.isDigit(value)){
1176      buffer.append(value);
1177      ++num;
1178      value = __input._getValue(num);
1179    }
1180
1181    try {
1182      if(num != pos)
1183        max = Integer.parseInt(buffer.toString());
1184    } catch(NumberFormatException   e) {
1185      throw new MalformedPatternException(
1186     "Unexpected number format exception.  Please report this bug." +
1187       "NumberFormatException message: " + e.getMessage());
1188    }
1189
1190    //System.err.println("min: " + min + " max: " + max); //debug
1191
1192    if(max == 0 && __input._getValue(pos) != '0')
1193      max = Character.MAX_VALUE;
1194    __input._setOffset(next);
1195    __getNextChar();
1196
1197    //System.err.println("min: " + min + " max: " + max); //debug
1198
1199    nestCheck = true;
1200    handleRepetition = true;
1201      }
1202    }
1203
1204    if(!nestCheck) {
1205      handleRepetition = false;
1206
1207      if(!__isSimpleRepetitionOp(operator)) {
1208    retFlags[0] = flags[0];
1209    return offset;
1210      }
1211
1212      __getNextChar();
1213
1214      retFlags[0] = ((operator != '+') ?
1215          (__WORSTCASE | __SPSTART) : (__WORSTCASE | __NONNULL));
1216
1217      if(operator == '*' && ((flags[0] & __SIMPLE) != 0)) {
1218    __programInsertOperator(OpCode._STAR, offset);
1219    __cost+=4;
1220      } else if(operator == '*') {
1221    min = 0;
1222    handleRepetition = true;
1223      } else if(operator == '+' && (flags[0] & __SIMPLE) != 0) {
1224    __programInsertOperator(OpCode._PLUS, offset);
1225    __cost+=3;
1226      } else if(operator == '+') {
1227    min = 1;
1228    handleRepetition = true;
1229      } else if(operator == '?') {
1230    min = 0;
1231    max = 1;
1232    handleRepetition = true;
1233      }
1234    }
1235
1236    if(handleRepetition) {
1237
1238      // handle repetition
1239      if((flags[0] & __SIMPLE) != 0){
1240    __cost+= ((2 + __cost) / 2);
1241    __programInsertOperator(OpCode._CURLY, offset);
1242      } else {
1243    __cost += (4 + __cost);
1244    __programAddTail(offset, __emitNode(OpCode._WHILEM));
1245    __programInsertOperator(OpCode._CURLYX, offset);
1246    __programAddTail(offset, __emitNode(OpCode._NOTHING));
1247      }
1248
1249      if(min > 0)
1250    retFlags[0] = (__WORSTCASE | __NONNULL);
1251
1252      if(max != 0 && max < min)
1253    throw new MalformedPatternException(
1254       "Invalid interval {" + min + "," + max + "}");
1255
1256      if(__program!= null) {
1257    __program[offset + 2] = (char)min;
1258    __program[offset + 3] = (char)max;
1259      }
1260    }
1261
1262
1263    if(__input._getValue() == '?') {
1264      __getNextChar();
1265      __programInsertOperator(OpCode._MINMOD, offset);
1266      __programAddTail(offset, offset + 2);
1267    }
1268
1269    if(__isComplexRepetitionOp(__input._array, __input._getOffset()))
1270      throw new MalformedPatternException(
1271        "Nested repetitions *?+ in expression");
1272
1273    return offset;
1274  }
1275
1276
1277  private int __parseExpression(boolean isParenthesized, int[] hintFlags)
1278    throws MalformedPatternException {
1279    char value, paren;
1280    int nodeOffset = OpCode._NULL_OFFSET, parenthesisNum = 0, br, ender;
1281    int[] flags = { 0 };
1282    String   modifiers = "iogmsx";
1283
1284
1285    // Initially we assume expression doesn't match null string.
1286    hintFlags[0] = __NONNULL;
1287
1288    if (isParenthesized) {
1289      paren = 1;
1290      if(__input._getValue() == '?') {
1291    __input._increment();
1292    paren = value = __input._postIncrement();
1293
1294    switch(value) {
1295    case ':' :
1296    case '=' :
1297    case '!' : break;
1298    case '#' :
1299      value = __input._getValue();
1300      while(value != CharStringPointer._END_OF_STRING && value != ')')
1301        value = __input._increment();
1302      if(value != ')')
1303        throw new MalformedPatternException(
1304           "Sequence (?#... not terminated");
1305      __getNextChar();
1306      hintFlags[0] = __TRYAGAIN;
1307      return OpCode._NULL_OFFSET;
1308    default :
1309      __input._decrement();
1310      value = __input._getValue();
1311      while(value != CharStringPointer._END_OF_STRING &&
1312        modifiers.indexOf(value) != -1) {
1313        __setModifierFlag(__modifierFlags, value);
1314        value = __input._increment();
1315      }
1316      if(value != ')')
1317        throw new MalformedPatternException(
1318           "Sequence (?" + value + "...) not recognized");
1319      __getNextChar();
1320      hintFlags[0] = __TRYAGAIN;
1321      return OpCode._NULL_OFFSET;
1322    }
1323      } else {
1324    parenthesisNum = __numParentheses;
1325    ++__numParentheses;
1326    nodeOffset = __emitArgNode(OpCode._OPEN, (char)parenthesisNum);
1327      }
1328    } else 
1329      paren = 0;
1330
1331    br = __parseAlternation(flags);
1332
1333    if(br == OpCode._NULL_OFFSET)
1334      return OpCode._NULL_OFFSET;
1335
1336    if(nodeOffset != OpCode._NULL_OFFSET)
1337      __programAddTail(nodeOffset, br);
1338    else
1339      nodeOffset = br;
1340
1341    if((flags[0] & __NONNULL) == 0)
1342      hintFlags[0] &= ~__NONNULL;
1343
1344    hintFlags[0] |= (flags[0] & __SPSTART);
1345
1346    while(__input._getValue() == '|') {
1347      __getNextChar();
1348      br = __parseAlternation(flags);
1349
1350      if(br == OpCode._NULL_OFFSET)
1351    return OpCode._NULL_OFFSET;
1352
1353      __programAddTail(nodeOffset, br);
1354
1355      if((flags[0] & __NONNULL) == 0)
1356    hintFlags[0] &= ~__NONNULL;
1357
1358      hintFlags[0] |= (flags[0] & __SPSTART);
1359    }
1360
1361    switch(paren) {
1362    case ':' :
1363      ender = __emitNode(OpCode._NOTHING);
1364      break;
1365    case 1:
1366      ender = __emitArgNode(OpCode._CLOSE, (char)parenthesisNum);
1367      break;
1368    case '=':
1369    case '!':
1370      ender = __emitNode(OpCode._SUCCEED);
1371      hintFlags[0] &= ~__NONNULL;
1372      break;
1373    case 0  :
1374    default :
1375      ender = __emitNode(OpCode._END);
1376      break;
1377    }
1378
1379    __programAddTail(nodeOffset, ender);
1380
1381    for(br = nodeOffset; br != OpCode._NULL_OFFSET;
1382    br = OpCode._getNext(__program, br))
1383      __programAddOperatorTail(br, ender);
1384
1385    if(paren == '=') {
1386      __programInsertOperator(OpCode._IFMATCH, nodeOffset);
1387      __programAddTail(nodeOffset, __emitNode(OpCode._NOTHING));
1388    } else if(paren == '!') {
1389      __programInsertOperator(OpCode._UNLESSM, nodeOffset);
1390      __programAddTail(nodeOffset, __emitNode(OpCode._NOTHING));
1391    }
1392
1393    if(paren != 0 && (__input._isAtEnd() || __getNextChar() != ')')) {
1394      throw new MalformedPatternException("Unmatched parentheses.");
1395    } else if(paren == 0 && !__input._isAtEnd()) { 
1396      if(__input._getValue() == ')')
1397    throw new MalformedPatternException("Unmatched parentheses.");
1398      else
1399    // Should never happen.
1400    throw new MalformedPatternException(
1401       "Unreached characters at end of expression.  Please report this bug!");
1402    }
1403
1404
1405    return nodeOffset;
1406  }
1407
1408
1409  /**
1410   * Compiles a Perl5 regular expression into a Perl5Pattern instance that
1411   * can be used by a Perl5Matcher object to perform pattern matching.
1412   * Please see the user's guide for more information about Perl5 regular
1413   * expressions.
       * <p>
       * The pattern is parsed twice: once with no program array allocated,
       * which yields the required program size, and once more to fill the
       * array that backs the returned pattern.  The finished program is then
       * scanned to record matching hints on the pattern: a start string or
       * start class, anchoring flags, a literal that must occur, and a
       * minimum match length.
1414   * <p>
1415   * @param pattern  A Perl5 regular expression to compile.
1416   * @param options  A set of flags giving the compiler instructions on
1417   *                 how to treat the regular expression.  The flags
1418   *                 are a logical OR of any number of the five <b>MASK</b>
1419   *                 constants.  For example:
1420   *                 <pre>
1421   * regex =
1422   *   compiler.compile(pattern, Perl5Compiler.
1423   *                    CASE_INSENSITIVE_MASK |
1424   *                    Perl5Compiler.MULTILINE_MASK);
1425   *                 </pre>
1426   *                  This says to compile the pattern so that it treats
1427   *                  input as consisting of multiple lines and to perform
1428   *                  matches in a case insensitive manner.
1429   * @return A Pattern instance constituting the compiled regular expression.
1430   *         This instance will always be a Perl5Pattern and can be reliably
1431   *         cast to a Perl5Pattern.
1432   * @exception MalformedPatternException  If the compiled expression
1433   *  is not a valid Perl5 regular expression, or if the compiled program
       *  would be too large.
1434   */
1435  public Pattern compile(char[] pattern, int options)
1436       throws MalformedPatternException {
1437    int[] flags = { 0 };
1438    int caseInsensitive, scan;
1439    Perl5Pattern regexp;
1440    String   mustString, startString;
1441
1442    int first;
1443    boolean sawOpen = false, sawPlus = false;
1444
1445    StringBuffer   lastLongest, longest;
1446    int length, minLength = 0, curBack, back, backmost;
1447
1448
          // Reset the per-compilation state kept in instance fields.  __program
          // stays null for the first parse.
1449    __input = new CharStringPointer(pattern);
1450
1451    caseInsensitive    = options & __CASE_INSENSITIVE;
1452    __modifierFlags[0] = (char)options;
1453    __sawBackreference = false;
1454    __numParentheses   = 1;
1455    __programSize      = 0;
1456    __cost             = 0;
1457    __program= null;
1458
          // First parse: no program array exists yet, so the useful outputs of
          // this pass are __programSize (used to allocate the array below) and
          // any MalformedPatternException raised for a bad pattern.
1459    __emitCode((char)0);
1460    if(__parseExpression(false, flags) == OpCode._NULL_OFFSET) {
1461      //System.err.println("null -- Size: " + __programSize); // debug
1462      // return null;
1463      throw new MalformedPatternException("Unknown compilation error.");
1464    }
1465
1466    //System.err.println("First Pass Size: " + __programSize); //debug
1467
          // NOTE(review): presumably the limit exists because program offsets
          // are stored in chars -- confirm against OpCode.
1468    if(__programSize >= Character.MAX_VALUE - 1)
1469      throw new MalformedPatternException("Expression is too large.");
1470
1471
1472    __program= new char[__programSize];
1473    regexp = new Perl5Pattern();
1474
1475    regexp._program    = __program;
1476    regexp._expression = new String  (pattern);
1477
          // Rewind the input and reset the counters so the second parse starts
          // from the same state as the first, this time with __program allocated.
1478    __input._setOffset(0);
1479
1480    __numParentheses   = 1;
1481    __programSize      = 0;
1482    __cost             = 0;
1483
1484    __emitCode((char)0);
1485    if(__parseExpression(false, flags) == OpCode._NULL_OFFSET) {
1486      //System.err.println("null -- Size: " + __programSize); //debug 
1487      //return null;
1488      throw new MalformedPatternException("Unknown compilation error.");
1489    }
1490
1491    //System.err.println("Second Pass Size: " + __programSize); //debug
1492
          // Take case-insensitivity from __modifierFlags rather than options:
          // __parseExpression passes __modifierFlags to __setModifierFlag for
          // in-pattern (?...) modifier sequences, so it may differ from options.
1493    caseInsensitive = __modifierFlags[0] & __CASE_INSENSITIVE;
1494
1495    regexp._isExpensive      = (__cost >= 10);
1496    regexp._startClassOffset = OpCode._NULL_OFFSET;
1497    regexp._anchor           = 0;
1498    regexp._back             = -1;
1499    regexp._options          = options;
1500    regexp._startString      = null;
1501    regexp._mustString       = null;
1502    mustString               = null;
1503    startString              = null;
1504
          // The hint analysis only runs when the next-pointer of the node at
          // offset 1 leads to END (presumably: the top level consists of a
          // single alternative -- confirm against the emitted program layout).
1505    scan = 1;
1506    if(__program[OpCode._getNext(__program, scan)] == OpCode._END){
1507      boolean doItAgain;  // bad variables names!
1508      char op;
1509
1510      first = scan = OpCode._getNextOperator(scan);
1511      op = __program[first];
1512
            // Advance 'first' past leading OPEN nodes (noting that one was
            // seen via the assignment inside the condition), a BRANCH that is
            // not followed by another BRANCH, PLUS, MINMOD, and CURLY-type
            // nodes whose first argument is greater than zero.
1513      while((op == OpCode._OPEN && (sawOpen = true)) ||
1514        (op == OpCode._BRANCH &&
1515         __program[OpCode._getNext(__program, first)] != OpCode._BRANCH) ||
1516        op == OpCode._PLUS || op == OpCode._MINMOD ||
1517        (OpCode._opType[op] == OpCode._CURLY && 
1518         OpCode._getArg1(__program, first) > 0)) {
1519    if(op == OpCode._PLUS)
1520      sawPlus = true;
1521    else
              // every op other than PLUS also has its operand skipped
1522      first+=OpCode._operandLength[op];
1523
1524    first = OpCode._getNextOperator(first);
1525    op = __program[first];
1526      }
1527
1528      doItAgain = true;
1529
            // Classify the first remaining node: a literal becomes the start
            // string, a single-character or boundary op becomes the start
            // class, and BOL-type anchors (and an anchored STAR over an
            // ANY-type op) set anchor flags and cause the following node to
            // be examined as well.
1530      while(doItAgain) {
1531    doItAgain = false;
1532    op = __program[first];
1533
1534    if(op == OpCode._EXACTLY) {
1535      startString =
1536        new String  (__program, OpCode._getOperand(first + 1),
1537               __program[OpCode._getOperand(first)]);
1538
1539    } else if(OpCode._isInArray(op, OpCode._opLengthOne, 2))
1540      regexp._startClassOffset = first;
1541    else if(op == OpCode._BOUND || op == OpCode._NBOUND)
1542      regexp._startClassOffset = first;
1543    else if(OpCode._opType[op] == OpCode._BOL) {
1544      regexp._anchor = Perl5Pattern._OPT_ANCH;
1545      first = OpCode._getNextOperator(first);
1546      doItAgain = true;
1547      continue;
1548    } else if(op == OpCode._STAR &&
1549          OpCode._opType[__program[OpCode._getNextOperator(first)]] == 
1550          OpCode._ANY && (regexp._anchor & Perl5Pattern._OPT_ANCH) != 0)
1551      {
1552        regexp._anchor = Perl5Pattern._OPT_ANCH | Perl5Pattern._OPT_IMPLICIT;
1553        first = OpCode._getNextOperator(first);
1554        doItAgain = true;
1555        continue;
1556    }
1557      } // end while do it again
1558
            // _OPT_SKIP is added when a PLUS was stepped over above, unless
            // both a group was opened and a backreference was seen.
1559      if(sawPlus && (!sawOpen || !__sawBackreference))
1560    regexp._anchor |= Perl5Pattern._OPT_SKIP;
1561
1562
1563      //length = OpCode._getNextOperator(first); //debug
1564      // System.err.println("first: " + first + "nextoper: " + length);
1565      //System.err.print("first " + (int)op + " next "); // debug
1566      //if(length >= 0 && length < _program.length) //debug
1567      //System.err.print((int)(__program[length])); //debug
1568      //else  //debug
1569      //System.err.print("out of range"); //debug
1570      //System.err.println(" offset " + (int)(first - scan)); // debug
1571
            // Walk the program from 'scan' to END, accumulating minLength and
            // looking for the longest run of literal text.  lastLongest/length
            // describe the run being built and longest/backmost the best run
            // so far; curBack counts characters consumed so far and back is
            // the value curBack had where the current run began.
            // NOTE(review): -30000 looks like an "offset no longer known"
            // marker for curBack (see the curBack >= 0 test and the clamp of
            // backmost to -1 below) -- confirm.
1572      lastLongest   = new StringBuffer  ();
1573      longest   = new StringBuffer  ();
1574      length    = 0;
1575      minLength = 0;
1576      curBack   = 0;
1577      back   = 0;
1578      backmost   = 0;
1579
1580      while(scan > 0 && (op = __program[scan]) != OpCode._END) {
1581
              // BRANCH followed by another BRANCH: follow next-pointers past
              // the whole run of BRANCH nodes.  A lone BRANCH is stepped into.
1582    if(op == OpCode._BRANCH) {
1583      if(__program[OpCode._getNext(__program, scan)] == OpCode._BRANCH) {
1584        curBack = -30000;
1585        while(__program[scan] == OpCode._BRANCH)
1586          scan = OpCode._getNext(__program, scan);
1587      } else
1588        scan = OpCode._getNextOperator(scan);
1589      continue;
1590    }
1591
              // UNLESSM is jumped over through its next-pointer.
1592    if(op == OpCode._UNLESSM) {
1593      curBack = -30000;
1594      scan = OpCode._getNext(__program, scan);
1595      continue;
1596    }
1597
1598    if(op == OpCode._EXACTLY) {
1599      int temp;
1600
1601      first = scan;
                // step over CLOSE nodes that directly follow the literal
1602      while(__program[(temp = OpCode._getNext(__program, scan))] == 
1603        OpCode._CLOSE)
1604        scan = temp;
1605
1606      minLength += __program[OpCode._getOperand(first)];
1607
                // temp = length of this literal, read from its operand
1608      temp = __program[OpCode._getOperand(first)];
1609
                // Three cases: the literal directly continues the current run
                // (append it), it is long enough to start a new run, or it
                // only advances curBack.
1610      if(curBack - back == length) {
1611        lastLongest.append(new String  (__program, OpCode._getOperand(first) + 1,
1612                      temp));
1613        length  += temp;
1614        curBack += temp;
1615        first = OpCode._getNext(__program, scan);
1616      } else if(temp >= (length + (curBack >= 0 ? 1 : 0))) {
1617        length = temp;
1618        lastLongest =
1619          new StringBuffer  (new String  (__program,
1620                      OpCode._getOperand(first) + 1, temp));
1621        back = curBack;
1622        curBack += length;
1623        first = OpCode._getNext(__program, scan);
1624      } else
1625        curBack += temp;
1626    } else if(OpCode._isInArray(op, OpCode._opLengthVaries, 0)) {
                // Variable-length op: close the current run, keeping it if it
                // is the longest so far.
1627      curBack = -30000;
1628      length = 0;
1629
1630      if(lastLongest.length() > longest.length()) {
1631        longest = lastLongest;
1632        backmost = back;
1633      }
1634
1635      lastLongest = new StringBuffer  ();
1636
                // PLUS over a single-character op adds one to the minimum
                // length; a CURLY-type op over one adds its first argument.
1637      if(op == OpCode._PLUS && 
1638         OpCode._isInArray(__program[OpCode._getNextOperator(scan)],
1639                OpCode._opLengthOne, 0))
1640        ++minLength;
1641      else if(OpCode._opType[op] == OpCode._CURLY &&
1642          OpCode._isInArray(__program[OpCode._getNextOperator(scan) + 2],
1643                 OpCode._opLengthOne, 0))
1644        minLength += OpCode._getArg1(__program, scan);
1645    } else if(OpCode._isInArray(op, OpCode._opLengthOne, 0)) {
                // Single-character op: counts one character and closes the run.
1646      ++curBack;
1647      ++minLength;
1648      length = 0;
1649      if(lastLongest.length() > longest.length()) {
1650        longest = lastLongest;
1651        backmost = back;
1652      }
1653      lastLongest = new StringBuffer  ();
1654    }
1655
1656    scan = OpCode._getNext(__program, scan);
1657      } // end while
1658
            // Consider the run still open at the end of the scan; it gets a
            // bonus of one when 'first' points at an EOL-type op.
1659      if(lastLongest.length() +
1660     ((OpCode._opType[__program[first]] == OpCode._EOL) ? 1 : 0) >
1661     longest.length()) {
1662    longest = lastLongest;
1663    backmost = back;
1664      } else
1665    lastLongest = new StringBuffer  ();
1666
            // The required literal is only recorded when no start string was
            // found; negative offsets are stored as -1.
1667      if(longest.length() > 0 && startString == null) {
1668    mustString = longest.toString();
1669    if(backmost < 0)
1670      backmost = -1;
1671    regexp._back = backmost;
1672
1673    /*
1674
1675      if(longest.length() > 
1676      (((caseInsensitive & __CASE_INSENSITIVE) != 0 ||
1677      OpCode._opType[__program[first]] == OpCode._EOL)
1678      ? 1 : 0))
1679      */        
1680      } else
1681    longest = null;
1682    } // end if
1683
1684
          // Publish the results on the pattern.  _numParentheses is one less
          // than the counter because the counter starts at 1.
1685    regexp._isCaseInsensitive = ((caseInsensitive & __CASE_INSENSITIVE) != 0);
1686    regexp._numParentheses  = __numParentheses - 1;
1687    regexp._minLength       = minLength;
1688
1689    if(mustString != null) {
1690      regexp._mustString = mustString.toCharArray();
1691      regexp._mustUtility = 100;
1692    }
1693
1694    if(startString != null)
1695      regexp._startString = startString.toCharArray();
1696
1697    return regexp;
1698  }
1699
1700  /**
1701   * Same as calling <b>compile(pattern, Perl5Compiler.DEFAULT_MASK);</b>
1702   * <p>
1703   * @param pattern  A regular expression to compile.
1704   * @return A Pattern instance constituting the compiled regular expression.
1705   *         This instance will always be a Perl5Pattern and can be reliably
1706   *         casted to a Perl5Pattern.
1707   * @exception MalformedPatternException  If the compiled expression
1708   *  is not a valid Perl5 regular expression.
1709   */
1710  public Pattern compile(char[] pattern) throws MalformedPatternException {
1711     return compile(pattern, DEFAULT_MASK);
1712  }
1713
1714
1715  /**
1716   * Same as calling <b>compile(pattern, Perl5Compiler.DEFAULT_MASK);</b>
1717   * <p>
1718   * @param pattern  A regular expression to compile.
1719   * @return A Pattern instance constituting the compiled regular expression.
1720   *         This instance will always be a Perl5Pattern and can be reliably
1721   *         casted to a Perl5Pattern.
1722   * @exception MalformedPatternException  If the compiled expression
1723   *  is not a valid Perl5 regular expression.
1724   */
1725  public Pattern compile(String   pattern) throws MalformedPatternException {
1726     return compile(pattern.toCharArray(), DEFAULT_MASK);
1727  }
1728
1729
1730  /**
1731   * Compiles a Perl5 regular expression into a Perl5Pattern instance that
1732   * can be used by a Perl5Matcher object to perform pattern matching.
1733   * Please see the user's guide for more information about Perl5 regular
1734   * expressions.
1735   * <p>
1736   * @param pattern  A Perl5 regular expression to compile.
1737   * @param options  A set of flags giving the compiler instructions on
1738   *                 how to treat the regular expression.  The flags
1739   *                 are a logical OR of any number of the five <b>MASK</b>
1740   *                 constants.  For example:
1741   *                 <pre>
1742   * regex =
1743   *   compiler.compile("^\\w+\\d+$",
1744   *                    Perl5Compiler.CASE_INSENSITIVE_MASK |
1745   *                    Perl5Compiler.MULTILINE_MASK);
1746   *                 </pre>
1747   *                  This says to compile the pattern so that it treats
1748   *                  input as consisting of multiple lines and to perform
1749   *                  matches in a case insensitive manner.
1750   * @return A Pattern instance constituting the compiled regular expression.
1751   *         This instance will always be a Perl5Pattern and can be reliably
1752   *         casted to a Perl5Pattern.
1753   * @exception MalformedPatternException  If the compiled expression
1754   *  is not a valid Perl5 regular expression.
1755   */
1756  public Pattern compile(String   pattern, int options)
1757       throws MalformedPatternException {
1758     return compile(pattern.toCharArray(), options);
1759  }
1760
1761}
1762
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags