KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > oro > text > regex > Perl5Compiler


1 package org.apache.oro.text.regex;
2
3 /* ====================================================================
4  * The Apache Software License, Version 1.1
5  *
6  * Copyright (c) 2000 The Apache Software Foundation. All rights
7  * reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  * notice, this list of conditions and the following disclaimer.
15  *
16  * 2. Redistributions in binary form must reproduce the above copyright
17  * notice, this list of conditions and the following disclaimer in
18  * the documentation and/or other materials provided with the
19  * distribution.
20  *
21  * 3. The end-user documentation included with the redistribution,
22  * if any, must include the following acknowledgment:
23  * "This product includes software developed by the
24  * Apache Software Foundation (http://www.apache.org/)."
25  * Alternately, this acknowledgment may appear in the software itself,
26  * if and wherever such third-party acknowledgments normally appear.
27  *
28  * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro"
29  * must not be used to endorse or promote products derived from this
30  * software without prior written permission. For written
31  * permission, please contact apache@apache.org.
32  *
33  * 5. Products derived from this software may not be called "Apache"
34  * or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their
35  * name, without prior written permission of the Apache Software Foundation.
36  *
37  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
38  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
39  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
40  * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
41  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
44  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
46  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
47  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48  * SUCH DAMAGE.
49  * ====================================================================
50  *
51  * This software consists of voluntary contributions made by many
52  * individuals on behalf of the Apache Software Foundation. For more
53  * information on the Apache Software Foundation, please see
54  * <http://www.apache.org/>.
55  *
56  * Portions of this software are based upon software originally written
57  * by Daniel F. Savarese. We appreciate his contributions.
58  */

59
60 /**
61  * The Perl5Compiler class is used to create compiled regular expressions
62  * conforming to the Perl5 regular expression syntax. It generates
63  * Perl5Pattern instances upon compilation to be used in conjunction
64  * with a Perl5Matcher instance. Please see the user's guide for more
65  * information about Perl5 regular expressions.
66
67  @author <a HREF="dfs@savarese.org">Daniel F. Savarese</a>
68  @version $Id: Perl5Compiler.java,v 1.1.1.1 2000/07/23 23:08:52 jon Exp $
69
70  * @see PatternCompiler
71  * @see MalformedPatternException
72  * @see Perl5Pattern
73  * @see Perl5Matcher
74  */

75
76 public final class Perl5Compiler implements PatternCompiler {
// Flag bits exchanged between the parse methods through their int[]
// out-parameters (retFlags/flags).
private static final int __WORSTCASE = 0, __NONNULL = 0x1, __SIMPLE = 0x2,
                         __SPSTART = 0x4, __TRYAGAIN = 0x8;

// Modifier bits kept in __modifierFlags[0].  The public *_MASK constants
// are defined in terms of these values, and __setModifierFlag maps the
// modifier letters i, g, o, m, s and x onto them.
private static final char
  __CASE_INSENSITIVE = 0x0001,
  __GLOBAL           = 0x0002,
  __KEEP             = 0x0004,
  __MULTILINE        = 0x0008,
  __SINGLELINE       = 0x0010,
  __EXTENDED         = 0x0020,
  __READ_ONLY        = 0x8000;

private static final String __META_CHARS = "^$.[()|?+*\\";

// Lookup table used by __parseHex: the value of a digit is its index in
// this string masked with 15, so both letter cases map to 10..15.  The
// trailing 'x' sits at index 32 and therefore counts as a digit of value 0.
private static final String __HEX_DIGIT =
  "0123456789abcdef0123456789ABCDEFx";

private CharStringPointer __input;
private boolean __sawBackreference;
private char[] __modifierFlags = { 0 };

// IMPORTANT: __numParentheses starts out equal to 1 during compilation.
// It is always one greater than the number of parentheses encountered
// so far in the regex.  That is because it refers to the number of groups
// to save, and the entire match is always saved (group 0)
private int __numParentheses, __programSize, __cost;

// When doing the second pass and actually generating code, __programSize
// keeps track of the current offset.
private char[] __program;
105
106   /**
107    * The default mask for the {@link #compile compile} methods.
108    * It is equal to 0.
109    * The default behavior is for a regular expression to be case sensitive
110    * and to not specify if it is multiline or singleline. When MULITLINE_MASK
111    * and SINGLINE_MASK are not defined, the <b>^</b>, <b>$</b>, and <b>.</b>
112    * metacharacters are
113    * interpreted according to the value of isMultiline() in Perl5Matcher.
114    * The default behavior of Perl5Matcher is to treat the Perl5Pattern
115    * as though MULTILINE_MASK were enabled. If isMultiline() returns false,
116    * then the pattern is treated as though SINGLINE_MASK were set. However,
117    * compiling a pattern with the MULTILINE_MASK or SINGLELINE_MASK masks
118    * will ALWAYS override whatever behavior is specified by the setMultiline()
119    * in Perl5Matcher.
120    */

121   public static final int DEFAULT_MASK = 0;
122
123   /**
124    * A mask passed as an option to the {@link #compile compile} methods
125    * to indicate a compiled regular expression should be case insensitive.
126    */

127   public static final int CASE_INSENSITIVE_MASK = __CASE_INSENSITIVE;
128
129   /**
130    * A mask passed as an option to the {@link #compile compile} methods
131    * to indicate a compiled regular expression should treat input as having
132    * multiple lines. This option affects the interpretation of
133    * the <b>^</b> and <b>$</b> metacharacters. When this mask is used,
134    * the <b>^</b> metacharacter matches at the beginning of every line,
135    * and the <b>$</b> metacharacter matches at the end of every line.
136    * Additionally the <b> . </b> metacharacter will not match newlines when
137    * an expression is compiled with <b> MULTILINE_MASK </b>, which is its
138    * default behavior.
139    * The <b>SINGLELINE_MASK</b> and <b>MULTILINE_MASK</b> should not be
140    * used together.
141    */

142   public static final int MULTILINE_MASK = __MULTILINE;
143
144   /**
145    * A mask passed as an option to the {@link #compile compile} methods
146    * to indicate a compiled regular expression should treat input as being
147    * a single line. This option affects the interpretation of
148    * the <b>^</b> and <b>$</b> metacharacters. When this mask is used,
149    * the <b>^</b> metacharacter matches at the beginning of the input,
150    * and the <b>$</b> metacharacter matches at the end of the input.
151    * The <b>^</b> and <b>$</b> metacharacters will not match at the beginning
152    * and end of lines occurring between the begnning and end of the input.
153    * Additionally, the <b> . </b> metacharacter will match newlines when
154    * an expression is compiled with <b> SINGLELINE_MASK </b>, unlike its
155    * default behavior.
156    * The <b>SINGLELINE_MASK</b> and <b>MULTILINE_MASK</b> should not be
157    * used together.
158    */

159   public static final int SINGLELINE_MASK = __SINGLELINE;
160
161   /**
162    * A mask passed as an option to the {@link #compile compile} methods
163    * to indicate a compiled regular expression should be treated as a Perl5
164    * extended pattern (i.e., a pattern using the <b>/x</b> modifier). This
165    * option tells the compiler to ignore whitespace that is not backslashed or
166    * within a character class. It also tells the compiler to treat the
167    * <b>#</b> character as a metacharacter introducing a comment as in
168    * Perl. In other words, the <b>#</b> character will comment out any
169    * text in the regular expression between it and the next newline.
170    * The intent of this option is to allow you to divide your patterns
171    * into more readable parts. It is provided to maintain compatibility
172    * with Perl5 regular expressions, although it will not often
173    * make sense to use it in Java.
174    */

175   public static final int EXTENDED_MASK = __EXTENDED;
176
177   /**
178    * A mask passed as an option to the {@link #compile compile} methods
179    * to indicate that the resulting Perl5Pattern should be treated as a
180    * read only data structure by Perl5Matcher, making it safe to share
181    * a single Perl5Pattern instance among multiple threads without needing
182    * synchronization. Without this option, Perl5Matcher reserves the right
183    * to store heuristic or other information in Perl5Pattern that might
184    * accelerate future matches. When you use this option, Perl5Matcher will
185    * not store or modify any information in a Perl5Pattern. Use this option
186    * when you want to share a Perl5Pattern instance among multiple threads
187    * using different Perl5Matcher instances.
188    */

189   public static final int READ_ONLY_MASK = __READ_ONLY;
190
191   /**
192    * Given a character string, returns a Perl5 expression that interprets
193    * each character of the original string literally. In other words, all
194    * special metacharacters are quoted/escaped. This method is useful for
195    * converting user input meant for literal interpretation into a safe
196    * regular expression representing the literal input.
197    * <p>
198    * In effect, this method is the analog of the Perl5 quotemeta() builtin
199    * method.
200    * <p>
201    * @param expression The expression to convert.
202    * @return A String containing a Perl5 regular expression corresponding to
203    * a literal interpretation of the pattern.
204    */

205   public static final String JavaDoc quotemeta(char[] expression) {
206     int ch;
207     StringBuffer JavaDoc buffer;
208
209     buffer = new StringBuffer JavaDoc(2*expression.length);
210     for(ch = 0; ch < expression.length; ch++) {
211       if(!OpCode._isWordCharacter(expression[ch]))
212     buffer.append('\\');
213       buffer.append(expression[ch]);
214     }
215
216     return buffer.toString();
217   }
218
219   /**
220    * Given a character string, returns a Perl5 expression that interprets
221    * each character of the original string literally. In other words, all
222    * special metacharacters are quoted/escaped. This method is useful for
223    * converting user input meant for literal interpretation into a safe
224    * regular expression representing the literal input.
225    * <p>
226    * In effect, this method is the analog of the Perl5 quotemeta() builtin
227    * method.
228    * <p>
229    * @param pattern The pattern to convert.
230    * @return A String containing a Perl5 regular expression corresponding to
231    * a literal interpretation of the pattern.
232    */

233   public static final String JavaDoc quotemeta(String JavaDoc expression) {
234     return quotemeta(expression.toCharArray());
235   }
236
237   private static boolean __isSimpleRepetitionOp(char ch) {
238     return (ch == '*' || ch == '+' || ch == '?');
239   }
240
241   private static boolean __isComplexRepetitionOp(char[] ch, int offset) {
242     if(offset < ch.length && offset >= 0)
243        return (ch[offset] == '*' || ch[offset] == '+' || ch[offset] == '?'
244            || (ch[offset] == '{' && __parseRepetition(ch, offset)));
245     return false;
246   }
247
248   // determines if {\d+,\d*} is the next part of the string
249
private static boolean __parseRepetition(char[] str, int offset) {
250     if(str[offset] != '{')
251       return false;
252     ++offset;
253
254     if(offset >= str.length || !Character.isDigit(str[offset]))
255       return false;
256
257     while(offset < str.length && Character.isDigit(str[offset]))
258       ++offset;
259
260     if(offset < str.length && str[offset] == ',')
261       ++offset;
262
263     while(offset < str.length && Character.isDigit(str[offset]))
264       ++offset;
265
266     if(offset >= str.length || str[offset] != '}')
267       return false;
268
269     return true;
270   }
271
272   private static int __parseHex(char[] str, int offset, int maxLength,
273                 int[] scanned)
274   {
275     int val = 0, index;
276
277     scanned[0] = 0;
278     while(offset < str.length && maxLength-- > 0 &&
279       (index = __HEX_DIGIT.indexOf(str[offset])) != -1) {
280       val <<= 4;
281       val |= (index & 15);
282       ++offset;
283       ++scanned[0];
284     }
285
286     return val;
287   }
288
289   private static int __parseOctal(char[] str, int offset, int maxLength,
290                  int[] scanned)
291   {
292     int val = 0, index;
293
294     scanned[0] = 0;
295     while(offset < str.length &&
296       maxLength > 0 && str[offset] >= '0' && str[offset] <= '7') {
297       val <<= 3;
298       val |= (str[offset] - '0');
299       --maxLength;
300       ++offset;
301       ++scanned[0];
302     }
303
304     return val;
305   }
306
307   private static void __setModifierFlag(char[] flags, char ch) {
308     switch(ch) {
309     case 'i' : flags[0] |= __CASE_INSENSITIVE; return;
310     case 'g' : flags[0] |= __GLOBAL; return;
311     case 'o' : flags[0] |= __KEEP; return;
312     case 'm' : flags[0] |= __MULTILINE; return;
313     case 's' : flags[0] |= __SINGLELINE; return;
314     case 'x' : flags[0] |= __EXTENDED; return;
315     }
316   }
317
318   // Emit a specific character code.
319
private void __emitCode(char code) {
320
321     if(__program != null)
322       __program[__programSize] = code;
323
324     ++__programSize;
325   }
326
327
328   // Emit an operator with no arguments.
329
// Return an offset into the __program array as a pointer to node.
330
private int __emitNode(char operator) {
331     int offset;
332
333     offset = __programSize;
334
335     if(__program == null)
336       __programSize+=2;
337     else {
338       __program[__programSize++] = operator;
339       __program[__programSize++] = OpCode._NULL_POINTER;
340     }
341
342     return offset;
343   }
344
345
346   // Emit an operator with arguments.
347
// Return an offset into the __programarray as a pointer to node.
348
private int __emitArgNode(char operator, char arg) {
349     int offset;
350
351     offset = __programSize;
352
353     if(__program== null)
354       __programSize+=3;
355     else {
356       __program[__programSize++] = operator;
357       __program[__programSize++] = OpCode._NULL_POINTER;
358       __program[__programSize++] = arg;
359     }
360
361     return offset;
362   }
363
364
365   // Insert an operator at a given offset.
366
private void __programInsertOperator(char operator, int operand) {
367     int src, dest, offset;
368
369     offset = (OpCode._opType[operator] == OpCode._CURLY ? 2 : 0);
370
371
372     if(__program== null) {
373       __programSize+=(2 + offset);
374       return;
375     }
376
377     src = __programSize;
378     __programSize+=(2 + offset);
379     dest = __programSize;
380
381     while(src > operand) {
382       --src;
383       --dest;
384       __program[dest] = __program[src];
385     }
386
387     __program[operand++] = operator;
388     __program[operand++] = OpCode._NULL_POINTER;
389
390     while(offset-- > 0)
391       __program[operand++] = OpCode._NULL_POINTER;
392
393   }
394
395
396
397   private void __programAddTail(int current, int value) {
398     int scan, temp, offset;
399
400     if(__program== null || current == OpCode._NULL_OFFSET)
401       return;
402
403     scan = current;
404
405     while(true) {
406       temp = OpCode._getNext(__program, scan);
407       if(temp == OpCode._NULL_OFFSET)
408     break;
409       scan = temp;
410     }
411
412     if(__program[scan] == OpCode._BACK)
413       offset = scan - value;
414     else
415       offset = value - scan;
416
417     __program[scan + 1] = (char)offset;
418   }
419
420
421   private void __programAddOperatorTail(int current, int value) {
422     if(__program== null || current == OpCode._NULL_OFFSET ||
423        OpCode._opType[__program[current]] != OpCode._BRANCH)
424       return;
425     __programAddTail(OpCode._getNextOperator(current), value);
426   }
427
428
429   private char __getNextChar() {
430     char ret, value;
431
432     ret = __input._postIncrement();
433
434     while(true) {
435       value = __input._getValue();
436
437       if(value == '(' && __input._getValueRelative(1) == '?' &&
438      __input._getValueRelative(2) == '#') {
439     // Skip comments
440
while(value != CharStringPointer._END_OF_STRING && value != ')')
441       value = __input._increment();
442     __input._increment();
443     continue;
444       }
445
446       if((__modifierFlags[0] & __EXTENDED) != 0) {
447     if(Character.isWhitespace(value)) {
448       __input._increment();
449       continue;
450     } else if(value == '#') {
451       while(value != CharStringPointer._END_OF_STRING && value != '\n')
452         value = __input._increment();
453       __input._increment();
454       continue;
455     }
456       }
457
458       // System.err.println("next: " + ret + " last: " + __input._getValue()); // debug
459

460
461       return ret;
462     }
463
464   }
465
466
467   private int __parseAlternation(int[] retFlags)
468     throws MalformedPatternException
469   {
470     int chain, offset, latest;
471     int flags = 0;
472     char value;
473
474     retFlags[0] = __WORSTCASE;
475
476     offset = __emitNode(OpCode._BRANCH);
477
478     chain = OpCode._NULL_OFFSET;
479
480     if(__input._getOffset() == 0) {
481       __input._setOffset(-1);
482       __getNextChar();
483     } else {
484       __input._decrement();
485       __getNextChar();
486     }
487
488     value = __input._getValue();
489
490     while(value != CharStringPointer._END_OF_STRING &&
491       value != '|' && value != ')') {
492       flags &= ~__TRYAGAIN;
493       latest = __parseBranch(retFlags);
494
495       if(latest == OpCode._NULL_OFFSET) {
496     if((flags & __TRYAGAIN) != 0){
497       value = __input._getValue();
498       continue;
499     }
500     return OpCode._NULL_OFFSET;
501       }
502
503       retFlags[0] |= (flags & __NONNULL);
504
505       if(chain == OpCode._NULL_OFFSET)
506     retFlags[0] |= (flags & __SPSTART);
507       else {
508     ++__cost;
509     __programAddTail(chain, latest);
510       }
511       chain = latest;
512       value = __input._getValue();
513     }
514
515     // If loop was never entered.
516
if(chain == OpCode._NULL_OFFSET)
517       __emitNode(OpCode._NOTHING);
518
519     return offset;
520   }
521
522
523   private int __parseAtom(int[] retFlags) throws MalformedPatternException {
524     boolean doDefault;
525     char value;
526     int offset, flags[] = { 0 };
527     
528     
529     retFlags[0] = __WORSTCASE;
530     doDefault = false;
531     offset = OpCode._NULL_OFFSET;
532
533   tryAgain:
534     while(true) {
535
536       value = __input._getValue();
537
538       switch(value) {
539       case '^' :
540     __getNextChar();
541     // The order here is important in order to support /ms.
542
// /m takes precedence over /s for ^ and $, but not for .
543
if((__modifierFlags[0] & __MULTILINE) != 0)
544       offset = __emitNode(OpCode._MBOL);
545     else if((__modifierFlags[0] & __SINGLELINE) != 0)
546       offset = __emitNode(OpCode._SBOL);
547     else
548       offset = __emitNode(OpCode._BOL);
549     break tryAgain;
550
551       case '$':
552     __getNextChar();
553     // The order here is important in order to support /ms.
554
// /m takes precedence over /s for ^ and $, but not for .
555
if((__modifierFlags[0] & __MULTILINE) != 0)
556       offset = __emitNode(OpCode._MEOL);
557     else if((__modifierFlags[0] & __SINGLELINE) != 0)
558       offset = __emitNode(OpCode._SEOL);
559     else
560       offset = __emitNode(OpCode._EOL);
561     break tryAgain;
562
563       case '.':
564     __getNextChar();
565     // The order here is important in order to support /ms.
566
// /m takes precedence over /s for ^ and $, but not for .
567
if((__modifierFlags[0] & __SINGLELINE) != 0)
568       offset = __emitNode(OpCode._SANY);
569     else
570       offset = __emitNode(OpCode._ANY);
571     ++__cost;
572     retFlags[0] |= (__NONNULL | __SIMPLE);
573     break tryAgain;
574
575       case '[':
576     __input._increment();
577     offset = __parseCharacterClass();
578     retFlags[0] |= (__NONNULL | __SIMPLE);
579     break tryAgain;
580
581       case '(':
582     __getNextChar();
583     offset = __parseExpression(true, flags);
584     if(offset == OpCode._NULL_OFFSET) {
585       if((flags[0] & __TRYAGAIN) != 0)
586         continue tryAgain;
587       return OpCode._NULL_OFFSET;
588     }
589     retFlags[0] |= (flags[0] & (__NONNULL | __SPSTART));
590     break tryAgain;
591
592       case '|':
593       case ')':
594     if((flags[0] & __TRYAGAIN) != 0) {
595       retFlags[0] |= __TRYAGAIN;
596       return OpCode._NULL_OFFSET;
597     }
598
599     throw new MalformedPatternException("Error in expression at " +
600                    __input._toString(__input._getOffset()));
601     //break tryAgain;
602

603       case '?':
604       case '+':
605       case '*':
606     throw new MalformedPatternException(
607                  "?+* follows nothing in expression");
608     //break tryAgain;
609

610       case '\\':
611     value = __input._increment();
612
613     switch(value) {
614     case 'A' :
615       offset = __emitNode(OpCode._SBOL);
616       retFlags[0] |= __SIMPLE;
617       __getNextChar();
618       break;
619     case 'G':
620       offset = __emitNode(OpCode._GBOL);
621       retFlags[0] |= __SIMPLE;
622       __getNextChar();
623       break;
624     case 'Z':
625       offset = __emitNode(OpCode._SEOL);
626       retFlags[0] |= __SIMPLE;
627       __getNextChar();
628       break;
629     case 'w':
630       offset = __emitNode(OpCode._ALNUM);
631       retFlags[0] |= (__NONNULL | __SIMPLE);
632       __getNextChar();
633       break;
634     case 'W':
635       offset = __emitNode(OpCode._NALNUM);
636       retFlags[0] |= (__NONNULL | __SIMPLE);
637       __getNextChar();
638       break;
639     case 'b':
640       offset = __emitNode(OpCode._BOUND);
641       retFlags[0] |= __SIMPLE;
642       __getNextChar();
643       break;
644     case 'B':
645       offset = __emitNode(OpCode._NBOUND);
646       retFlags[0] |= __SIMPLE;
647       __getNextChar();
648       break;
649     case 's':
650       offset = __emitNode(OpCode._SPACE);
651       retFlags[0] |= (__NONNULL | __SIMPLE);
652       __getNextChar();
653       break;
654     case 'S':
655       offset = __emitNode(OpCode._NSPACE);
656       retFlags[0] |= (__NONNULL | __SIMPLE);
657       __getNextChar();
658       break;
659     case 'd':
660       offset = __emitNode(OpCode._DIGIT);
661       retFlags[0] |= (__NONNULL | __SIMPLE);
662       __getNextChar();
663       break;
664     case 'D':
665       offset = __emitNode(OpCode._NDIGIT);
666       retFlags[0] |= (__NONNULL | __SIMPLE);
667       __getNextChar();
668       break;
669     case 'n': case 'r': case 't': case 'f': case 'e': case 'a': case 'x':
670     case 'c': case '0':
671       doDefault = true;
672       break tryAgain;
673     case '1': case '2': case '3': case '4': case '5': case '6': case '7':
674     case '8': case '9':
675       int num;
676       StringBuffer JavaDoc buffer = new StringBuffer JavaDoc(10);
677
678       num = 0;
679       value = __input._getValueRelative(num);
680
681       while(Character.isDigit(value)) {
682         buffer.append(value);
683         ++num;
684         value = __input._getValueRelative(num);
685       }
686
687       try {
688         num = Integer.parseInt(buffer.toString());
689       } catch(NumberFormatException JavaDoc e) {
690         throw new MalformedPatternException(
691        "Unexpected number format exception. Please report this bug." +
692        "NumberFormatException message: " + e.getMessage());
693       }
694
695       if(num > 9 && num >= __numParentheses) {
696         doDefault = true;
697         break tryAgain;
698       } else {
699         // A backreference may only occur AFTER its group
700
if(num >= __numParentheses)
701           throw new MalformedPatternException("Invalid backreference: \\" +
702                           num);
703         __sawBackreference = true;
704         offset = __emitArgNode(OpCode._REF, (char)num);
705         retFlags[0] |= __NONNULL;
706
707         value = __input._getValue();
708         while(Character.isDigit(value))
709           value = __input._increment();
710
711         __input._decrement();
712         __getNextChar();
713       }
714       break;
715     case '\0':
716     case CharStringPointer._END_OF_STRING:
717       if(__input._isAtEnd())
718         throw new
719           MalformedPatternException("Trailing \\ in expression.");
720       // fall through to default
721
default:
722       doDefault = true;
723       break tryAgain;
724     }
725     break tryAgain;
726
727       case '#':
728     // skip over comments
729
if((__modifierFlags[0] & __EXTENDED) != 0) {
730       while(!__input._isAtEnd() && __input._getValue() != '\n')
731         __input._increment();
732       if(!__input._isAtEnd())
733         continue tryAgain;
734     }
735     // fall through to default
736
default:
737     __input._increment();
738     doDefault = true;
739     break tryAgain;
740       }// end master switch
741
} // end tryAgain
742

743
744     if(doDefault) {
745       char ender;
746       int length, pOffset, maxOffset, lastOffset, numLength[];
747
748       offset = __emitNode(OpCode._EXACTLY);
749       // Not sure that it's ok to use 0 to mark end.
750
//__emitCode((char)0);
751
__emitCode((char)CharStringPointer._END_OF_STRING);
752
753     forLoop:
754       for(length = 0, pOffset = __input._getOffset() - 1,
755         maxOffset = __input._getLength();
756       length < 127 && pOffset < maxOffset; ++length) {
757
758     lastOffset = pOffset;
759     value = __input._getValue(pOffset);
760
761     switch(value) {
762     case '^': case '$': case '.': case '[': case '(': case ')':
763     case '|':
764       break forLoop;
765     case '\\':
766       value = __input._getValue(++pOffset);
767
768       switch(value) {
769       case 'A': case 'G': case 'Z': case 'w': case 'W': case 'b':
770       case 'B': case 's': case 'S': case 'd': case 'D':
771         --pOffset;
772         break forLoop;
773       case 'n':
774         ender = '\n';
775         ++pOffset;
776         break;
777       case 'r':
778         ender = '\r';
779         ++pOffset;
780         break;
781       case 't':
782         ender = '\t';
783         ++pOffset;
784         break;
785       case 'f':
786         ender = '\f';
787         ++pOffset;
788         break;
789       case 'e':
790         ender = '\033';
791         ++pOffset;
792         break;
793       case 'a':
794         ender = '\007';
795         ++pOffset;
796         break;
797       case 'x':
798         numLength = new int[1];
799         ender = (char)__parseHex(__input._array, ++pOffset, 2, numLength);
800         pOffset+=numLength[0];
801         break;
802       case 'c':
803         ++pOffset;
804         ender = __input._getValue(pOffset++);
805         if(Character.isLowerCase(ender))
806           ender = Character.toUpperCase(ender);
807         ender ^= 64;
808         break;
809       case '0': case '1': case '2': case'3': case '4': case '5':
810       case '6': case '7': case '8': case '9':
811         boolean doOctal = false;
812         value = __input._getValue(pOffset);
813
814         if(value == '0')
815           doOctal = true;
816         value = __input._getValue(pOffset + 1);
817
818         if(Character.isDigit(value)) {
819           int num;
820           StringBuffer JavaDoc buffer = new StringBuffer JavaDoc(10);
821
822           num = pOffset;
823           value = __input._getValue(num);
824
825           while(Character.isDigit(value)){
826         buffer.append(value);
827         ++num;
828         value = __input._getValue(num);
829           }
830
831           try {
832         num = Integer.parseInt(buffer.toString());
833           } catch(NumberFormatException JavaDoc e) {
834         throw new MalformedPatternException(
835          "Unexpected number format exception. Please report this bug." +
836          "NumberFormatException message: " + e.getMessage());
837           }
838
839           if(!doOctal)
840         doOctal = (num >= __numParentheses);
841         }
842
843         if(doOctal) {
844           numLength = new int[1];
845           ender = (char)__parseOctal(__input._array, pOffset, 3, numLength);
846           pOffset+=numLength[0];
847         } else {
848           --pOffset;
849           break forLoop;
850         }
851         break;
852
853       case CharStringPointer._END_OF_STRING:
854       case '\0':
855         if(pOffset >= maxOffset)
856           throw new
857         MalformedPatternException("Trailing \\ in expression.");
858         // fall through to default
859
default:
860         ender = __input._getValue(pOffset++);
861         break;
862       } // end backslash switch
863
break;
864
865     case '#':
866       if((__modifierFlags[0] & __EXTENDED) != 0) {
867         while(pOffset < maxOffset && __input._getValue(pOffset) != '\n')
868           ++pOffset;
869       }
870       // fall through to whitespace handling
871
case ' ': case '\t': case '\n': case '\r': case '\f': case '\013':
872       if((__modifierFlags[0] & __EXTENDED) != 0) {
873         ++pOffset;
874         --length;
875         continue;
876       }
877       // fall through to default
878
default:
879       ender = __input._getValue(pOffset++);
880       break;
881
882     } // end master switch
883

884     if((__modifierFlags[0] & __CASE_INSENSITIVE) != 0 &&
885        Character.isUpperCase(ender))
886       ender = Character.toLowerCase(ender);
887
888     if(pOffset < maxOffset && __isComplexRepetitionOp(__input._array, pOffset)) {
889       if(length > 0)
890         pOffset = lastOffset;
891       else {
892         ++length;
893         __emitCode(ender);
894       }
895       break;
896     }
897
898     __emitCode(ender);
899
900
901       } // end for loop
902

903
904       __input._setOffset(pOffset - 1);
905       __getNextChar();
906
907       if(length < 0)
908     throw new MalformedPatternException(
909          "Unexpected compilation failure. Please report this bug!");
910       if(length > 0)
911     retFlags[0] |= __NONNULL;
912       if(length == 1)
913     retFlags[0] |= __SIMPLE;
914       if(__program!= null)
915     __program[OpCode._getOperand(offset)] = (char)length;
916       //__emitCode('\0'); // debug
917
__emitCode(CharStringPointer._END_OF_STRING);
918     }
919
920     return offset;
921   }
922
923
924   // Set the bits in a character class. Only recognizes ascii.
925
private void __setCharacterClassBits(char[] bits, int offset, char deflt,
926                        char ch)
927   {
928     if(__program== null || ch >= 256)
929       return;
930     ch &= 0xffff;
931
932     if(deflt == 0) {
933       bits[offset + (ch >> 4)] |= (1 << (ch & 0xf));
934     } else {
935       bits[offset + (ch >> 4)] &= ~(1 << (ch & 0xf));
936     }
937   }
938
939
940   private int __parseCharacterClass() throws MalformedPatternException {
941     boolean range = false, skipTest;
942     char clss, deflt, lastclss = Character.MAX_VALUE;
943     int offset, bits, numLength[] = { 0 };
944
945     offset = __emitNode(OpCode._ANYOF);
946
947     if(__input._getValue() == '^') {
948       ++__cost;
949       __input._increment();
950       deflt = 0;
951     } else {
952       deflt = 0xffff;
953     }
954
955     bits = __programSize;
956     for(clss = 0; clss < 16; clss++)
957       __emitCode(deflt);
958
959     clss = __input._getValue();
960
961     if(clss == ']' || clss == '-')
962       skipTest = true;
963     else
964       skipTest = false;
965
966     while((!__input._isAtEnd() && (clss = __input._getValue()) != ']')
967       || skipTest) {
968       // It sucks, but we have to make this assignment every time
969
skipTest = false;
970       __input._increment();
971       if(clss == '\\') {
972     clss = __input._postIncrement();
973
974     switch(clss){
975     case 'w':
976       for(clss = 0; clss < 256; clss++)
977         if(OpCode._isWordCharacter(clss))
978           __setCharacterClassBits(__program, bits, deflt, clss);
979       lastclss = Character.MAX_VALUE;
980       continue;
981     case 'W':
982       for(clss = 0; clss < 256; clss++)
983         if(!OpCode._isWordCharacter(clss))
984           __setCharacterClassBits(__program, bits, deflt, clss);
985       lastclss = Character.MAX_VALUE;
986       continue;
987     case 's':
988       for(clss = 0; clss < 256; clss++)
989         if(Character.isWhitespace(clss))
990           __setCharacterClassBits(__program, bits, deflt, clss);
991       lastclss = Character.MAX_VALUE;
992       continue;
993     case 'S':
994       for(clss = 0; clss < 256; clss++)
995         if(!Character.isWhitespace(clss))
996           __setCharacterClassBits(__program, bits, deflt, clss);
997       lastclss = Character.MAX_VALUE;
998       continue;
999     case 'd':
1000      for(clss = '0'; clss <= '9'; clss++)
1001        __setCharacterClassBits(__program, bits, deflt, clss);
1002      lastclss = Character.MAX_VALUE;
1003      continue;
1004    case 'D':
1005      for(clss = 0; clss < '0'; clss++)
1006        __setCharacterClassBits(__program, bits, deflt, clss);
1007      for(clss = (char)('9' + 1); clss < 256; clss++)
1008        __setCharacterClassBits(__program, bits, deflt, clss);
1009      lastclss = Character.MAX_VALUE;
1010      continue;
1011    case 'n':
1012      clss = '\n';
1013      break;
1014    case 'r':
1015      clss = '\r';
1016      break;
1017    case 't':
1018      clss = '\t';
1019      break;
1020    case 'f':
1021      clss = '\f';
1022      break;
1023    case 'b':
1024      clss = '\b';
1025      break;
1026    case 'e':
1027      clss = '\033';
1028      break;
1029    case 'a':
1030      clss = '\007';
1031      break;
1032    case 'x':
1033      clss = (char)__parseHex(__input._array, __input._getOffset(), 2,
1034                  numLength);
1035      __input._increment(numLength[0]);
1036      break;
1037    case 'c':
1038      clss = __input._postIncrement();
1039      if(Character.isLowerCase(clss))
1040        clss = Character.toUpperCase(clss);
1041      clss ^= 64;
1042      break;
1043    case '0': case '1': case '2': case '3': case '4':
1044    case '5': case '6': case '7': case '8': case '9':
1045      clss = (char)__parseOctal(__input._array, __input._getOffset() - 1,
1046                    3, numLength);
1047      __input._increment(numLength[0] - 1);
1048      break;
1049    }
1050      }
1051
1052      if(range) {
1053    if(lastclss > clss)
1054      throw new MalformedPatternException(
1055             "Invalid [] range in expression.");
1056    range = false;
1057      } else {
1058    lastclss = clss;
1059
1060    if(__input._getValue() == '-' &&
1061       __input._getOffset() + 1 < __input._getLength() &&
1062       __input._getValueRelative(1) != ']') {
1063      __input._increment();
1064      range = true;
1065      continue;
1066    }
1067      }
1068
1069      while(lastclss <= clss) {
1070    __setCharacterClassBits(__program, bits, deflt, lastclss);
1071    if((__modifierFlags[0] & __CASE_INSENSITIVE) != 0 &&
1072       Character.isUpperCase(lastclss))
1073      __setCharacterClassBits(__program, bits, deflt,
1074                 Character.toLowerCase(lastclss));
1075
1076    ++lastclss;
1077      }
1078
1079      lastclss = clss;
1080    }
1081
1082    if(__input._getValue() != ']')
1083      throw new MalformedPatternException("Unmatched [] in expression.");
1084
1085    __getNextChar();
1086
1087    return offset;
1088  }
1089
1090
1091  private int __parseBranch(int[] retFlags) throws MalformedPatternException {
1092    boolean nestCheck = false, handleRepetition = false;
1093    int offset, next, min, max, flags[] = { 0 };
1094    char operator, value;
1095
1096    min = 0;
1097    max = Character.MAX_VALUE;
1098    offset = __parseAtom(flags);
1099
1100    if(offset == OpCode._NULL_OFFSET) {
1101      if((flags[0] & __TRYAGAIN) != 0)
1102    retFlags[0] |= __TRYAGAIN;
1103      return OpCode._NULL_OFFSET;
1104    }
1105
1106    operator = __input._getValue();
1107
1108    if(operator == '(' && __input._getValueRelative(1) == '?' &&
1109       __input._getValueRelative(2) == '#') {
1110      while(operator != CharStringPointer._END_OF_STRING && operator != ')')
1111    operator = __input._increment();
1112
1113      if(operator != CharStringPointer._END_OF_STRING) {
1114    __getNextChar();
1115    operator = __input._getValue();
1116      }
1117    }
1118
1119    if(operator == '{' &&
1120       __parseRepetition(__input._array, __input._getOffset())) {
1121      int maxOffset, pos;
1122
1123      next = __input._getOffset() + 1;
1124      pos = maxOffset = __input._getLength();
1125
1126      value = __input._getValue(next);
1127
1128      while(Character.isDigit(value) || value == ',') {
1129    if(value == ',') {
1130      if(pos != maxOffset)
1131        break;
1132      else
1133        pos = next;
1134    }
1135    ++next;
1136    value = __input._getValue(next);
1137      }
1138
1139      if(value == '}') {
1140    int num;
1141    StringBuffer JavaDoc buffer = new StringBuffer JavaDoc(10);
1142
1143    if(pos == maxOffset)
1144      pos = next;
1145    __input._increment();
1146
1147    num = __input._getOffset();
1148    value = __input._getValue(num);
1149
1150    while(Character.isDigit(value)) {
1151      buffer.append(value);
1152      ++num;
1153      value = __input._getValue(num);
1154    }
1155
1156    try {
1157      min = Integer.parseInt(buffer.toString());
1158    } catch(NumberFormatException JavaDoc e) {
1159      throw new MalformedPatternException(
1160     "Unexpected number format exception. Please report this bug." +
1161       "NumberFormatException message: " + e.getMessage());
1162    }
1163
1164    value = __input._getValue(pos);
1165    if(value == ',')
1166      ++pos;
1167    else
1168      pos = __input._getOffset();
1169
1170    num = pos;
1171    buffer = new StringBuffer JavaDoc(10);
1172
1173    value = __input._getValue(num);
1174
1175    while(Character.isDigit(value)){
1176      buffer.append(value);
1177      ++num;
1178      value = __input._getValue(num);
1179    }
1180
1181    try {
1182      if(num != pos)
1183        max = Integer.parseInt(buffer.toString());
1184    } catch(NumberFormatException JavaDoc e) {
1185      throw new MalformedPatternException(
1186     "Unexpected number format exception. Please report this bug." +
1187       "NumberFormatException message: " + e.getMessage());
1188    }
1189
1190    //System.err.println("min: " + min + " max: " + max); //debug
1191

1192    if(max == 0 && __input._getValue(pos) != '0')
1193      max = Character.MAX_VALUE;
1194    __input._setOffset(next);
1195    __getNextChar();
1196
1197    //System.err.println("min: " + min + " max: " + max); //debug
1198

1199    nestCheck = true;
1200    handleRepetition = true;
1201      }
1202    }
1203
1204    if(!nestCheck) {
1205      handleRepetition = false;
1206
1207      if(!__isSimpleRepetitionOp(operator)) {
1208    retFlags[0] = flags[0];
1209    return offset;
1210      }
1211
1212      __getNextChar();
1213
1214      retFlags[0] = ((operator != '+') ?
1215          (__WORSTCASE | __SPSTART) : (__WORSTCASE | __NONNULL));
1216
1217      if(operator == '*' && ((flags[0] & __SIMPLE) != 0)) {
1218    __programInsertOperator(OpCode._STAR, offset);
1219    __cost+=4;
1220      } else if(operator == '*') {
1221    min = 0;
1222    handleRepetition = true;
1223      } else if(operator == '+' && (flags[0] & __SIMPLE) != 0) {
1224    __programInsertOperator(OpCode._PLUS, offset);
1225    __cost+=3;
1226      } else if(operator == '+') {
1227    min = 1;
1228    handleRepetition = true;
1229      } else if(operator == '?') {
1230    min = 0;
1231    max = 1;
1232    handleRepetition = true;
1233      }
1234    }
1235
1236    if(handleRepetition) {
1237
1238      // handle repetition
1239
if((flags[0] & __SIMPLE) != 0){
1240    __cost+= ((2 + __cost) / 2);
1241    __programInsertOperator(OpCode._CURLY, offset);
1242      } else {
1243    __cost += (4 + __cost);
1244    __programAddTail(offset, __emitNode(OpCode._WHILEM));
1245    __programInsertOperator(OpCode._CURLYX, offset);
1246    __programAddTail(offset, __emitNode(OpCode._NOTHING));
1247      }
1248
1249      if(min > 0)
1250    retFlags[0] = (__WORSTCASE | __NONNULL);
1251
1252      if(max != 0 && max < min)
1253    throw new MalformedPatternException(
1254       "Invalid interval {" + min + "," + max + "}");
1255
1256      if(__program!= null) {
1257    __program[offset + 2] = (char)min;
1258    __program[offset + 3] = (char)max;
1259      }
1260    }
1261
1262
1263    if(__input._getValue() == '?') {
1264      __getNextChar();
1265      __programInsertOperator(OpCode._MINMOD, offset);
1266      __programAddTail(offset, offset + 2);
1267    }
1268
1269    if(__isComplexRepetitionOp(__input._array, __input._getOffset()))
1270      throw new MalformedPatternException(
1271        "Nested repetitions *?+ in expression");
1272
1273    return offset;
1274  }
1275
1276
1277  private int __parseExpression(boolean isParenthesized, int[] hintFlags)
1278    throws MalformedPatternException {
1279    char value, paren;
1280    int nodeOffset = OpCode._NULL_OFFSET, parenthesisNum = 0, br, ender;
1281    int[] flags = { 0 };
1282    String JavaDoc modifiers = "iogmsx";
1283
1284
1285    // Initially we assume expression doesn't match null string.
1286
hintFlags[0] = __NONNULL;
1287
1288    if (isParenthesized) {
1289      paren = 1;
1290      if(__input._getValue() == '?') {
1291    __input._increment();
1292    paren = value = __input._postIncrement();
1293
1294    switch(value) {
1295    case ':' :
1296    case '=' :
1297    case '!' : break;
1298    case '#' :
1299      value = __input._getValue();
1300      while(value != CharStringPointer._END_OF_STRING && value != ')')
1301        value = __input._increment();
1302      if(value != ')')
1303        throw new MalformedPatternException(
1304           "Sequence (?#... not terminated");
1305      __getNextChar();
1306      hintFlags[0] = __TRYAGAIN;
1307      return OpCode._NULL_OFFSET;
1308    default :
1309      __input._decrement();
1310      value = __input._getValue();
1311      while(value != CharStringPointer._END_OF_STRING &&
1312        modifiers.indexOf(value) != -1) {
1313        __setModifierFlag(__modifierFlags, value);
1314        value = __input._increment();
1315      }
1316      if(value != ')')
1317        throw new MalformedPatternException(
1318           "Sequence (?" + value + "...) not recognized");
1319      __getNextChar();
1320      hintFlags[0] = __TRYAGAIN;
1321      return OpCode._NULL_OFFSET;
1322    }
1323      } else {
1324    parenthesisNum = __numParentheses;
1325    ++__numParentheses;
1326    nodeOffset = __emitArgNode(OpCode._OPEN, (char)parenthesisNum);
1327      }
1328    } else
1329      paren = 0;
1330
1331    br = __parseAlternation(flags);
1332
1333    if(br == OpCode._NULL_OFFSET)
1334      return OpCode._NULL_OFFSET;
1335
1336    if(nodeOffset != OpCode._NULL_OFFSET)
1337      __programAddTail(nodeOffset, br);
1338    else
1339      nodeOffset = br;
1340
1341    if((flags[0] & __NONNULL) == 0)
1342      hintFlags[0] &= ~__NONNULL;
1343
1344    hintFlags[0] |= (flags[0] & __SPSTART);
1345
1346    while(__input._getValue() == '|') {
1347      __getNextChar();
1348      br = __parseAlternation(flags);
1349
1350      if(br == OpCode._NULL_OFFSET)
1351    return OpCode._NULL_OFFSET;
1352
1353      __programAddTail(nodeOffset, br);
1354
1355      if((flags[0] & __NONNULL) == 0)
1356    hintFlags[0] &= ~__NONNULL;
1357
1358      hintFlags[0] |= (flags[0] & __SPSTART);
1359    }
1360
1361    switch(paren) {
1362    case ':' :
1363      ender = __emitNode(OpCode._NOTHING);
1364      break;
1365    case 1:
1366      ender = __emitArgNode(OpCode._CLOSE, (char)parenthesisNum);
1367      break;
1368    case '=':
1369    case '!':
1370      ender = __emitNode(OpCode._SUCCEED);
1371      hintFlags[0] &= ~__NONNULL;
1372      break;
1373    case 0 :
1374    default :
1375      ender = __emitNode(OpCode._END);
1376      break;
1377    }
1378
1379    __programAddTail(nodeOffset, ender);
1380
1381    for(br = nodeOffset; br != OpCode._NULL_OFFSET;
1382    br = OpCode._getNext(__program, br))
1383      __programAddOperatorTail(br, ender);
1384
1385    if(paren == '=') {
1386      __programInsertOperator(OpCode._IFMATCH, nodeOffset);
1387      __programAddTail(nodeOffset, __emitNode(OpCode._NOTHING));
1388    } else if(paren == '!') {
1389      __programInsertOperator(OpCode._UNLESSM, nodeOffset);
1390      __programAddTail(nodeOffset, __emitNode(OpCode._NOTHING));
1391    }
1392
1393    if(paren != 0 && (__input._isAtEnd() || __getNextChar() != ')')) {
1394      throw new MalformedPatternException("Unmatched parentheses.");
1395    } else if(paren == 0 && !__input._isAtEnd()) {
1396      if(__input._getValue() == ')')
1397    throw new MalformedPatternException("Unmatched parentheses.");
1398      else
1399    // Should never happen.
1400
throw new MalformedPatternException(
1401       "Unreached characters at end of expression. Please report this bug!");
1402    }
1403
1404
1405    return nodeOffset;
1406  }
1407
1408
1409  /**
1410   * Compiles a Perl5 regular expression into a Perl5Pattern instance that
1411   * can be used by a Perl5Matcher object to perform pattern matching.
1412   * Please see the user's guide for more information about Perl5 regular
1413   * expressions.
1414   * <p>
1415   * @param pattern A Perl5 regular expression to compile.
1416   * @param options A set of flags giving the compiler instructions on
1417   * how to treat the regular expression. The flags
1418   * are a logical OR of any number of the five <b>MASK</b>
1419   * constants. For example:
1420   * <pre>
1421   * regex =
1422   * compiler.compile(pattern, Perl5Compiler.
1423   * CASE_INSENSITIVE_MASK |
1424   * Perl5Compiler.MULTILINE_MASK);
1425   * </pre>
1426   * This says to compile the pattern so that it treats
1427   * input as consisting of multiple lines and to perform
1428   * matches in a case insensitive manner.
1429   * @return A Pattern instance constituting the compiled regular expression.
1430   * This instance will always be a Perl5Pattern and can be reliably
1431   * casted to a Perl5Pattern.
1432   * @exception MalformedPatternException If the compiled expression
1433   * is not a valid Perl5 regular expression.
1434   */

1435  public Pattern compile(char[] pattern, int options)
1436       throws MalformedPatternException {
1437    int[] flags = { 0 };
1438    int caseInsensitive, scan;
1439    Perl5Pattern regexp;
1440    String JavaDoc mustString, startString;
1441
1442    int first;
1443    boolean sawOpen = false, sawPlus = false;
1444
1445    StringBuffer JavaDoc lastLongest, longest;
1446    int length, minLength = 0, curBack, back, backmost;
1447
1448
1449    __input = new CharStringPointer(pattern);
1450
1451    caseInsensitive = options & __CASE_INSENSITIVE;
1452    __modifierFlags[0] = (char)options;
1453    __sawBackreference = false;
1454    __numParentheses = 1;
1455    __programSize = 0;
1456    __cost = 0;
1457    __program= null;
1458
1459    __emitCode((char)0);
1460    if(__parseExpression(false, flags) == OpCode._NULL_OFFSET) {
1461      //System.err.println("null -- Size: " + __programSize); // debug
1462
// return null;
1463
throw new MalformedPatternException("Unknown compilation error.");
1464    }
1465
1466    //System.err.println("First Pass Size: " + __programSize); //debug
1467

1468    if(__programSize >= Character.MAX_VALUE - 1)
1469      throw new MalformedPatternException("Expression is too large.");
1470
1471
1472    __program= new char[__programSize];
1473    regexp = new Perl5Pattern();
1474
1475    regexp._program = __program;
1476    regexp._expression = new String JavaDoc(pattern);
1477
1478    __input._setOffset(0);
1479
1480    __numParentheses = 1;
1481    __programSize = 0;
1482    __cost = 0;
1483
1484    __emitCode((char)0);
1485    if(__parseExpression(false, flags) == OpCode._NULL_OFFSET) {
1486      //System.err.println("null -- Size: " + __programSize); //debug
1487
//return null;
1488
throw new MalformedPatternException("Unknown compilation error.");
1489    }
1490
1491    //System.err.println("Second Pass Size: " + __programSize); //debug
1492

1493    caseInsensitive = __modifierFlags[0] & __CASE_INSENSITIVE;
1494
1495    regexp._isExpensive = (__cost >= 10);
1496    regexp._startClassOffset = OpCode._NULL_OFFSET;
1497    regexp._anchor = 0;
1498    regexp._back = -1;
1499    regexp._options = options;
1500    regexp._startString = null;
1501    regexp._mustString = null;
1502    mustString = null;
1503    startString = null;
1504
1505    scan = 1;
1506    if(__program[OpCode._getNext(__program, scan)] == OpCode._END){
1507      boolean doItAgain; // bad variables names!
1508
char op;
1509
1510      first = scan = OpCode._getNextOperator(scan);
1511      op = __program[first];
1512
1513      while((op == OpCode._OPEN && (sawOpen = true)) ||
1514        (op == OpCode._BRANCH &&
1515         __program[OpCode._getNext(__program, first)] != OpCode._BRANCH) ||
1516        op == OpCode._PLUS || op == OpCode._MINMOD ||
1517        (OpCode._opType[op] == OpCode._CURLY &&
1518         OpCode._getArg1(__program, first) > 0)) {
1519    if(op == OpCode._PLUS)
1520      sawPlus = true;
1521    else
1522      first+=OpCode._operandLength[op];
1523
1524    first = OpCode._getNextOperator(first);
1525    op = __program[first];
1526      }
1527
1528      doItAgain = true;
1529
1530      while(doItAgain) {
1531    doItAgain = false;
1532    op = __program[first];
1533
1534    if(op == OpCode._EXACTLY) {
1535      startString =
1536        new String JavaDoc(__program, OpCode._getOperand(first + 1),
1537               __program[OpCode._getOperand(first)]);
1538
1539    } else if(OpCode._isInArray(op, OpCode._opLengthOne, 2))
1540      regexp._startClassOffset = first;
1541    else if(op == OpCode._BOUND || op == OpCode._NBOUND)
1542      regexp._startClassOffset = first;
1543    else if(OpCode._opType[op] == OpCode._BOL) {
1544      regexp._anchor = Perl5Pattern._OPT_ANCH;
1545      first = OpCode._getNextOperator(first);
1546      doItAgain = true;
1547      continue;
1548    } else if(op == OpCode._STAR &&
1549          OpCode._opType[__program[OpCode._getNextOperator(first)]] ==
1550          OpCode._ANY && (regexp._anchor & Perl5Pattern._OPT_ANCH) != 0)
1551      {
1552        regexp._anchor = Perl5Pattern._OPT_ANCH | Perl5Pattern._OPT_IMPLICIT;
1553        first = OpCode._getNextOperator(first);
1554        doItAgain = true;
1555        continue;
1556    }
1557      } // end while do it again
1558

1559      if(sawPlus && (!sawOpen || !__sawBackreference))
1560    regexp._anchor |= Perl5Pattern._OPT_SKIP;
1561
1562
1563      //length = OpCode._getNextOperator(first); //debug
1564
// System.err.println("first: " + first + "nextoper: " + length);
1565
//System.err.print("first " + (int)op + " next "); // debug
1566
//if(length >= 0 && length < _program.length) //debug
1567
//System.err.print((int)(__program[length])); //debug
1568
//else //debug
1569
//System.err.print("out of range"); //debug
1570
//System.err.println(" offset " + (int)(first - scan)); // debug
1571

1572      lastLongest = new StringBuffer JavaDoc();
1573      longest = new StringBuffer JavaDoc();
1574      length = 0;
1575      minLength = 0;
1576      curBack = 0;
1577      back = 0;
1578      backmost = 0;
1579
1580      while(scan > 0 && (op = __program[scan]) != OpCode._END) {
1581
1582    if(op == OpCode._BRANCH) {
1583      if(__program[OpCode._getNext(__program, scan)] == OpCode._BRANCH) {
1584        curBack = -30000;
1585        while(__program[scan] == OpCode._BRANCH)
1586          scan = OpCode._getNext(__program, scan);
1587      } else
1588        scan = OpCode._getNextOperator(scan);
1589      continue;
1590    }
1591
1592    if(op == OpCode._UNLESSM) {
1593      curBack = -30000;
1594      scan = OpCode._getNext(__program, scan);
1595      continue;
1596    }
1597
1598    if(op == OpCode._EXACTLY) {
1599      int temp;
1600
1601      first = scan;
1602      while(__program[(temp = OpCode._getNext(__program, scan))] ==
1603        OpCode._CLOSE)
1604        scan = temp;
1605
1606      minLength += __program[OpCode._getOperand(first)];
1607
1608      temp = __program[OpCode._getOperand(first)];
1609
1610      if(curBack - back == length) {
1611        lastLongest.append(new String JavaDoc(__program, OpCode._getOperand(first) + 1,
1612                      temp));
1613        length += temp;
1614        curBack += temp;
1615        first = OpCode._getNext(__program, scan);
1616      } else if(temp >= (length + (curBack >= 0 ? 1 : 0))) {
1617        length = temp;
1618        lastLongest =
1619          new StringBuffer JavaDoc(new String JavaDoc(__program,
1620                      OpCode._getOperand(first) + 1, temp));
1621        back = curBack;
1622        curBack += length;
1623        first = OpCode._getNext(__program, scan);
1624      } else
1625        curBack += temp;
1626    } else if(OpCode._isInArray(op, OpCode._opLengthVaries, 0)) {
1627      curBack = -30000;
1628      length = 0;
1629
1630      if(lastLongest.length() > longest.length()) {
1631        longest = lastLongest;
1632        backmost = back;
1633      }
1634
1635      lastLongest = new StringBuffer JavaDoc();
1636
1637      if(op == OpCode._PLUS &&
1638         OpCode._isInArray(__program[OpCode._getNextOperator(scan)],
1639                OpCode._opLengthOne, 0))
1640        ++minLength;
1641      else if(OpCode._opType[op] == OpCode._CURLY &&
1642          OpCode._isInArray(__program[OpCode._getNextOperator(scan) + 2],
1643                 OpCode._opLengthOne, 0))
1644        minLength += OpCode._getArg1(__program, scan);
1645    } else if(OpCode._isInArray(op, OpCode._opLengthOne, 0)) {
1646      ++curBack;
1647      ++minLength;
1648      length = 0;
1649      if(lastLongest.length() > longest.length()) {
1650        longest = lastLongest;
1651        backmost = back;
1652      }
1653      lastLongest = new StringBuffer JavaDoc();
1654    }
1655
1656    scan = OpCode._getNext(__program, scan);
1657      } // end while
1658

1659      if(lastLongest.length() +
1660     ((OpCode._opType[__program[first]] == OpCode._EOL) ? 1 : 0) >
1661     longest.length()) {
1662    longest = lastLongest;
1663    backmost = back;
1664      } else
1665    lastLongest = new StringBuffer JavaDoc();
1666
1667      if(longest.length() > 0 && startString == null) {
1668    mustString = longest.toString();
1669    if(backmost < 0)
1670      backmost = -1;
1671    regexp._back = backmost;
1672
1673    /*
1674
1675      if(longest.length() >
1676      (((caseInsensitive & __CASE_INSENSITIVE) != 0 ||
1677      OpCode._opType[__program[first]] == OpCode._EOL)
1678      ? 1 : 0))
1679      */

1680      } else
1681    longest = null;
1682    } // end if
1683

1684
1685    regexp._isCaseInsensitive = ((caseInsensitive & __CASE_INSENSITIVE) != 0);
1686    regexp._numParentheses = __numParentheses - 1;
1687    regexp._minLength = minLength;
1688
1689    if(mustString != null) {
1690      regexp._mustString = mustString.toCharArray();
1691      regexp._mustUtility = 100;
1692    }
1693
1694    if(startString != null)
1695      regexp._startString = startString.toCharArray();
1696
1697    return regexp;
1698  }
1699
1700  /**
1701   * Same as calling <b>compile(pattern, Perl5Compiler.DEFAULT_MASK);</b>
1702   * <p>
1703   * @param pattern A regular expression to compile.
1704   * @return A Pattern instance constituting the compiled regular expression.
1705   * This instance will always be a Perl5Pattern and can be reliably
1706   * casted to a Perl5Pattern.
1707   * @exception MalformedPatternException If the compiled expression
1708   * is not a valid Perl5 regular expression.
1709   */

1710  public Pattern compile(char[] pattern) throws MalformedPatternException {
1711     return compile(pattern, DEFAULT_MASK);
1712  }
1713
1714
1715  /**
1716   * Same as calling <b>compile(pattern, Perl5Compiler.DEFAULT_MASK);</b>
1717   * <p>
1718   * @param pattern A regular expression to compile.
1719   * @return A Pattern instance constituting the compiled regular expression.
1720   * This instance will always be a Perl5Pattern and can be reliably
1721   * casted to a Perl5Pattern.
1722   * @exception MalformedPatternException If the compiled expression
1723   * is not a valid Perl5 regular expression.
1724   */

1725  public Pattern compile(String JavaDoc pattern) throws MalformedPatternException {
1726     return compile(pattern.toCharArray(), DEFAULT_MASK);
1727  }
1728
1729
1730  /**
1731   * Compiles a Perl5 regular expression into a Perl5Pattern instance that
1732   * can be used by a Perl5Matcher object to perform pattern matching.
1733   * Please see the user's guide for more information about Perl5 regular
1734   * expressions.
1735   * <p>
1736   * @param pattern A Perl5 regular expression to compile.
1737   * @param options A set of flags giving the compiler instructions on
1738   * how to treat the regular expression. The flags
1739   * are a logical OR of any number of the five <b>MASK</b>
1740   * constants. For example:
1741   * <pre>
1742   * regex =
1743   * compiler.compile("^\\w+\\d+$",
1744   * Perl5Compiler.CASE_INSENSITIVE_MASK |
1745   * Perl5Compiler.MULTILINE_MASK);
1746   * </pre>
1747   * This says to compile the pattern so that it treats
1748   * input as consisting of multiple lines and to perform
1749   * matches in a case insensitive manner.
1750   * @return A Pattern instance constituting the compiled regular expression.
1751   * This instance will always be a Perl5Pattern and can be reliably
1752   * casted to a Perl5Pattern.
1753   * @exception MalformedPatternException If the compiled expression
1754   * is not a valid Perl5 regular expression.
1755   */

1756  public Pattern compile(String JavaDoc pattern, int options)
1757       throws MalformedPatternException {
1758     return compile(pattern.toCharArray(), options);
1759  }
1760
1761}
1762
Popular Tags