KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > oro > text > regex > Perl5Compiler


1 package org.apache.oro.text.regex;
2
3 /* ====================================================================
4  * The Apache Software License, Version 1.1
5  *
6  * Copyright (c) 2000 The Apache Software Foundation. All rights
7  * reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  * notice, this list of conditions and the following disclaimer.
15  *
16  * 2. Redistributions in binary form must reproduce the above copyright
17  * notice, this list of conditions and the following disclaimer in
18  * the documentation and/or other materials provided with the
19  * distribution.
20  *
21  * 3. The end-user documentation included with the redistribution,
22  * if any, must include the following acknowledgment:
23  * "This product includes software developed by the
24  * Apache Software Foundation (http://www.apache.org/)."
25  * Alternately, this acknowledgment may appear in the software itself,
26  * if and wherever such third-party acknowledgments normally appear.
27  *
28  * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro"
29  * must not be used to endorse or promote products derived from this
30  * software without prior written permission. For written
31  * permission, please contact apache@apache.org.
32  *
33  * 5. Products derived from this software may not be called "Apache"
34  * or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their
35  * name, without prior written permission of the Apache Software Foundation.
36  *
37  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
38  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
39  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
40  * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
41  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
42  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
43  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
44  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
46  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
47  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48  * SUCH DAMAGE.
49  * ====================================================================
50  *
51  * This software consists of voluntary contributions made by many
52  * individuals on behalf of the Apache Software Foundation. For more
53  * information on the Apache Software Foundation, please see
54  * <http://www.apache.org/>.
55  *
56  * Portions of this software are based upon software originally written
57  * by Daniel F. Savarese. We appreciate his contributions.
58  */

59
60 /**
61  * The Perl5Compiler class is used to create compiled regular expressions
62  * conforming to the Perl5 regular expression syntax. It generates
63  * Perl5Pattern instances upon compilation to be used in conjunction
64  * with a Perl5Matcher instance. Please see the user's guide for more
65  * information about Perl5 regular expressions.
66
67  @author <a HREF="dfs@savarese.org">Daniel F. Savarese</a>
68  @version $Id: Perl5Compiler.java,v 1.1.1.1 2000/07/23 23:08:52 jon Exp $
69
70  * @see PatternCompiler
71  * @see MalformedPatternException
72  * @see Perl5Pattern
73  * @see Perl5Matcher
74  */

75
76 public final class Perl5Compiler implements PatternCompiler {
// Flag bits reported through the retFlags/flags out-parameters of the
// parse methods. __WORSTCASE is the empty set; the rest are single bits
// that are OR'ed together.
private static final int __WORSTCASE = 0, __NONNULL = 0x1, __SIMPLE = 0x2,
                         __SPSTART = 0x4, __TRYAGAIN = 0x8;

// Modifier bits kept in __modifierFlags[0]. __setModifierFlag maps the
// letters i, g, o, m, s and x onto the first six; __READ_ONLY is the
// value behind READ_ONLY_MASK.
private static final char
  __CASE_INSENSITIVE = 0x0001,
  __GLOBAL = 0x0002,
  __KEEP = 0x0004,
  __MULTILINE = 0x0008,
  __SINGLELINE = 0x0010,
  __EXTENDED = 0x0020,
  __READ_ONLY = 0x8000;

private static final String __META_CHARS = "^$.[()|?+*\\";

// Lookup string for __parseHex: the index of a character, masked with 15,
// is its value. Lower- and upper-case digits therefore share values, and
// the trailing 'x' (index 32) maps to 0.
private static final String __HEX_DIGIT =
  "0123456789abcdef0123456789ABCDEFx";

// Read position within the pattern being compiled.
private CharStringPointer __input;

// Set to true once a backreference (_REF) node has been emitted.
private boolean __sawBackreference;

// Single-element holder for the modifier bits listed above.
private char[] __modifierFlags = { 0 };

// IMPORTANT: __numParentheses starts out equal to 1 during compilation.
// It is always one greater than the number of parentheses encountered
// so far in the regex. That is because it refers to the number of groups
// to save, and the entire match is always saved (group 0)
private int __numParentheses, __programSize, __cost;

// When doing the second pass and actually generating code, __programSize
// keeps track of the current offset. While __program is null the emit
// methods only advance __programSize and write nothing.
private char[] __program;
105
106   /**
107    * The default mask for the {@link #compile compile} methods.
108    * It is equal to 0.
109    * The default behavior is for a regular expression to be case sensitive
110    * and to not specify if it is multiline or singleline. When MULTILINE_MASK
111    * and SINGLELINE_MASK are not defined, the <b>^</b>, <b>$</b>, and <b>.</b>
112    * metacharacters are
113    * interpreted according to the value of isMultiline() in Perl5Matcher.
114    * The default behavior of Perl5Matcher is to treat the Perl5Pattern
115    * as though MULTILINE_MASK were enabled. If isMultiline() returns false,
116    * then the pattern is treated as though SINGLELINE_MASK were set. However,
117    * compiling a pattern with the MULTILINE_MASK or SINGLELINE_MASK masks
118    * will ALWAYS override whatever behavior is specified by the setMultiline()
119    * in Perl5Matcher.
120    */

121   public static final int DEFAULT_MASK = 0;
122
123   /**
124    * A mask passed as an option to the {@link #compile compile} methods
125    * to indicate a compiled regular expression should be case insensitive.
       * Its value is the private __CASE_INSENSITIVE bit (0x0001).
126    */

127   public static final int CASE_INSENSITIVE_MASK = __CASE_INSENSITIVE;
128
129   /**
130    * A mask passed as an option to the {@link #compile compile} methods
131    * to indicate a compiled regular expression should treat input as having
132    * multiple lines. This option affects the interpretation of
133    * the <b>^</b> and <b>$</b> metacharacters. When this mask is used,
134    * the <b>^</b> metacharacter matches at the beginning of every line,
135    * and the <b>$</b> metacharacter matches at the end of every line.
136    * Additionally the <b> . </b> metacharacter will not match newlines when
137    * an expression is compiled with <b> MULTILINE_MASK </b>, which is its
138    * default behavior.
139    * The <b>SINGLELINE_MASK</b> and <b>MULTILINE_MASK</b> should not be
140    * used together.
       * When both the multiline and singleline modifier bits are nevertheless
       * in effect, the atom parser tests the multiline bit first for
       * <b>^</b> and <b>$</b>, and only the singleline bit for <b>.</b>.
141    */

142   public static final int MULTILINE_MASK = __MULTILINE;
143
144   /**
145    * A mask passed as an option to the {@link #compile compile} methods
146    * to indicate a compiled regular expression should treat input as being
147    * a single line. This option affects the interpretation of
148    * the <b>^</b> and <b>$</b> metacharacters. When this mask is used,
149    * the <b>^</b> metacharacter matches at the beginning of the input,
150    * and the <b>$</b> metacharacter matches at the end of the input.
151    * The <b>^</b> and <b>$</b> metacharacters will not match at the beginning
152    * and end of lines occurring between the beginning and end of the input.
153    * Additionally, the <b> . </b> metacharacter will match newlines when
154    * an expression is compiled with <b> SINGLELINE_MASK </b>, unlike its
155    * default behavior.
156    * The <b>SINGLELINE_MASK</b> and <b>MULTILINE_MASK</b> should not be
157    * used together.
158    */

159   public static final int SINGLELINE_MASK = __SINGLELINE;
160
161   /**
162    * A mask passed as an option to the {@link #compile compile} methods
163    * to indicate a compiled regular expression should be treated as a Perl5
164    * extended pattern (i.e., a pattern using the <b>/x</b> modifier). This
165    * option tells the compiler to ignore whitespace that is not backslashed or
166    * within a character class. It also tells the compiler to treat the
167    * <b>#</b> character as a metacharacter introducing a comment as in
168    * Perl. In other words, the <b>#</b> character will comment out any
169    * text in the regular expression between it and the next newline.
170    * The intent of this option is to allow you to divide your patterns
171    * into more readable parts. It is provided to maintain compatibility
172    * with Perl5 regular expressions, although it will not often
173    * make sense to use it in Java.
       * Its value is the private __EXTENDED bit (0x0020).
174    */

175   public static final int EXTENDED_MASK = __EXTENDED;
176
177   /**
178    * A mask passed as an option to the {@link #compile compile} methods
179    * to indicate that the resulting Perl5Pattern should be treated as a
180    * read only data structure by Perl5Matcher, making it safe to share
181    * a single Perl5Pattern instance among multiple threads without needing
182    * synchronization. Without this option, Perl5Matcher reserves the right
183    * to store heuristic or other information in Perl5Pattern that might
184    * accelerate future matches. When you use this option, Perl5Matcher will
185    * not store or modify any information in a Perl5Pattern. Use this option
186    * when you want to share a Perl5Pattern instance among multiple threads
187    * using different Perl5Matcher instances.
       * Its value is the private __READ_ONLY bit (0x8000).
188    */

189   public static final int READ_ONLY_MASK = __READ_ONLY;
190
191   /**
192    * Given a character string, returns a Perl5 expression that interprets
193    * each character of the original string literally. In other words, all
194    * special metacharacters are quoted/escaped. This method is useful for
195    * converting user input meant for literal interpretation into a safe
196    * regular expression representing the literal input.
197    * <p>
198    * In effect, this method is the analog of the Perl5 quotemeta() builtin
199    * method.
200    * <p>
201    * @param expression The expression to convert.
202    * @return A String containing a Perl5 regular expression corresponding to
203    * a literal interpretation of the pattern.
204    */

205   public static final String JavaDoc quotemeta(char[] expression) {
206     int ch;
207     StringBuffer JavaDoc buffer;
208
209     buffer = new StringBuffer JavaDoc(2*expression.length);
210     for(ch = 0; ch < expression.length; ch++) {
211       if(!OpCode._isWordCharacter(expression[ch]))
212     buffer.append('\\');
213       buffer.append(expression[ch]);
214     }
215
216     return buffer.toString();
217   }
218
219   /**
220    * Given a character string, returns a Perl5 expression that interprets
221    * each character of the original string literally. In other words, all
222    * special metacharacters are quoted/escaped. This method is useful for
223    * converting user input meant for literal interpretation into a safe
224    * regular expression representing the literal input.
225    * <p>
226    * In effect, this method is the analog of the Perl5 quotemeta() builtin
227    * method.
228    * <p>
229    * @param pattern The pattern to convert.
230    * @return A String containing a Perl5 regular expression corresponding to
231    * a literal interpretation of the pattern.
232    */

233   public static final String JavaDoc quotemeta(String JavaDoc expression) {
234     return quotemeta(expression.toCharArray());
235   }
236
237   private static boolean __isSimpleRepetitionOp(char ch) {
238     return (ch == '*' || ch == '+' || ch == '?');
239   }
240
241   private static boolean __isComplexRepetitionOp(char[] ch, int offset) {
242     if(offset < ch.length && offset >= 0)
243        return (ch[offset] == '*' || ch[offset] == '+' || ch[offset] == '?'
244            || (ch[offset] == '{' && __parseRepetition(ch, offset)));
245     return false;
246   }
247
248   // determines if {\d+,\d*} is the next part of the string
249
private static boolean __parseRepetition(char[] str, int offset) {
250     if(str[offset] != '{')
251       return false;
252     ++offset;
253
254     if(offset >= str.length || !Character.isDigit(str[offset]))
255       return false;
256
257     while(offset < str.length && Character.isDigit(str[offset]))
258       ++offset;
259
260     if(offset < str.length && str[offset] == ',')
261       ++offset;
262
263     while(offset < str.length && Character.isDigit(str[offset]))
264       ++offset;
265
266     if(offset >= str.length || str[offset] != '}')
267       return false;
268
269     return true;
270   }
271
272   private static int __parseHex(char[] str, int offset, int maxLength,
273                 int[] scanned)
274   {
275     int val = 0, index;
276
277     scanned[0] = 0;
278     while(offset < str.length && maxLength-- > 0 &&
279       (index = __HEX_DIGIT.indexOf(str[offset])) != -1) {
280       val <<= 4;
281       val |= (index & 15);
282       ++offset;
283       ++scanned[0];
284     }
285
286     return val;
287   }
288
289   private static int __parseOctal(char[] str, int offset, int maxLength,
290                  int[] scanned)
291   {
292     int val = 0, index;
293
294     scanned[0] = 0;
295     while(offset < str.length &&
296       maxLength > 0 && str[offset] >= '0' && str[offset] <= '7') {
297       val <<= 3;
298       val |= (str[offset] - '0');
299       --maxLength;
300       ++offset;
301       ++scanned[0];
302     }
303
304     return val;
305   }
306
307   private static void __setModifierFlag(char[] flags, char ch) {
308     switch(ch) {
309     case 'i' : flags[0] |= __CASE_INSENSITIVE; return;
310     case 'g' : flags[0] |= __GLOBAL; return;
311     case 'o' : flags[0] |= __KEEP; return;
312     case 'm' : flags[0] |= __MULTILINE; return;
313     case 's' : flags[0] |= __SINGLELINE; return;
314     case 'x' : flags[0] |= __EXTENDED; return;
315     }
316   }
317
318   // Emit a specific character code.
319
private void __emitCode(char code) {
320
321     if(__program != null)
322       __program[__programSize] = code;
323
324     ++__programSize;
325   }
326
327
328   // Emit an operator with no arguments.
329
// Return an offset into the __program array as a pointer to node.
330
private int __emitNode(char operator) {
331     int offset;
332
333     offset = __programSize;
334
335     if(__program == null)
336       __programSize+=2;
337     else {
338       __program[__programSize++] = operator;
339       __program[__programSize++] = OpCode._NULL_POINTER;
340     }
341
342     return offset;
343   }
344
345
346   // Emit an operator with arguments.
347
// Return an offset into the __programarray as a pointer to node.
348
private int __emitArgNode(char operator, char arg) {
349     int offset;
350
351     offset = __programSize;
352
353     if(__program== null)
354       __programSize+=3;
355     else {
356       __program[__programSize++] = operator;
357       __program[__programSize++] = OpCode._NULL_POINTER;
358       __program[__programSize++] = arg;
359     }
360
361     return offset;
362   }
363
364
365   // Insert an operator at a given offset.
366
private void __programInsertOperator(char operator, int operand) {
367     int src, dest, offset;
368
369     offset = (OpCode._opType[operator] == OpCode._CURLY ? 2 : 0);
370
371
372     if(__program== null) {
373       __programSize+=(2 + offset);
374       return;
375     }
376
377     src = __programSize;
378     __programSize+=(2 + offset);
379     dest = __programSize;
380
381     while(src > operand) {
382       --src;
383       --dest;
384       __program[dest] = __program[src];
385     }
386
387     __program[operand++] = operator;
388     __program[operand++] = OpCode._NULL_POINTER;
389
390     while(offset-- > 0)
391       __program[operand++] = OpCode._NULL_POINTER;
392
393   }
394
395
396
397   private void __programAddTail(int current, int value) {
398     int scan, temp, offset;
399
400     if(__program== null || current == OpCode._NULL_OFFSET)
401       return;
402
403     scan = current;
404
405     while(true) {
406       temp = OpCode._getNext(__program, scan);
407       if(temp == OpCode._NULL_OFFSET)
408     break;
409       scan = temp;
410     }
411
412     if(__program[scan] == OpCode._BACK)
413       offset = scan - value;
414     else
415       offset = value - scan;
416
417     __program[scan + 1] = (char)offset;
418   }
419
420
421   private void __programAddOperatorTail(int current, int value) {
422     if(__program== null || current == OpCode._NULL_OFFSET ||
423        OpCode._opType[__program[current]] != OpCode._BRANCH)
424       return;
425     __programAddTail(OpCode._getNextOperator(current), value);
426   }
427
428
429   private char __getNextChar() {
430     char ret, value;
431
432     ret = __input._postIncrement();
433
434     while(true) {
435       value = __input._getValue();
436
437       if(value == '(' && __input._getValueRelative(1) == '?' &&
438      __input._getValueRelative(2) == '#') {
439     // Skip comments
440
while(value != CharStringPointer._END_OF_STRING && value != ')')
441       value = __input._increment();
442     __input._increment();
443     continue;
444       }
445
446       if((__modifierFlags[0] & __EXTENDED) != 0) {
447     if(Character.isWhitespace(value)) {
448       __input._increment();
449       continue;
450     } else if(value == '#') {
451       while(value != CharStringPointer._END_OF_STRING && value != '\n')
452         value = __input._increment();
453       __input._increment();
454       continue;
455     }
456       }
457
458       // System.err.println("next: " + ret + " last: " + __input._getValue()); // debug
459

460
461       return ret;
462     }
463
464   }
465
466
467   private int __parseAlternation(int[] retFlags)
468     throws MalformedPatternException
469   {
470     int chain, offset, latest;
471     int flags = 0;
472     char value;
473
474     retFlags[0] = __WORSTCASE;
475
476     offset = __emitNode(OpCode._BRANCH);
477
478     chain = OpCode._NULL_OFFSET;
479
480     if(__input._getOffset() == 0) {
481       __input._setOffset(-1);
482       __getNextChar();
483     } else {
484       __input._decrement();
485       __getNextChar();
486     }
487
488     value = __input._getValue();
489
490     while(value != CharStringPointer._END_OF_STRING &&
491       value != '|' && value != ')') {
492       flags &= ~__TRYAGAIN;
493       latest = __parseBranch(retFlags);
494
495       if(latest == OpCode._NULL_OFFSET) {
496     if((flags & __TRYAGAIN) != 0){
497       value = __input._getValue();
498       continue;
499     }
500     return OpCode._NULL_OFFSET;
501       }
502
503       retFlags[0] |= (flags & __NONNULL);
504
505       if(chain == OpCode._NULL_OFFSET)
506     retFlags[0] |= (flags & __SPSTART);
507       else {
508     ++__cost;
509     __programAddTail(chain, latest);
510       }
511       chain = latest;
512       value = __input._getValue();
513     }
514
515     // If loop was never entered.
516
if(chain == OpCode._NULL_OFFSET)
517       __emitNode(OpCode._NOTHING);
518
519     return offset;
520   }
521
522
// Parses one atom at the current input position and emits its node.
//
// The first part dispatches on the current character: anchors, '.', a
// character class, a parenthesised group, or a backslash escape each emit
// their own node. Anything else, and the escapes that stand for a single
// literal character, set doDefault so that the second part collects a run
// of literal characters into one OpCode._EXACTLY node.
//
// retFlags[0] is reset to __WORSTCASE and then receives __NONNULL,
// __SIMPLE, __SPSTART and/or __TRYAGAIN as the cases below decide.
//
// Returns the offset of the emitted node, or OpCode._NULL_OFFSET when a
// group returned no node, or when '|' / ')' is met with __TRYAGAIN
// pending.
//
// Throws MalformedPatternException for an unexpected '|' or ')', for a
// quantifier with nothing before it, for a backreference to a group not
// yet seen, and for a trailing backslash.
523   private int __parseAtom(int[] retFlags) throws MalformedPatternException {
524     boolean doDefault;
525     char value;
526     int offset, flags[] = { 0 };
527     
528     
529     retFlags[0] = __WORSTCASE;
530     doDefault = false;
531     offset = OpCode._NULL_OFFSET;
532
// Every case ends in `break tryAgain` (atom decided), `continue tryAgain`
// (re-dispatch on the new current character), a return, or a throw.
533   tryAgain:
534     while(true) {
535
536       value = __input._getValue();
537
538       switch(value) {
539       case '^' :
540     __getNextChar();
541     // The order here is important in order to support /ms.
542
// /m takes precedence over /s for ^ and $, but not for .
543
if((__modifierFlags[0] & __MULTILINE) != 0)
544       offset = __emitNode(OpCode._MBOL);
545     else if((__modifierFlags[0] & __SINGLELINE) != 0)
546       offset = __emitNode(OpCode._SBOL);
547     else
548       offset = __emitNode(OpCode._BOL);
549     break tryAgain;
550
551       case '$':
552     __getNextChar();
553     // The order here is important in order to support /ms.
554
// /m takes precedence over /s for ^ and $, but not for .
555
if((__modifierFlags[0] & __MULTILINE) != 0)
556       offset = __emitNode(OpCode._MEOL);
557     else if((__modifierFlags[0] & __SINGLELINE) != 0)
558       offset = __emitNode(OpCode._SEOL);
559     else
560       offset = __emitNode(OpCode._EOL);
561     break tryAgain;
562
563       case '.':
564     __getNextChar();
565     // The order here is important in order to support /ms.
566
// /m takes precedence over /s for ^ and $, but not for .
567
if((__modifierFlags[0] & __SINGLELINE) != 0)
568       offset = __emitNode(OpCode._SANY);
569     else
570       offset = __emitNode(OpCode._ANY);
571     ++__cost;
572     retFlags[0] |= (__NONNULL | __SIMPLE);
573     break tryAgain;
574
575       case '[':
576     __input._increment();
577     offset = __parseCharacterClass();
578     retFlags[0] |= (__NONNULL | __SIMPLE);
579     break tryAgain;
580
// Group: a NULL_OFFSET result with __TRYAGAIN set in flags means the
// group produced no node, so dispatch again on whatever follows it.
581       case '(':
582     __getNextChar();
583     offset = __parseExpression(true, flags);
584     if(offset == OpCode._NULL_OFFSET) {
585       if((flags[0] & __TRYAGAIN) != 0)
586         continue tryAgain;
587       return OpCode._NULL_OFFSET;
588     }
589     retFlags[0] |= (flags[0] & (__NONNULL | __SPSTART));
590     break tryAgain;
591
592       case '|':
593       case ')':
594     if((flags[0] & __TRYAGAIN) != 0) {
595       retFlags[0] |= __TRYAGAIN;
596       return OpCode._NULL_OFFSET;
597     }
598
599     throw new MalformedPatternException("Error in expression at " +
600                    __input._toString(__input._getOffset()));
601     //break tryAgain;
602

603       case '?':
604       case '+':
605       case '*':
606     throw new MalformedPatternException(
607                  "?+* follows nothing in expression");
608     //break tryAgain;
609

610       case '\\':
611     value = __input._increment();
612
613     switch(value) {
614     case 'A' :
615       offset = __emitNode(OpCode._SBOL);
616       retFlags[0] |= __SIMPLE;
617       __getNextChar();
618       break;
619     case 'G':
620       offset = __emitNode(OpCode._GBOL);
621       retFlags[0] |= __SIMPLE;
622       __getNextChar();
623       break;
624     case 'Z':
625       offset = __emitNode(OpCode._SEOL);
626       retFlags[0] |= __SIMPLE;
627       __getNextChar();
628       break;
629     case 'w':
630       offset = __emitNode(OpCode._ALNUM);
631       retFlags[0] |= (__NONNULL | __SIMPLE);
632       __getNextChar();
633       break;
634     case 'W':
635       offset = __emitNode(OpCode._NALNUM);
636       retFlags[0] |= (__NONNULL | __SIMPLE);
637       __getNextChar();
638       break;
639     case 'b':
640       offset = __emitNode(OpCode._BOUND);
641       retFlags[0] |= __SIMPLE;
642       __getNextChar();
643       break;
644     case 'B':
645       offset = __emitNode(OpCode._NBOUND);
646       retFlags[0] |= __SIMPLE;
647       __getNextChar();
648       break;
649     case 's':
650       offset = __emitNode(OpCode._SPACE);
651       retFlags[0] |= (__NONNULL | __SIMPLE);
652       __getNextChar();
653       break;
654     case 'S':
655       offset = __emitNode(OpCode._NSPACE);
656       retFlags[0] |= (__NONNULL | __SIMPLE);
657       __getNextChar();
658       break;
659     case 'd':
660       offset = __emitNode(OpCode._DIGIT);
661       retFlags[0] |= (__NONNULL | __SIMPLE);
662       __getNextChar();
663       break;
664     case 'D':
665       offset = __emitNode(OpCode._NDIGIT);
666       retFlags[0] |= (__NONNULL | __SIMPLE);
667       __getNextChar();
668       break;
// These escapes denote one literal character; the literal-run code below
// decodes them.
669     case 'n': case 'r': case 't': case 'f': case 'e': case 'a': case 'x':
670     case 'c': case '0':
671       doDefault = true;
672       break tryAgain;
// \1..\9 and longer digit runs: the whole run is read as a decimal number.
// A number above 9 that is not lower than __numParentheses is handed to the
// literal-run code; otherwise it must be lower than __numParentheses and
// becomes an OpCode._REF node.
673     case '1': case '2': case '3': case '4': case '5': case '6': case '7':
674     case '8': case '9':
675       int num;
676       StringBuffer JavaDoc buffer = new StringBuffer JavaDoc(10);
677
678       num = 0;
679       value = __input._getValueRelative(num);
680
681       while(Character.isDigit(value)) {
682         buffer.append(value);
683         ++num;
684         value = __input._getValueRelative(num);
685       }
686
687       try {
688         num = Integer.parseInt(buffer.toString());
689       } catch(NumberFormatException JavaDoc e) {
690         throw new MalformedPatternException(
691        "Unexpected number format exception. Please report this bug." +
692        "NumberFormatException message: " + e.getMessage());
693       }
694
695       if(num > 9 && num >= __numParentheses) {
696         doDefault = true;
697         break tryAgain;
698       } else {
699         // A backreference may only occur AFTER its group
700
if(num >= __numParentheses)
701           throw new MalformedPatternException("Invalid backreference: \\" +
702                           num);
703         __sawBackreference = true;
704         offset = __emitArgNode(OpCode._REF, (char)num);
705         retFlags[0] |= __NONNULL;
706
707         value = __input._getValue();
708         while(Character.isDigit(value))
709           value = __input._increment();
710
711         __input._decrement();
712         __getNextChar();
713       }
714       break;
715     case '\0':
716     case CharStringPointer._END_OF_STRING:
717       if(__input._isAtEnd())
718         throw new
719           MalformedPatternException("Trailing \\ in expression.");
720       // fall through to default
721
default:
722       doDefault = true;
723       break tryAgain;
724     }
725     break tryAgain;
726
727       case '#':
728     // skip over comments
729
if((__modifierFlags[0] & __EXTENDED) != 0) {
730       while(!__input._isAtEnd() && __input._getValue() != '\n')
731         __input._increment();
732       if(!__input._isAtEnd())
733         continue tryAgain;
734     }
735     // fall through to default
736
default:
737     __input._increment();
738     doDefault = true;
739     break tryAgain;
740       }// end master switch
741
} // end tryAgain
742

743
// Literal run: emit an _EXACTLY node, one placeholder slot that is later
// overwritten with the run length, the characters themselves (fewer than
// 128 of them), and a closing _END_OF_STRING code. Scanning starts one
// position before the current input offset.
744     if(doDefault) {
745       char ender;
746       int length, pOffset, maxOffset, lastOffset, numLength[];
747
748       offset = __emitNode(OpCode._EXACTLY);
749       // Not sure that it's ok to use 0 to mark end.
750
//__emitCode((char)0);
751
__emitCode((char)CharStringPointer._END_OF_STRING);
752
753     forLoop:
754       for(length = 0, pOffset = __input._getOffset() - 1,
755         maxOffset = __input._getLength();
756       length < 127 && pOffset < maxOffset; ++length) {
757
758     lastOffset = pOffset;
759     value = __input._getValue(pOffset);
760
761     switch(value) {
762     case '^': case '$': case '.': case '[': case '(': case ')':
763     case '|':
764       break forLoop;
765     case '\\':
766       value = __input._getValue(++pOffset);
767
768       switch(value) {
769       case 'A': case 'G': case 'Z': case 'w': case 'W': case 'b':
770       case 'B': case 's': case 'S': case 'd': case 'D':
771         --pOffset;
772         break forLoop;
773       case 'n':
774         ender = '\n';
775         ++pOffset;
776         break;
777       case 'r':
778         ender = '\r';
779         ++pOffset;
780         break;
781       case 't':
782         ender = '\t';
783         ++pOffset;
784         break;
785       case 'f':
786         ender = '\f';
787         ++pOffset;
788         break;
789       case 'e':
790         ender = '\033';
791         ++pOffset;
792         break;
793       case 'a':
794         ender = '\007';
795         ++pOffset;
796         break;
797       case 'x':
798         numLength = new int[1];
799         ender = (char)__parseHex(__input._array, ++pOffset, 2, numLength);
800         pOffset+=numLength[0];
801         break;
// \cX: the character after 'c' is upper-cased if it is lower case, then
// XOR'ed with 64.
802       case 'c':
803         ++pOffset;
804         ender = __input._getValue(pOffset++);
805         if(Character.isLowerCase(ender))
806           ender = Character.toUpperCase(ender);
807         ender ^= 64;
808         break;
// Digits after a backslash: read as octal when the first digit is 0, or
// when more than one digit follows and the decimal value is not lower than
// __numParentheses. Otherwise the run stops here so that the first part of
// this method can treat the escape as a backreference.
809       case '0': case '1': case '2': case'3': case '4': case '5':
810       case '6': case '7': case '8': case '9':
811         boolean doOctal = false;
812         value = __input._getValue(pOffset);
813
814         if(value == '0')
815           doOctal = true;
816         value = __input._getValue(pOffset + 1);
817
818         if(Character.isDigit(value)) {
819           int num;
820           StringBuffer JavaDoc buffer = new StringBuffer JavaDoc(10);
821
822           num = pOffset;
823           value = __input._getValue(num);
824
825           while(Character.isDigit(value)){
826         buffer.append(value);
827         ++num;
828         value = __input._getValue(num);
829           }
830
831           try {
832         num = Integer.parseInt(buffer.toString());
833           } catch(NumberFormatException JavaDoc e) {
834         throw new MalformedPatternException(
835          "Unexpected number format exception. Please report this bug." +
836          "NumberFormatException message: " + e.getMessage());
837           }
838
839           if(!doOctal)
840         doOctal = (num >= __numParentheses);
841         }
842
843         if(doOctal) {
844           numLength = new int[1];
845           ender = (char)__parseOctal(__input._array, pOffset, 3, numLength);
846           pOffset+=numLength[0];
847         } else {
848           --pOffset;
849           break forLoop;
850         }
851         break;
852
853       case CharStringPointer._END_OF_STRING:
854       case '\0':
855         if(pOffset >= maxOffset)
856           throw new
857         MalformedPatternException("Trailing \\ in expression.");
858         // fall through to default
859
default:
860         ender = __input._getValue(pOffset++);
861         break;
862       } // end backslash switch
863
break;
864
865     case '#':
866       if((__modifierFlags[0] & __EXTENDED) != 0) {
867         while(pOffset < maxOffset && __input._getValue(pOffset) != '\n')
868           ++pOffset;
869       }
870       // fall through to whitespace handling
871
// In extended mode the character is skipped; --length cancels the loop's
// ++length so that it does not count towards the run.
case ' ': case '\t': case '\n': case '\r': case '\f': case '\013':
872       if((__modifierFlags[0] & __EXTENDED) != 0) {
873         ++pOffset;
874         --length;
875         continue;
876       }
877       // fall through to default
878
default:
879       ender = __input._getValue(pOffset++);
880       break;
881
882     } // end master switch
883

// With the case-insensitive modifier set, upper-case literals are stored
// in lower case.
884     if((__modifierFlags[0] & __CASE_INSENSITIVE) != 0 &&
885        Character.isUpperCase(ender))
886       ender = Character.toLowerCase(ender);
887
// A quantifier follows the character just read. If the run already holds
// characters, rewind to before this one and leave it out; if it is the
// first character, emit it alone.
888     if(pOffset < maxOffset && __isComplexRepetitionOp(__input._array, pOffset)) {
889       if(length > 0)
890         pOffset = lastOffset;
891       else {
892         ++length;
893         __emitCode(ender);
894       }
895       break;
896     }
897
898     __emitCode(ender);
899
900
901       } // end for loop
902

903
904       __input._setOffset(pOffset - 1);
905       __getNextChar();
906
907       if(length < 0)
908     throw new MalformedPatternException(
909          "Unexpected compilation failure. Please report this bug!");
910       if(length > 0)
911     retFlags[0] |= __NONNULL;
912       if(length == 1)
913     retFlags[0] |= __SIMPLE;
// Overwrite the placeholder slot with the actual run length.
914       if(__program!= null)
915     __program[OpCode._getOperand(offset)] = (char)length;
916       //__emitCode('\0'); // debug
917
__emitCode(CharStringPointer._END_OF_STRING);
918     }
919
920     return offset;
921   }
922
923
924   // Set the bits in a character class. Only recognizes ascii.
925
private void __setCharacterClassBits(char[] bits, int offset, char deflt,
926                        char ch)
927   {
928     if(__program== null || ch >= 256)
929       return;
930     ch &= 0xffff;
931
932     if(deflt == 0) {
933       bits[offset + (ch >> 4)] |= (1 << (ch & 0xf));
934     } else {
935       bits[offset + (ch >> 4)] &= ~(1 << (ch & 0xf));
936     }
937   }
938
939
940   private int __parseCharacterClass() throws MalformedPatternException {
941     boolean range = false, skipTest;
942     char clss, deflt, lastclss = Character.MAX_VALUE;
943     int offset, bits, numLength[] = { 0 };
944
945     offset = __emitNode(OpCode._ANYOF);
946
947     if(__input._getValue() == '^') {
948       ++__cost;
949       __input._increment();
950       deflt = 0;
951     } else {
952       deflt = 0xffff;
953     }
954
955     bits = __programSize;
956     for(clss = 0; clss < 16; clss++)
957       __emitCode(deflt);
958
959     clss = __input._getValue();
960
961     if(clss == ']' || clss == '-')
962       skipTest = true;
963     else
964       skipTest = false;
965
966     while((!__input._isAtEnd() && (clss = __input._getValue()) != ']')
967       || skipTest) {
968       // It sucks, but we have to make this assignment every time
969
skipTest = false;
970       __input._increment();
971       if(clss == '\\') {
972     clss = __input._postIncrement();
973
974     switch(clss){
975     case 'w':
976       for(clss = 0; clss < 256; clss++)
977         if(OpCode._isWordCharacter(clss))
978           __setCharacterClassBits(__program, bits, deflt, clss);
979       lastclss = Character.MAX_VALUE;
980       continue;
981     case 'W':
982       for(clss = 0; clss < 256; clss++)
983         if(!OpCode._isWordCharacter(clss))
984           __setCharacterClassBits(__program, bits, deflt, clss);
985       lastclss = Character.MAX_VALUE;
986       continue;
987     case 's':
988       for(clss = 0; clss < 256; clss++)
989         if(Character.isWhitespace(clss))
990           __setCharacterClassBits(__program, bits, deflt, clss);
991       lastclss = Character.MAX_VALUE;
992       continue;
993     case 'S':
994       for(clss = 0; clss < 256; clss++)
995         if(!Character.isWhitespace(clss))
996           __setCharacterClassBits(__program, bits, deflt, clss);
997       lastclss = Character.MAX_VALUE;
998       continue;
999     case 'd':
1000      for(clss = '0'; clss <= '9'; clss++)
1001        __setCharacterClassBits(__program, bits, deflt, clss);
1002      lastclss = Character.MAX_VALUE;
1003      continue;
1004    case 'D':
1005      for(clss = 0; clss < '0'; clss++)
1006        __setCharacterClassBits(__program, bits, deflt, clss);
1007      for(clss = (char)('9' + 1); clss < 256; clss++)
1008        __setCharacterClassBits(__program, bits, deflt, clss);
1009      lastclss = Character.MAX_VALUE;
1010      continue;
1011    case 'n':
1012      clss = '\n';
1013      break;
1014    case 'r':
1015      clss = '\r';
1016      break;
1017    case 't':
1018      clss = '\t';
1019      break;
1020    case 'f':
1021      clss =