KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > eclipse > emf > ecore > xml > type > internal > RegEx


1 /**
2  * <copyright>
3  *
4  * Copyright (c) 2004-2005 IBM Corporation and others.
5  * All rights reserved. This program and the accompanying materials
6  * are made available under the terms of the Eclipse Public License v1.0
7  * which accompanies this distribution, and is available at
8  * http://www.eclipse.org/legal/epl-v10.html
9  *
10  * Contributors:
11  * IBM - Initial API and implementation
12  *
13  * </copyright>
14  *
15  * $Id: RegEx.java,v 1.8 2005/06/12 13:29:22 emerks Exp $
16  *
17  * ---------------------------------------------------------------------
18  *
19  * The Apache Software License, Version 1.1
20  *
21  *
22  * Copyright (c) 1999-2004 The Apache Software Foundation. All rights
23  * reserved.
24  *
25  * Redistribution and use in source and binary forms, with or without
26  * modification, are permitted provided that the following conditions
27  * are met:
28  *
29  * 1. Redistributions of source code must retain the above copyright
30  * notice, this list of conditions and the following disclaimer.
31  *
32  * 2. Redistributions in binary form must reproduce the above copyright
33  * notice, this list of conditions and the following disclaimer in
34  * the documentation and/or other materials provided with the
35  * distribution.
36  *
37  * 3. The end-user documentation included with the redistribution,
38  * if any, must include the following acknowledgment:
39  * "This product includes software developed by the
40  * Apache Software Foundation (http://www.apache.org/)."
41  * Alternately, this acknowledgment may appear in the software itself,
42  * if and wherever such third-party acknowledgments normally appear.
43  *
44  * 4. The names "Xerces" and "Apache Software Foundation" must
45  * not be used to endorse or promote products derived from this
46  * software without prior written permission. For written
47  * permission, please contact apache@apache.org.
48  *
49  * 5. Products derived from this software may not be called "Apache",
50  * nor may "Apache" appear in their name, without prior written
51  * permission of the Apache Software Foundation.
52  *
53  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
54  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
55  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
56  * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
57  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
58  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
59  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
60  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
61  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
62  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
63  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64  * SUCH DAMAGE.
65  * ====================================================================
66  *
67  * This software consists of voluntary contributions made by many
68  * individuals on behalf of the Apache Software Foundation and was
69  * originally based on software copyright (c) 1999-2003, International
70  * Business Machines, Inc., http://www.apache.org. For more
71  * information on the Apache Software Foundation, please see
72  * <http://www.apache.org/>.
73  */

74
75 package org.eclipse.emf.ecore.xml.type.internal;
76
77
78 import java.text.CharacterIterator JavaDoc;
79 import java.util.Hashtable JavaDoc;
80 import java.util.Locale JavaDoc;
81 import java.util.ResourceBundle JavaDoc;
82 import java.util.Vector JavaDoc;
83
84 import org.eclipse.emf.ecore.plugin.EcorePlugin;
85
86 /**
87  * NOTE: this class is for internal use only.
88  */

89 public final class RegEx
90 {
91   static class BMPattern
92   {
93     char[] pattern;
94
95     int[] shiftTable;
96
97     boolean ignoreCase;
98
99     public BMPattern(String JavaDoc pat, boolean ignoreCase)
100     {
101       this(pat, 256, ignoreCase);
102     }
103
104     public BMPattern(String JavaDoc pat, int tableSize, boolean ignoreCase)
105     {
106       this.pattern = pat.toCharArray();
107       this.shiftTable = new int [tableSize];
108       this.ignoreCase = ignoreCase;
109       int length = pattern.length;
110       for (int i = 0; i < this.shiftTable.length; i++)
111         this.shiftTable[i] = length;
112       for (int i = 0; i < length; i++)
113       {
114         char ch = this.pattern[i];
115         int diff = length - i - 1;
116         int index = ch % this.shiftTable.length;
117         if (diff < this.shiftTable[index])
118           this.shiftTable[index] = diff;
119         if (this.ignoreCase)
120         {
121           ch = Character.toUpperCase(ch);
122           index = ch % this.shiftTable.length;
123           if (diff < this.shiftTable[index])
124             this.shiftTable[index] = diff;
125           ch = Character.toLowerCase(ch);
126           index = ch % this.shiftTable.length;
127           if (diff < this.shiftTable[index])
128             this.shiftTable[index] = diff;
129         }
130       }
131     }
132
133     /**
134      *
135      * @return -1 if <var>iterator</var> does not contain this pattern.
136      */

137     public int matches(CharacterIterator JavaDoc iterator, int start, int limit)
138     {
139       if (this.ignoreCase)
140         return this.matchesIgnoreCase(iterator, start, limit);
141       int plength = this.pattern.length;
142       if (plength == 0)
143         return start;
144       int index = start + plength;
145       while (index <= limit)
146       {
147         int pindex = plength;
148         int nindex = index + 1;
149         char ch;
150         do
151         {
152           if ((ch = iterator.setIndex(--index)) != this.pattern[--pindex])
153             break;
154           if (pindex == 0)
155             return index;
156         }
157         while (pindex > 0);
158         index += this.shiftTable[ch % this.shiftTable.length] + 1;
159         if (index < nindex)
160           index = nindex;
161       }
162       return -1;
163     }
164
165     /**
166      *
167      * @return -1 if <var>str</var> does not contain this pattern.
168      */

169     public int matches(String JavaDoc str, int start, int limit)
170     {
171       if (this.ignoreCase)
172         return this.matchesIgnoreCase(str, start, limit);
173       int plength = this.pattern.length;
174       if (plength == 0)
175         return start;
176       int index = start + plength;
177       while (index <= limit)
178       {
179         //System.err.println("Starts at "+index);
180
int pindex = plength;
181         int nindex = index + 1;
182         char ch;
183         do
184         {
185           if ((ch = str.charAt(--index)) != this.pattern[--pindex])
186             break;
187           if (pindex == 0)
188             return index;
189         }
190         while (pindex > 0);
191         index += this.shiftTable[ch % this.shiftTable.length] + 1;
192         if (index < nindex)
193           index = nindex;
194       }
195       return -1;
196     }
197
198     /**
199      *
200      * @return -1 if <var>chars</char> does not contain this pattern.
201      */

202     public int matches(char[] chars, int start, int limit)
203     {
204       if (this.ignoreCase)
205         return this.matchesIgnoreCase(chars, start, limit);
206       int plength = this.pattern.length;
207       if (plength == 0)
208         return start;
209       int index = start + plength;
210       while (index <= limit)
211       {
212         //System.err.println("Starts at "+index);
213
int pindex = plength;
214         int nindex = index + 1;
215         char ch;
216         do
217         {
218           if ((ch = chars[--index]) != this.pattern[--pindex])
219             break;
220           if (pindex == 0)
221             return index;
222         }
223         while (pindex > 0);
224         index += this.shiftTable[ch % this.shiftTable.length] + 1;
225         if (index < nindex)
226           index = nindex;
227       }
228       return -1;
229     }
230
231     int matchesIgnoreCase(CharacterIterator JavaDoc iterator, int start, int limit)
232     {
233       int plength = this.pattern.length;
234       if (plength == 0)
235         return start;
236       int index = start + plength;
237       while (index <= limit)
238       {
239         int pindex = plength;
240         int nindex = index + 1;
241         char ch;
242         do
243         {
244           char ch1 = ch = iterator.setIndex(--index);
245           char ch2 = this.pattern[--pindex];
246           if (ch1 != ch2)
247           {
248             ch1 = Character.toUpperCase(ch1);
249             ch2 = Character.toUpperCase(ch2);
250             if (ch1 != ch2 && Character.toLowerCase(ch1) != Character.toLowerCase(ch2))
251               break;
252           }
253           if (pindex == 0)
254             return index;
255         }
256         while (pindex > 0);
257         index += this.shiftTable[ch % this.shiftTable.length] + 1;
258         if (index < nindex)
259           index = nindex;
260       }
261       return -1;
262     }
263
264     int matchesIgnoreCase(String JavaDoc text, int start, int limit)
265     {
266       int plength = this.pattern.length;
267       if (plength == 0)
268         return start;
269       int index = start + plength;
270       while (index <= limit)
271       {
272         int pindex = plength;
273         int nindex = index + 1;
274         char ch;
275         do
276         {
277           char ch1 = ch = text.charAt(--index);
278           char ch2 = this.pattern[--pindex];
279           if (ch1 != ch2)
280           {
281             ch1 = Character.toUpperCase(ch1);
282             ch2 = Character.toUpperCase(ch2);
283             if (ch1 != ch2 && Character.toLowerCase(ch1) != Character.toLowerCase(ch2))
284               break;
285           }
286           if (pindex == 0)
287             return index;
288         }
289         while (pindex > 0);
290         index += this.shiftTable[ch % this.shiftTable.length] + 1;
291         if (index < nindex)
292           index = nindex;
293       }
294       return -1;
295     }
296
297     int matchesIgnoreCase(char[] chars, int start, int limit)
298     {
299       int plength = this.pattern.length;
300       if (plength == 0)
301         return start;
302       int index = start + plength;
303       while (index <= limit)
304       {
305         int pindex = plength;
306         int nindex = index + 1;
307         char ch;
308         do
309         {
310           char ch1 = ch = chars[--index];
311           char ch2 = this.pattern[--pindex];
312           if (ch1 != ch2)
313           {
314             ch1 = Character.toUpperCase(ch1);
315             ch2 = Character.toUpperCase(ch2);
316             if (ch1 != ch2 && Character.toLowerCase(ch1) != Character.toLowerCase(ch2))
317               break;
318           }
319           if (pindex == 0)
320             return index;
321         }
322         while (pindex > 0);
323         index += this.shiftTable[ch % this.shiftTable.length] + 1;
324         if (index < nindex)
325           index = nindex;
326       }
327       return -1;
328     }
329     /*
330      public static void main(String[] argv) {
331      try {
332      int[] shiftTable = new int[256];
333      initializeBoyerMoore(argv[0], shiftTable, true);
334      int o = -1;
335      CharacterIterator ite = new java.text.StringCharacterIterator(argv[1]);
336      long start = System.currentTimeMillis();
337      //for (int i = 0; i < 10000; i ++)
338      o = searchIgnoreCasesWithBoyerMoore(ite, 0, argv[0], shiftTable);
339      start = System.currentTimeMillis()-start;
340      System.out.println("Result: "+o+", Elapsed: "+start);
341      } catch (Exception ex) {
342      ex.printStackTrace();
343      }
344      }*/

345   }
346   
347   public static class Match implements Cloneable JavaDoc {
348     int[] beginpos = null;
349     int[] endpos = null;
350     int nofgroups = 0;
351
352     CharacterIterator JavaDoc ciSource = null;
353     String JavaDoc strSource = null;
354     char[] charSource = null;
355
356     /**
357      * Creates an instance.
358      */

359     public Match() {
360     }
361
362     /**
363      *
364      */

365     public synchronized Object JavaDoc clone() {
366         Match ma = new Match();
367         if (this.nofgroups > 0) {
368             ma.setNumberOfGroups(this.nofgroups);
369             if (this.ciSource != null) ma.setSource(this.ciSource);
370             if (this.strSource != null) ma.setSource(this.strSource);
371             for (int i = 0; i < this.nofgroups; i ++) {
372                 ma.setBeginning(i, this.getBeginning(i));
373                 ma.setEnd(i, this.getEnd(i));
374             }
375         }
376         return ma;
377     }
378
379     /**
380      *
381      */

382     protected void setNumberOfGroups(int n) {
383         int oldn = this.nofgroups;
384         this.nofgroups = n;
385         if (oldn <= 0
386             || oldn < n || n*2 < oldn) {
387             this.beginpos = new int[n];
388             this.endpos = new int[n];
389         }
390         for (int i = 0; i < n; i ++) {
391             this.beginpos[i] = -1;
392             this.endpos[i] = -1;
393         }
394     }
395
396     /**
397      *
398      */

399     protected void setSource(CharacterIterator JavaDoc ci) {
400         this.ciSource = ci;
401         this.strSource = null;
402         this.charSource = null;
403     }
404     /**
405      *
406      */

407     protected void setSource(String JavaDoc str) {
408         this.ciSource = null;
409         this.strSource = str;
410         this.charSource = null;
411     }
412     /**
413      *
414      */

415     protected void setSource(char[] chars) {
416         this.ciSource = null;
417         this.strSource = null;
418         this.charSource = chars;
419     }
420
421     /**
422      *
423      */

424     protected void setBeginning(int index, int v) {
425         this.beginpos[index] = v;
426     }
427
428     /**
429      *
430      */

431     protected void setEnd(int index, int v) {
432         this.endpos[index] = v;
433     }
434
435     /**
436      * Return the number of regular expression groups.
437      * This method returns 1 when the regular expression has no capturing-parenthesis.
438      */

439     public int getNumberOfGroups() {
440         if (this.nofgroups <= 0)
441             throw new IllegalStateException JavaDoc("A result is not set.");
442         return this.nofgroups;
443     }
444
445     /**
446      * Return a start position in the target text matched to specified regular expression group.
447      *
448      * @param index Less than <code>getNumberOfGroups()</code>.
449      */

450     public int getBeginning(int index) {
451         if (this.beginpos == null)
452             throw new IllegalStateException JavaDoc("A result is not set.");
453         if (index < 0 || this.nofgroups <= index)
454             throw new IllegalArgumentException JavaDoc("The parameter must be less than "
455                                                +this.nofgroups+": "+index);
456         return this.beginpos[index];
457     }
458
459     /**
460      * Return an end position in the target text matched to specified regular expression group.
461      *
462      * @param index Less than <code>getNumberOfGroups()</code>.
463      */

464     public int getEnd(int index) {
465         if (this.endpos == null)
466             throw new IllegalStateException JavaDoc("A result is not set.");
467         if (index < 0 || this.nofgroups <= index)
468             throw new IllegalArgumentException JavaDoc("The parameter must be less than "
469                                                +this.nofgroups+": "+index);
470         return this.endpos[index];
471     }
472
473     /**
474      * Return an substring of the target text matched to specified regular expression group.
475      *
476      * @param index Less than <code>getNumberOfGroups()</code>.
477      */

478     public String JavaDoc getCapturedText(int index) {
479         if (this.beginpos == null)
480             throw new IllegalStateException JavaDoc("match() has never been called.");
481         if (index < 0 || this.nofgroups <= index)
482             throw new IllegalArgumentException JavaDoc("The parameter must be less than "
483                                                +this.nofgroups+": "+index);
484         String JavaDoc ret;
485         int begin = this.beginpos[index], end = this.endpos[index];
486         if (begin < 0 || end < 0) return null;
487         if (this.ciSource != null) {
488             ret = REUtil.substring(this.ciSource, begin, end);
489         } else if (this.strSource != null) {
490             ret = this.strSource.substring(begin, end);
491         } else {
492             ret = new String JavaDoc(this.charSource, begin, end-begin);
493         }
494         return ret;
495     }
496   }
497   
498   public final static class REUtil {
499     private REUtil() {
500     }
501
502     static final int composeFromSurrogates(int high, int low) {
503         return 0x10000 + ((high-0xd800)<<10) + low-0xdc00;
504     }
505
506     static final boolean isLowSurrogate(int ch) {
507         return (ch & 0xfc00) == 0xdc00;
508     }
509
510     static final boolean isHighSurrogate(int ch) {
511         return (ch & 0xfc00) == 0xd800;
512     }
513
514     static final String JavaDoc decomposeToSurrogates(int ch) {
515         char[] chs = new char[2];
516         ch -= 0x10000;
517         chs[0] = (char)((ch>>10)+0xd800);
518         chs[1] = (char)((ch&0x3ff)+0xdc00);
519         return new String JavaDoc(chs);
520     }
521
522     static final String JavaDoc substring(CharacterIterator JavaDoc iterator, int begin, int end) {
523         char[] src = new char[end-begin];
524         for (int i = 0; i < src.length; i ++)
525             src[i] = iterator.setIndex(i+begin);
526         return new String JavaDoc(src);
527     }
528
529     // ================================================================
530

531     static final int getOptionValue(int ch) {
532         int ret = 0;
533         switch (ch) {
534           case 'i':
535             ret = RegularExpression.IGNORE_CASE;
536             break;
537           case 'm':
538             ret = RegularExpression.MULTIPLE_LINES;
539             break;
540           case 's':
541             ret = RegularExpression.SINGLE_LINE;
542             break;
543           case 'x':
544             ret = RegularExpression.EXTENDED_COMMENT;
545             break;
546           case 'u':
547             ret = RegularExpression.USE_UNICODE_CATEGORY;
548             break;
549           case 'w':
550             ret = RegularExpression.UNICODE_WORD_BOUNDARY;
551             break;
552           case 'F':
553             ret = RegularExpression.PROHIBIT_FIXED_STRING_OPTIMIZATION;
554             break;
555           case 'H':
556             ret = RegularExpression.PROHIBIT_HEAD_CHARACTER_OPTIMIZATION;
557             break;
558           case 'X':
559             ret = RegularExpression.XMLSCHEMA_MODE;
560             break;
561           case ',':
562             ret = RegularExpression.SPECIAL_COMMA;
563             break;
564           default:
565         }
566         return ret;
567     }
568
569     static final int parseOptions(String JavaDoc opts) throws ParseException {
570         if (opts == null) return 0;
571         int options = 0;
572         for (int i = 0; i < opts.length(); i ++) {
573             int v = getOptionValue(opts.charAt(i));
574             if (v == 0)
575                 throw new ParseException("Unknown Option: "+opts.substring(i), -1);
576             options |= v;
577         }
578         return options;
579     }
580
581     static final String JavaDoc createOptionString(int options) {
582         StringBuffer JavaDoc sb = new StringBuffer JavaDoc(9);
583         if ((options & RegularExpression.PROHIBIT_FIXED_STRING_OPTIMIZATION) != 0)
584             sb.append('F');
585         if ((options & RegularExpression.PROHIBIT_HEAD_CHARACTER_OPTIMIZATION) != 0)
586             sb.append('H');
587         if ((options & RegularExpression.XMLSCHEMA_MODE) != 0)
588             sb.append('X');
589         if ((options & RegularExpression.IGNORE_CASE) != 0)
590             sb.append('i');
591         if ((options & RegularExpression.MULTIPLE_LINES) != 0)
592             sb.append('m');
593         if ((options & RegularExpression.SINGLE_LINE) != 0)
594             sb.append('s');
595         if ((options & RegularExpression.USE_UNICODE_CATEGORY) != 0)
596             sb.append('u');
597         if ((options & RegularExpression.UNICODE_WORD_BOUNDARY) != 0)
598             sb.append('w');
599         if ((options & RegularExpression.EXTENDED_COMMENT) != 0)
600             sb.append('x');
601         if ((options & RegularExpression.SPECIAL_COMMA) != 0)
602             sb.append(',');
603         return sb.toString().intern();
604     }
605
606     // ================================================================
607

608     static String JavaDoc stripExtendedComment(String JavaDoc regex) {
609         int len = regex.length();
610         StringBuffer JavaDoc buffer = new StringBuffer JavaDoc(len);
611         int offset = 0;
612         while (offset < len) {
613             int ch = regex.charAt(offset++);
614                                                 // Skips a white space.
615
if (ch == '\t' || ch == '\n' || ch == '\f' || ch == '\r' || ch == ' ')
616                 continue;
617
618             if (ch == '#') { // Skips chracters between '#' and a line end.
619
while (offset < len) {
620                     ch = regex.charAt(offset++);
621                     if (ch == '\r' || ch == '\n')
622                         break;
623                 }
624                 continue;
625             }
626
627             int next; // Strips an escaped white space.
628
if (ch == '\\' && offset < len) {
629                 if ((next = regex.charAt(offset)) == '#'
630                     || next == '\t' || next == '\n' || next == '\f'
631                     || next == '\r' || next == ' ') {
632                     buffer.append((char)next);
633                     offset ++;
634                 } else { // Other escaped character.
635
buffer.append('\\');
636                     buffer.append((char)next);
637                     offset ++;
638                 }
639             } else // As is.
640
buffer.append((char)ch);
641         }
642         return buffer.toString();
643     }
644
645     // ================================================================
646

647     /**
648      * Sample entry.
649      * <div>Usage: <KBD>org.apache.xerces.utils.regex.REUtil &lt;regex&gt; &lt;string&gt;</KBD></div>
650      */

651     public static void main(String JavaDoc[] argv) {
652         String JavaDoc pattern = null;
653         try {
654             String JavaDoc options = "";
655             String JavaDoc target = null;
656             if( argv.length == 0 ) {
657                 System.out.println( "Error:Usage: java REUtil -i|-m|-s|-u|-w|-X regularExpression String" );
658                 System.exit( 0 );
659             }
660             for (int i = 0; i < argv.length; i ++) {
661                 if (argv[i].length() == 0 || argv[i].charAt(0) != '-') {
662                     if (pattern == null)
663                         pattern = argv[i];
664                     else if (target == null)
665                         target = argv[i];
666                     else
667                         System.err.println("Unnecessary: "+argv[i]);
668                 } else if (argv[i].equals("-i")) {
669                     options += "i";
670                 } else if (argv[i].equals("-m")) {
671                     options += "m";
672                 } else if (argv[i].equals("-s")) {
673                     options += "s";
674                 } else if (argv[i].equals("-u")) {
675                     options += "u";
676                 } else if (argv[i].equals("-w")) {
677                     options += "w";
678                 } else if (argv[i].equals("-X")) {
679                     options += "X";
680                 } else {
681                     System.err.println("Unknown option: "+argv[i]);
682                 }
683             }
684             RegularExpression reg = new RegularExpression(pattern, options);
685             System.out.println("RegularExpression: "+reg);
686             Match match = new Match();
687             reg.matches(target, match);
688             for (int i = 0; i < match.getNumberOfGroups(); i ++) {
689                 if (i == 0 ) System.out.print("Matched range for the whole pattern: ");
690                 else System.out.print("["+i+"]: ");
691                 if (match.getBeginning(i) < 0)
692                     System.out.println("-1");
693                 else {
694                     System.out.print(match.getBeginning(i)+", "+match.getEnd(i)+", ");
695                     System.out.println("\""+match.getCapturedText(i)+"\"");
696                 }
697             }
698         } catch (ParseException pe) {
699             if (pattern == null) {
700                 pe.printStackTrace();
701             } else {
702                 System.err.println("org.apache.xerces.utils.regex.ParseException: "+pe.getMessage());
703                 String JavaDoc indent = " ";
704                 System.err.println(indent+pattern);
705                 int loc = pe.getLocation();
706                 if (loc >= 0) {
707                     System.err.print(indent);
708                     for (int i = 0; i < loc; i ++) System.err.print("-");
709                     System.err.println("^");
710                 }
711             }
712         } catch (Exception JavaDoc e) {
713             e.printStackTrace();
714         }
715     }
716
717     static final int CACHESIZE = 20;
718     static final RegularExpression[] regexCache = new RegularExpression[CACHESIZE];
719     /**
720      * Creates a RegularExpression instance.
721      * This method caches created instances.
722      *
723      * @see RegularExpression#RegularExpression(String, String)
724      */

725     public static RegularExpression createRegex(String JavaDoc pattern, String JavaDoc options)
726         throws ParseException {
727         RegularExpression re = null;
728         int intOptions = REUtil.parseOptions(options);
729         synchronized (REUtil.regexCache) {
730             int i;
731             for (i = 0; i < REUtil.CACHESIZE; i ++) {
732                 RegularExpression cached = REUtil.regexCache[i];
733                 if (cached == null) {
734                     i = -1;
735                     break;
736                 }
737                 if (cached.equals(pattern, intOptions)) {
738                     re = cached;
739                     break;
740                 }
741             }
742             if (re != null) {
743                 if (i != 0) {
744                     System.arraycopy(REUtil.regexCache, 0, REUtil.regexCache, 1, i);
745                     REUtil.regexCache[0] = re;
746                 }
747             } else {
748                 re = new RegularExpression(pattern, options);
749                 System.arraycopy(REUtil.regexCache, 0, REUtil.regexCache, 1, REUtil.CACHESIZE-1);
750                 REUtil.regexCache[0] = re;
751             }
752         }
753         return re;
754     }
755
756     /**
757      *
758      * @see RegularExpression#matches(String)
759      */

760     public static boolean matches(String JavaDoc regex, String JavaDoc target) throws ParseException {
761         return REUtil.createRegex(regex, null).matches(target);
762     }
763
764     /**
765      *
766      * @see RegularExpression#matches(String)
767      */

768     public static boolean matches(String JavaDoc regex, String JavaDoc options, String JavaDoc target) throws ParseException {
769         return REUtil.createRegex(regex, options).matches(target);
770     }
771
772     // ================================================================
773

774     /**
775      *
776      */

777     public static String JavaDoc quoteMeta(String JavaDoc literal) {
778         int len = literal.length();
779         StringBuffer JavaDoc buffer = null;
780         for (int i = 0; i < len; i ++) {
781             int ch = literal.charAt(i);
782             if (".*+?{[()|\\^$".indexOf(ch) >= 0) {
783                 if (buffer == null) {
784                     buffer = new StringBuffer JavaDoc(i+(len-i)*2);
785                     if (i > 0) buffer.append(literal.substring(0, i));
786                 }
787                 buffer.append('\\');
788                 buffer.append((char)ch);
789             } else if (buffer != null)
790                 buffer.append((char)ch);
791         }
792         return buffer != null ? buffer.toString() : literal;
793     }
794
795     // ================================================================
796

797     static void dumpString(String JavaDoc v) {
798         for (int i = 0; i < v.length(); i ++) {
799             System.out.print(Integer.toHexString(v.charAt(i)));
800             System.out.print(" ");
801         }
802         System.out.println();
803     }
804 }
805   
806
807   /**
808    * A regular expression matching engine using Non-deterministic Finite Automaton (NFA).
809    * This engine does not conform to the POSIX regular expression.
810    *
811    * <hr width="50%">
812    * <h3>How to use</h3>
813    *
814    * <dl>
815    * <dt>A. Standard way
816    * <dd>
817    * <pre>
818    * RegularExpression re = new RegularExpression(<var>regex</var>);
819    * if (re.matches(text)) { ... }
820    * </pre>
821    *
822    * <dt>B. Capturing groups
823    * <dd>
824    * <pre>
825    * RegularExpression re = new RegularExpression(<var>regex</var>);
826    * Match match = new Match();
827    * if (re.matches(text, match)) {
828    * ... // You can refer captured texts with methods of the <code>Match</code> class.
829    * }
830    * </pre>
831    *
832    * </dl>
833    *
834    * <h4>Case-insensitive matching</h4>
835    * <pre>
836    * RegularExpression re = new RegularExpression(<var>regex</var>, "i");
837    * if (re.matches(text) >= 0) { ...}
838    * </pre>
839    *
840    * <h4>Options</h4>
841    * <p>You can specify options to <a HREF="#RegularExpression(java.lang.String, java.lang.String)"><code>RegularExpression(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>
842    * or <a HREF="#setPattern(java.lang.String, java.lang.String)"><code>setPattern(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>.
843    * This <var>options</var> parameter consists of the following characters.
844    * </p>
845    * <dl>
846    * <dt><a name="I_OPTION"><code>"i"</code></a>
847    * <dd>This option indicates case-insensitive matching.
848    * <dt><a name="M_OPTION"><code>"m"</code></a>
849    * <dd class="REGEX"><kbd>^</kbd> and <kbd>$</kbd> consider the EOL characters within the text.
850    * <dt><a name="S_OPTION"><code>"s"</code></a>
851    * <dd class="REGEX"><kbd>.</kbd> matches any one character.
852    * <dt><a name="U_OPTION"><code>"u"</code></a>
853    * <dd class="REGEX">Redefines <Kbd>\d \D \w \W \s \S \b \B \&lt; \></kbd> as becoming to Unicode.
854    * <dt><a name="W_OPTION"><code>"w"</code></a>
855    * <dd class="REGEX">By this option, <kbd>\b \B \&lt; \></kbd> are processed with the method of
856    * 'Unicode Regular Expression Guidelines' Revision 4.
857    * When "w" and "u" are specified at the same time,
858    * <kbd>\b \B \&lt; \></kbd> are processed for the "w" option.
859    * <dt><a name="COMMA_OPTION"><code>","</code></a>
860    * <dd>The parser treats a comma in a character class as a range separator.
861    * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option.
862    * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option.
863    *
864    * <dt><a name="X_OPTION"><code>"X"</code></a>
865    * <dd class="REGEX">
866    * By this option, the engine confoms to <a HREF="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>.
867    * The <code>match()</code> method does not do subsring matching
868    * but entire string matching.
869    *
870    * </dl>
871    *
872    * <hr width="50%">
873    * <h3>Syntax</h3>
874    * <table border="1" bgcolor="#ddeeff">
875    * <tr>
876    * <td>
877    * <h4>Differences from the Perl 5 regular expression</h4>
878    * <ul>
879    * <li>There is 6-digit hexadecimal character representation (<kbd>\v</kbd><var>HHHHHH</var>.)
880    * <li>Supports subtraction, union, and intersection operations for character classes.
881    * <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations),
882    * <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>,
883    * <kbd>\ u</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>,
884    * <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>,
885    * <Kbd>(?{<kbd><var>code</var><kbd>})</kbd>, <Kbd>(??{<kbd><var>code</var><kbd>})</kbd>
886    * </ul>
887    * </td>
888    * </tr>
889    * </table>
890    *
891    * <P>Meta characters are `<KBD>. * + ? { [ ( ) | \ ^ $</KBD>'.</P>
892    * <ul>
893    * <li>Character
894    * <dl>
895    * <dt class="REGEX"><kbd>.</kbd> (A period)
896    * <dd>Matches any one character except the following characters.
897    * <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D),
898    * PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028)
899    * <dd>This expression matches one code point in Unicode. It can match a pair of surrogates.
900    * <dd>When <a HREF="#S_OPTION">the "s" option</a> is specified,
901    * it matches any character including the above four characters.
902    *
903    * <dt class="REGEX"><Kbd>\e \f \n \r \t</kbd>
904    * <dd>Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A),
905    * CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009)
906    *
907    * <dt class="REGEX"><kbd>\c</kbd><var>C</var>
908    * <dd>Matches a control character.
909    * The <var>C</var> must be one of '<kbd>@</kbd>', '<kbd>A</kbd>'-'<kbd>Z</kbd>',
910    * '<kbd>[</kbd>', '<kbd>\</kbd>', '<kbd>]</kbd>', '<kbd>^</kbd>', '<kbd>_</kbd>'.
911    * It matches a control character of which the character code is less than
912    * the character code of the <var>C</var> by 0x0040.
913    * <dd class="REGEX">For example, a <kbd>\cJ</kbd> matches a LINE FEED (U+000A),
914    * and a <kbd>\c[</kbd> matches an ESCAPE (U+001B).
915    *
916    * <dt class="REGEX">a non-meta character
917    * <dd>Matches the character.
918    *
919    * <dt class="REGEX"><KBD>\</KBD> + a meta character
920    * <dd>Matches the meta character.
921    *
922    * <dt class="REGEX"><kbd>\x</kbd><var>HH</var> <kbd>\x{</kbd><var>HHHH</var><kbd>}</kbd>
923    * <dd>Matches a character of which code point is <var>HH</var> (Hexadecimal) in Unicode.
924    * You can write just 2 digits for <kbd>\x</kbd><var>HH</var>, and
925    * variable length digits for <kbd>\x{</kbd><var>HHHH</var><kbd>}</kbd>.
926    *
927    * <!--
928    * <dt class="REGEX"><kbd>\ u</kbd><var>HHHH</var>
929    * <dd>Matches a character of which code point is <var>HHHH</var> (Hexadecimal) in Unicode.
930    * -->
931    *
932    * <dt class="REGEX"><kbd>\v</kbd><var>HHHHHH</var>
933    * <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode.
934    *
935    * <dt class="REGEX"><kbd>\g</kbd>
936    * <dd>Matches a grapheme.
937    * <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd>
938    *
939    * <dt class="REGEX"><kbd>\X</kbd>
940    * <dd class="REGEX">Matches a combining character sequence.
941    * It is equivalent to <kbd>(?:\PM\pM*)</kbd>
942    * </dl>
943    * </li>
944    *
945    * <li>Character class
946    * <dl>
947   + * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without <a HREF="#COMMA_OPTION">"," option</a>)
948   + * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with <a HREF="#COMMA_OPTION">"," option</a>)
949    * <dd>Positive character class. It matches a character in ranges.
950    * <dd><var>R<sub>n</sub></var>:
951    * <ul>
952    * <li class="REGEX">A character (including <Kbd>\e \f \n \r \t</kbd> <kbd>\x</kbd><var>HH</var> <kbd>\x{</kbd><var>HHHH</var><kbd>}</kbd> <!--kbd>\ u</kbd><var>HHHH</var--> <kbd>\v</kbd><var>HHHHHH</var>)
953    * <p>This range matches the character.
954    * <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var>
955    * <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and &lt;= <var>C<sub>2</sub></var>'s code point.
956   + * <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>,
957   + * and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd>
958    * <p>...
959    * <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd>
960    * <p>These expressions specifies the same ranges as the following expressions.
961    * </ul>
962    * <p class="REGEX">Enumerated ranges are merged (union operation).
963    * <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd>
964    *
965    * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a HREF="#COMMA_OPTION">"," option</a>)
966    * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a HREF="#COMMA_OPTION">"," option</a>)
967    * <dd>Negative character class. It matches a character not in ranges.
968    *
969    * <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd>
970    * (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.)
971    * <dd>Subtraction or union or intersection for character classes.
972    * <dd class="REGEX">For exmaple, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>.
973    * <dd>The result of this operations is a <u>positive character class</u>
974    * even if an expression includes any negative character classes.
975    * You have to take care on this in case-insensitive matching.
976    * For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>,
977    * which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching.
978    * But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because
979    * it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>'
980    * though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>.
981    *
982    * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a HREF="#X_OPTION">"X" option</a>)</dt>
983    * <dd>Character class subtraction for the XML Schema.
984    * You can use this syntax when you specify an <a HREF="#X_OPTION">"X" option</a>.
985    *
986    * <dt class="REGEX"><kbd>\d</kbd>
987    * <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>.
988    * <dd>When <a HREF="#U_OPTION">a "u" option</a> is set, it is equivalent to
989    * <span class="REGEX"><kbd>\p{Nd}</kbd></span>.
990    *
991    * <dt class="REGEX"><kbd>\D</kbd>
992    * <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd>
993    * <dd>When <a HREF="#U_OPTION">a "u" option</a> is set, it is equivalent to
994    * <span class="REGEX"><kbd>\P{Nd}</kbd></span>.
995    *
996    * <dt class="REGEX"><kbd>\s</kbd>
997    * <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd>
998    * <dd>When <a HREF="#U_OPTION">a "u" option</a> is set, it is equivalent to
999    * <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>.
1000   *
1001   * <dt class="REGEX"><kbd>\S</kbd>
1002   * <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd>
1003   * <dd>When <a HREF="#U_OPTION">a "u" option</a> is set, it is equivalent to
1004   * <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>.
1005   *
1006   * <dt class="REGEX"><kbd>\w</kbd>
1007   * <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd>
1008   * <dd>When <a HREF="#U_OPTION">a "u" option</a> is set, it is equivalent to
1009   * <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
1010   *
1011   * <dt class="REGEX"><kbd>\W</kbd>
1012   * <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd>
1013   * <dd>When <a HREF="#U_OPTION">a "u" option</a> is set, it is equivalent to
1014   * <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
1015   *
1016   * <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd>
1017   * <dd>Matches one character in the specified General Category (the second field in <a HREF="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a HREF="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>.
1018   * The following names are available:
1019   * <dl>
1020   * <dt>Unicode General Categories:
1021   * <dd><kbd>
1022   * L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs, Zl, Zp,
1023   * Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So,
1024   * </kbd>
1025   * <dd>(Currently the Cn category includes U+10000-U+10FFFF characters)
1026   * <dt>Unicode Blocks:
1027   * <dd><kbd>
1028   * Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B,
1029   * IPA Extensions, Spacing Modifier Letters, Combining Diacritical Marks, Greek,
1030   * Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi, Gujarati,
1031   * Oriya, Tamil, Telugu, Kannada, Malayalam, Thai, Lao, Tibetan, Georgian,
1032   * Hangul Jamo, Latin Extended Additional, Greek Extended, General Punctuation,
1033   * Superscripts and Subscripts, Currency Symbols, Combining Marks for Symbols,
1034   * Letterlike Symbols, Number Forms, Arrows, Mathematical Operators,
1035   * Miscellaneous Technical, Control Pictures, Optical Character Recognition,
1036   * Enclosed Alphanumerics, Box Drawing, Block Elements, Geometric Shapes,
1037   * Miscellaneous Symbols, Dingbats, CJK Symbols and Punctuation, Hiragana,
1038   * Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun,
1039   * Enclosed CJK Letters and Months, CJK Compatibility, CJK Unified Ideographs,
1040   * Hangul Syllables, High Surrogates, High Private Use Surrogates, Low Surrogates,
1041   * Private Use, CJK Compatibility Ideographs, Alphabetic Presentation Forms,
1042   * Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility Forms,
1043   * Small Form Variants, Arabic Presentation Forms-B, Specials,
1044   * Halfwidth and Fullwidth Forms
1045   * </kbd>
1046   * <dt>Others:
1047   * <dd><kbd>ALL</kbd> (Equivalent to <kbd>[\u0000-\v10FFFF]</kbd>)
1048   * <dd><kbd>ASSGINED</kbd> (<kbd>\p{ASSIGNED}</kbd> is equivalent to <kbd>\P{Cn}</kbd>)
1049   * <dd><kbd>UNASSGINED</kbd>
1050   * (<kbd>\p{UNASSIGNED}</kbd> is equivalent to <kbd>\p{Cn}</kbd>)
1051   * </dl>
1052   *
1053   * <dt class="REGEX"><kbd>\P{</kbd><var>name</var><kbd>}</kbd>
1054   * <dd>Matches one character not in the specified General Category or the specified Block.
1055   * </dl>
1056   * </li>
1057   *
1058   * <li>Selection and Quantifier
1059   * <dl>
1060   * <dt class="REGEX"><VAR>X</VAR><kbd>|</kbd><VAR>Y</VAR>
1061   * <dd>...
1062   *
1063   * <dt class="REGEX"><VAR>X</VAR><kbd>*</KBD>
1064   * <dd>Matches 0 or more <var>X</var>.
1065   *
1066   * <dt class="REGEX"><VAR>X</VAR><kbd>+</KBD>
1067   * <dd>Matches 1 or more <var>X</var>.
1068   *
1069   * <dt class="REGEX"><VAR>X</VAR><kbd>?</KBD>
1070   * <dd>Matches 0 or 1 <var>X</var>.
1071   *
1072   * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>number</var><kbd>}</kbd>
1073   * <dd>Matches <var>number</var> times.
1074   *
1075   * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}</kbd>
1076   * <dd>...
1077   *
1078   * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}</kbd>
1079   * <dd>...
1080   *
1081   * <dt class="REGEX"><VAR>X</VAR><kbd>*?</kbd>
1082   * <dt class="REGEX"><VAR>X</VAR><kbd>+?</kbd>
1083   * <dt class="REGEX"><VAR>X</VAR><kbd>??</kbd>
1084   * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}?</kbd>
1085   * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}?</kbd>
1086   * <dd>Non-greedy matching.
1087   * </dl>
1088   * </li>
1089   *
1090   * <li>Grouping, Capturing, and Back-reference
1091   * <dl>
1092   * <dt class="REGEX"><KBD>(?:</kbd><VAR>X</VAR><kbd>)</KBD>
1093   * <dd>Grouping. "<KBD>foo+</KBD>" matches "<KBD>foo</KBD>" or "<KBD>foooo</KBD>".
1094   * If you want it matches "<KBD>foofoo</KBD>" or "<KBD>foofoofoo</KBD>",
1095   * you have to write "<KBD>(?:foo)+</KBD>".
1096   *
1097   * <dt class="REGEX"><KBD>(</kbd><VAR>X</VAR><kbd>)</KBD>
1098   * <dd>Grouping with capturing.
1099   * It make a group and applications can know
1100   * where in target text a group matched with methods of a <code>Match</code> instance
1101   * after <code><a HREF="#matches(java.lang.String, org.apache.xerces.utils.regex.Match)">matches(String,Match)</a></code>.
1102   * The 0th group means whole of this regular expression.
1103   * The <VAR>N</VAR>th gorup is the inside of the <VAR>N</VAR>th left parenthesis.
1104   *
1105   * <p>For instance, a regular expression is
1106   * "<FONT color=blue><KBD> *([^&lt;:]*) +&lt;([^&gt;]*)&gt; *</KBD></FONT>"
1107   * and target text is
1108   * "<FONT color=red><KBD>From: TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>":
1109   * <ul>
1110   * <li><code>Match.getCapturedText(0)</code>:
1111   * "<FONT color=red><KBD> TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>"
1112   * <li><code>Match.getCapturedText(1)</code>: "<FONT color=red><KBD>TAMURA Kent</KBD></FONT>"
1113   * <li><code>Match.getCapturedText(2)</code>: "<FONT color=red><KBD>kent@trl.ibm.co.jp</KBD></FONT>"
1114   * </ul>
1115   *
1116   * <dt class="REGEX"><kbd>\1 \2 \3 \4 \5 \6 \7 \8 \9</kbd>
1117   * <dd>
1118   *
1119   * <dt class="REGEX"><kbd>(?></kbd><var>X</var><kbd>)</kbd>
1120   * <dd>Independent expression group. ................
1121   *
1122   * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
1123   * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
1124   * <dd>............................
1125   * <dd>The <var>options</var> or the <var>options2</var> consists of 'i' 'm' 's' 'w'.
1126   * Note that it can not contain 'u'.
1127   *
1128   * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>)</kbd>
1129   * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>)</kbd>
1130   * <dd>......
1131   * <dd>These expressions must be at the beginning of a group.
1132   * </dl>
1133   * </li>
1134   *
1135   * <li>Anchor
1136   * <dl>
1137   * <dt class="REGEX"><kbd>\A</kbd>
1138   * <dd>Matches the beginnig of the text.
1139   *
1140   * <dt class="REGEX"><kbd>\Z</kbd>
1141   * <dd>Matches the end of the text, or before an EOL character at the end of the text,
1142   * or CARRIAGE RETURN + LINE FEED at the end of the text.
1143   *
1144   * <dt class="REGEX"><kbd>\z</kbd>
1145   * <dd>Matches the end of the text.
1146   *
1147   * <dt class="REGEX"><kbd>^</kbd>
1148   * <dd>Matches the beginning of the text. It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>.
1149   * <dd>When <a HREF="#M_OPTION">a "m" option</a> is set,
1150   * it matches the beginning of the text, or after one of EOL characters (
1151   * LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028),
1152   * PARAGRAPH SEPARATOR (U+2029).)
1153   *
1154   * <dt class="REGEX"><kbd>$</kbd>
1155   * <dd>Matches the end of the text, or before an EOL character at the end of the text,
1156   * or CARRIAGE RETURN + LINE FEED at the end of the text.
1157   * <dd>When <a HREF="#M_OPTION">a "m" option</a> is set,
1158   * it matches the end of the text, or before an EOL character.
1159   *
1160   * <dt class="REGEX"><kbd>\b</kbd>
1161   * <dd>Matches word boundary.
1162   * (See <a HREF="#W_OPTION">a "w" option</a>)
1163   *
1164   * <dt class="REGEX"><kbd>\B</kbd>
1165   * <dd>Matches non word boundary.
1166   * (See <a HREF="#W_OPTION">a "w" option</a>)
1167   *
1168   * <dt class="REGEX"><kbd>\&lt;</kbd>
1169   * <dd>Matches the beginning of a word.
1170   * (See <a HREF="#W_OPTION">a "w" option</a>)
1171   *
1172   * <dt class="REGEX"><kbd>\&gt;</kbd>
1173   * <dd>Matches the end of a word.
1174   * (See <a HREF="#W_OPTION">a "w" option</a>)
1175   * </dl>
1176   * </li>
1177   * <li>Lookahead and lookbehind
1178   * <dl>
1179   * <dt class="REGEX"><kbd>(?=</kbd><var>X</var><kbd>)</kbd>
1180   * <dd>Lookahead.
1181   *
1182   * <dt class="REGEX"><kbd>(?!</kbd><var>X</var><kbd>)</kbd>
1183   * <dd>Negative lookahead.
1184   *
1185   * <dt class="REGEX"><kbd>(?&lt;=</kbd><var>X</var><kbd>)</kbd>
1186   * <dd>Lookbehind.
1187   * <dd>(Note for text capturing......)
1188   *
1189   * <dt class="REGEX"><kbd>(?&lt;!</kbd><var>X</var><kbd>)</kbd>
1190   * <dd>Negative lookbehind.
1191   * </dl>
1192   * </li>
1193   *
1194   * <li>Misc.
1195   * <dl>
1196   * <dt class="REGEX"><kbd>(?(</Kbd><var>condition</var><Kbd>)</kbd><var>yes-pattern</var><kbd>|</kbd><var>no-pattern</var><kbd>)</kbd>,
1197   * <dt class="REGEX"><kbd>(?(</kbd><var>condition</var><kbd>)</kbd><var>yes-pattern</var><kbd>)</kbd>
1198   * <dd>......
1199   * <dt class="REGEX"><kbd>(?#</kbd><var>comment</var><kbd>)</kbd>
1200   * <dd>Comment. A comment string consists of characters except '<kbd>)</kbd>'.
1201   * You can not write comments in character classes and before quantifiers.
1202   * </dl>
1203   * </li>
1204   * </ul>
1205   *
1206   *
1207   * <hr width="50%">
1208   * <h3>BNF for the regular expression</h3>
1209   * <pre>
1210   * regex ::= ('(?' options ')')? term ('|' term)*
1211   * term ::= factor+
1212   * factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )?
1213   * | '(?#' [^)]* ')'
1214   * minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}'
1215   * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
1216   * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X'
1217   * | '(?>' regex ')' | '(?' options ':' regex ')'
1218   * | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')'
1219   * options ::= [imsw]* ('-' [imsw]+)?
1220   * anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\&lt;' | '\>'
1221   * looks ::= '(?=' regex ')' | '(?!' regex ')'
1222   * | '(?&lt;=' regex ')' | '(?&lt;!' regex ')'
1223   * char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1
1224   * category-block ::= '\' [pP] category-symbol-1
1225   * | ('\p{' | '\P{') (category-symbol | block-name
1226   * | other-properties) '}'
1227   * category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S'
1228   * category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | Lo'
1229   * | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No'
1230   * | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs'
1231   * | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po'
1232   * | 'Sm' | 'Sc' | 'Sk' | 'So'
1233   * block-name ::= (See above)
1234   * other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED'
1235   * character-1 ::= (any character except meta-characters)
1236   *
1237   * char-class ::= '[' ranges ']'
1238   * | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')'
1239   * ranges ::= '^'? (range <a HREF="#COMMA_OPTION">','?</a>)+
1240   * range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block
1241   * | range-char | range-char '-' range-char
1242   * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2
1243   * code-point ::= '\x' hex-char hex-char
1244   * | '\x{' hex-char+ '}'
1245   * <!-- | '\ u' hex-char hex-char hex-char hex-char
1246   * --> | '\v' hex-char hex-char hex-char hex-char hex-char hex-char
1247   * hex-char ::= [0-9a-fA-F]
1248   * character-2 ::= (any character except \[]-,)
1249   * </pre>
1250   *
1251   * <hr width="50%">
1252   * <h3>to do</h3>
1253   * <ul>
1254   * <li><a HREF="http://www.unicode.org/unicode/reports/tr18/">Unicode Regular Expression Guidelines</a>
1255   * <ul>
1256   * <li>2.4 Canonical Equivalents
1257   * <li>Level 3
1258   * </ul>
1259   * <li>Parsing performance
1260   * </ul>
1261   *
1262   * <hr width="50%">
1263   *
1264   * @author TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;
1265   * @version $Id: RegEx.java,v 1.8 2005/06/12 13:29:22 emerks Exp $
1266   */

1267  public static class RegularExpression implements java.io.Serializable JavaDoc {
1268      static final boolean DEBUG = false;
1269
1270      /**
1271       * Compiles a token tree into an operation flow.
1272       */

1273      private synchronized void compile(Token tok) {
1274          if (this.operations != null)
1275              return;
1276          this.numberOfClosures = 0;
1277          this.operations = this.compile(tok, null, false);
1278      }
1279
1280      /**
1281       * Converts a token to an operation.
1282       */

1283      private Op compile(Token tok, Op next, boolean reverse) {
1284          Op ret;
1285          switch (tok.type) {
1286          case Token.DOT:
1287              ret = Op.createDot();
1288              ret.next = next;
1289              break;
1290
1291          case Token.CHAR:
1292              ret = Op.createChar(tok.getChar());
1293              ret.next = next;
1294              break;
1295
1296          case Token.ANCHOR:
1297              ret = Op.createAnchor(tok.getChar());
1298              ret.next = next;
1299              break;
1300
1301          case Token.RANGE:
1302          case Token.NRANGE:
1303              ret = Op.createRange(tok);
1304              ret.next = next;
1305              break;
1306
1307          case Token.CONCAT:
1308              ret = next;
1309              if (!reverse) {
1310                  for (int i = tok.size()-1; i >= 0; i --) {
1311                      ret = compile(tok.getChild(i), ret, false);
1312                  }
1313              } else {
1314                  for (int i = 0; i < tok.size(); i ++) {
1315                      ret = compile(tok.getChild(i), ret, true);
1316                  }
1317              }
1318              break;
1319
1320          case Token.UNION:
1321              Op.UnionOp uni = Op.createUnion(tok.size());
1322              for (int i = 0; i < tok.size(); i ++) {
1323                  uni.addElement(compile(tok.getChild(i), next, reverse));
1324              }
1325              ret = uni; // ret.next is null.
1326
break;
1327
1328          case Token.CLOSURE:
1329          case Token.NONGREEDYCLOSURE:
1330              Token child = tok.getChild(0);
1331              int min = tok.getMin();
1332              int max = tok.getMax();
1333              if (min >= 0 && min == max) { // {n}
1334
ret = next;
1335                  for (int i = 0; i < min; i ++) {
1336                      ret = compile(child, ret, reverse);
1337                  }
1338                  break;
1339              }
1340              if (min > 0 && max > 0)
1341                  max -= min;
1342              if (max > 0) {
1343                  // X{2,6} -> XX(X(X(XX?)?)?)?
1344
ret = next;
1345                  for (int i = 0; i < max; i ++) {
1346                      Op.ChildOp q = Op.createQuestion(tok.type == Token.NONGREEDYCLOSURE);
1347                      q.next = next;
1348                      q.setChild(compile(child, ret, reverse));
1349                      ret = q;
1350                  }
1351              } else {
1352                  Op.ChildOp op;
1353                  if (tok.type == Token.NONGREEDYCLOSURE) {
1354                      op = Op.createNonGreedyClosure();
1355                  } else { // Token.CLOSURE
1356
if (child.getMinLength() == 0)
1357                          op = Op.createClosure(this.numberOfClosures++);
1358                      else
1359                          op = Op.createClosure(-1);
1360                  }
1361                  op.next = next;
1362                  op.setChild(compile(child, op, reverse));
1363                  ret = op;
1364              }
1365              if (min > 0) {
1366                  for (int i = 0; i < min; i ++) {
1367                      ret = compile(child, ret, reverse);
1368                  }
1369              }
1370              break;
1371
1372          case Token.EMPTY:
1373              ret = next;
1374              break;
1375
1376          case Token.STRING:
1377              ret = Op.createString(tok.getString());
1378              ret.next = next;
1379              break;
1380
1381          case Token.BACKREFERENCE:
1382              ret = Op.createBackReference(tok.getReferenceNumber());
1383              ret.next = next;
1384              break;
1385
1386          case Token.PAREN:
1387              if (tok.getParenNumber() == 0) {
1388                  ret = compile(tok.getChild(0), next, reverse);
1389              } else if (reverse) {
1390                  next = Op.createCapture(tok.getParenNumber(), next);
1391                  next = compile(tok.getChild(0), next, reverse);
1392                  ret = Op.createCapture(-tok.getParenNumber(), next);
1393              } else {
1394                  next = Op.createCapture(-tok.getParenNumber(), next);
1395                  next = compile(tok.getChild(0), next, reverse);
1396                  ret = Op.createCapture(tok.getParenNumber(), next);
1397              }
1398              break;
1399
1400          case Token.LOOKAHEAD:
1401              ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok.getChild(0), null, false));
1402              break;
1403          case Token.NEGATIVELOOKAHEAD:
1404              ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok.getChild(0), null, false));
1405              break;
1406          case Token.LOOKBEHIND:
1407              ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok.getChild(0), null, true));
1408              break;
1409          case Token.NEGATIVELOOKBEHIND:
1410              ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(tok.getChild(0), null, true));
1411              break;
1412
1413          case Token.INDEPENDENT:
1414              ret = Op.createIndependent(next, compile(tok.getChild(0), null, reverse));
1415              break;
1416
1417          case Token.MODIFIERGROUP:
1418              ret = Op.createModifier(next, compile(tok.getChild(0), null, reverse),
1419                                      ((Token.ModifierToken)tok).getOptions(),
1420                                      ((Token.ModifierToken)tok).getOptionsMask());
1421              break;
1422
1423          case Token.CONDITION:
1424              Token.ConditionToken ctok = (Token.ConditionToken)tok;
1425              int ref = ctok.refNumber;
1426              Op condition = ctok.condition == null ? null : compile(ctok.condition, null, reverse);
1427              Op yes = compile(ctok.yes, next, reverse);
1428              Op no = ctok.no == null ? null : compile(ctok.no, next, reverse);
1429              ret = Op.createCondition(next, ref, condition, yes, no);
1430              break;
1431
1432          default:
1433              throw new RuntimeException JavaDoc("Unknown token type: "+tok.type);
1434          } // switch (tok.type)
1435
return ret;
1436      }
1437
1438
1439// Public
1440

1441      /**
1442       * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
1443       *
1444       * @return true if the target is matched to this regular expression.
1445       */

1446      public boolean matches(char[] target) {
1447          return this.matches(target, 0, target .length , (Match)null);
1448      }
1449
1450      /**
1451       * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
1452       * in specified range or not.
1453       *
1454       * @param start Start offset of the range.
1455       * @param end End offset +1 of the range.
1456       * @return true if the target is matched to this regular expression.
1457       */

1458      public boolean matches(char[] target, int start, int end) {
1459          return this.matches(target, start, end, (Match)null);
1460      }
1461
1462      /**
1463       * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
1464       *
1465       * @param match A Match instance for storing matching result.
1466       * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
1467       */

1468      public boolean matches(char[] target, Match match) {
1469          return this.matches(target, 0, target .length , match);
1470      }
1471
1472
1473      /**
1474       * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
1475       * in specified range or not.
1476       *
1477       * @param start Start offset of the range.
1478       * @param end End offset +1 of the range.
1479       * @param match A Match instance for storing matching result.
1480       * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
1481       */

1482      public boolean matches(char[] target, int start, int end, Match match) {
1483
1484          synchronized (this) {
1485              if (this.operations == null)
1486                  this.prepare();
1487              if (this.context == null)
1488                  this.context = new Context();
1489          }
1490          Context con = null;
1491          synchronized (this.context) {
1492              con = this.context.inuse ? new Context() : this.context;
1493              con.reset(target, start, end, this.numberOfClosures);
1494          }
1495          if (match != null) {
1496              match.setNumberOfGroups(this.nofparen);
1497              match.setSource(target);
1498          } else if (this.hasBackReferences) {
1499              match = new Match();
1500              match.setNumberOfGroups(this.nofparen);
1501              // Need not to call setSource() because
1502
// a caller can not access this match instance.
1503
}
1504          con.match = match;
1505
1506          if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
1507              int matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options);
1508              //System.err.println("DEBUG: matchEnd="+matchEnd);
1509
if (matchEnd == con.limit) {
1510                  if (con.match != null) {
1511                      con.match.setBeginning(0, con.start);
1512                      con.match.setEnd(0, matchEnd);
1513                  }
1514                  con.inuse = false;
1515                  return true;
1516              }
1517              return false;
1518          }
1519
1520          /*
1521           * The pattern has only fixed string.
1522           * The engine uses Boyer-Moore.
1523           */

1524          if (this.fixedStringOnly) {
1525              //System.err.println("DEBUG: fixed-only: "+this.fixedString);
1526
int o = this.fixedStringTable.matches(target, con.start, con.limit);
1527              if (o >= 0) {
1528                  if (con.match != null) {
1529                      con.match.setBeginning(0, o);
1530                      con.match.setEnd(0, o+this.fixedString.length());
1531                  }
1532                  con.inuse = false;
1533                  return true;
1534              }
1535              con.inuse = false;
1536              return false;
1537          }
1538
1539          /*
1540           * The pattern contains a fixed string.
1541           * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
1542           * If not, it return with false.
1543           */

1544          if (this.fixedString != null) {
1545              int o = this.fixedStringTable.matches(target, con.start, con.limit);
1546              if (o < 0) {
1547                  //System.err.println("Non-match in fixed-string search.");
1548
con.inuse = false;
1549                  return false;
1550              }
1551          }
1552
1553          int limit = con.limit-this.minlength;
1554          int matchStart;
1555          int matchEnd = -1;
1556
1557          /*
1558           * Checks whether the expression starts with ".*".
1559           */

1560          if (this.operations != null
1561              && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
1562              if (isSet(this.options, SINGLE_LINE)) {
1563                  matchStart = con.start;
1564                  matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options);
1565              } else {
1566                  boolean previousIsEOL = true;
1567                  for (matchStart = con.start; matchStart <= limit; matchStart ++) {
1568                      int ch = target [ matchStart ] ;
1569                      if (isEOLChar(ch)) {
1570                          previousIsEOL = true;
1571                      } else {
1572                          if (previousIsEOL) {
1573                              if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
1574                                                                         matchStart, 1, this.options)))
1575                                  break;
1576                          }
1577                          previousIsEOL = false;
1578                      }
1579                  }
1580              }
1581          }
1582
1583          /*
1584           * Optimization against the first character.
1585           */

1586          else if (this.firstChar != null) {
1587              //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
1588
RangeToken range = this.firstChar;
1589              if (RegularExpression.isSet(this.options, IGNORE_CASE)) {
1590                  range = this.firstChar.getCaseInsensitiveToken();
1591                  for (matchStart = con.start; matchStart <= limit; matchStart ++) {
1592                      int ch = target [ matchStart ] ;
1593                      if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
1594                          ch = REUtil.composeFromSurrogates(ch, target [ matchStart+1 ] );
1595                          if (!range.match(ch)) continue;
1596                      } else {
1597                          if (!range.match(ch)) {
1598                              char ch1 = Character.toUpperCase((char)ch);
1599                              if (!range.match(ch1))
1600                                  if (!range.match(Character.toLowerCase(ch1)))
1601                                      continue;
1602                          }
1603                      }
1604                      if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
1605                                                                 matchStart, 1, this.options)))
1606                          break;
1607                  }
1608              } else {
1609                  for (matchStart = con.start; matchStart <= limit; matchStart ++) {
1610                      int ch = target [ matchStart ] ;
1611                      if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
1612                          ch = REUtil.composeFromSurrogates(ch, target [ matchStart+1 ] );
1613                      if (!range.match(ch)) continue;
1614                      if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
1615                                                                 matchStart, 1, this.options)))
1616                          break;
1617                  }
1618              }
1619          }
1620
1621          /*
1622           * Straightforward matching.
1623           */

1624          else {
1625              for (matchStart = con.start; matchStart <= limit; matchStart ++) {
1626                  if (0 <= (matchEnd = this. matchCharArray (con, this.operations, matchStart, 1, this.options)))
1627                      break;
1628              }
1629          }
1630
1631          if (matchEnd >= 0) {
1632              if (con.match != null) {
1633                  con.match.setBeginning(0, matchStart);
1634                  con.match.setEnd(0, matchEnd);
1635              }
1636              con.inuse = false;
1637              return true;
1638          } else {
1639              con.inuse = false;
1640              return false;
1641          }
1642      }
1643
1644  /**
1645   * @return -1 when not match; offset of the end of matched string when match.
1646   */

1647      private int matchCharArray (Context con, Op op, int offset, int dx, int opts) {
1648
1649          char[] target = con.charTarget;
1650
1651
1652          while (true) {
1653              if (op == null)
1654                  return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
1655              if (offset > con.limit || offset < con.start)
1656                  return -1;
1657              switch (op.type) {
1658              case Op.CHAR:
1659                  if (isSet(opts, IGNORE_CASE)) {
1660                      int ch = op.getData();
1661                      if (dx > 0) {
1662                          if (offset >= con.limit || !matchIgnoreCase(ch, target [ offset ] ))
1663                              return -1;
1664                          offset ++;
1665                      } else {
1666                          int o1 = offset-1;
1667                          if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch, target [ o1 ] ))
1668                              return -1;
1669                          offset = o1;
1670                      }
1671                  } else {
1672                      int ch = op.getData();
1673                      if (dx > 0) {
1674                          if (offset >= con.limit || ch != target [ offset ] )
1675                              return -1;
1676                          offset ++;
1677                      } else {
1678                          int o1 = offset-1;
1679                          if (o1 >= con.limit || o1 < 0 || ch != target [ o1 ] )
1680                              return -1;
1681                          offset = o1;
1682                      }
1683                  }
1684                  op = op.next;
1685                  break;
1686
1687              case Op.DOT:
1688                  if (dx > 0) {
1689                      if (offset >= con.limit)
1690                          return -1;
1691                      int ch = target [ offset ] ;
1692                      if (isSet(opts, SINGLE_LINE)) {
1693                          if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
1694                              offset ++;
1695                      } else {
1696                          if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
1697                              ch = REUtil.composeFromSurrogates(ch, target [ ++offset ] );
1698                          if (isEOLChar(ch))
1699                              return -1;
1700                      }
1701                      offset ++;
1702                  } else {
1703                      int o1 = offset-1;
1704                      if (o1 >= con.limit || o1 < 0)
1705                          return -1;
1706                      int ch = target [ o1 ] ;
1707                      if (isSet(opts, SINGLE_LINE)) {
1708                          if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
1709                              o1 --;
1710                      } else {
1711                          if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
1712                              ch = REUtil.composeFromSurrogates( target [ --o1 ] , ch);
1713                          if (!isEOLChar(ch))
1714                              return -1;
1715                      }
1716                      offset = o1;
1717                  }
1718                  op = op.next;
1719                  break;
1720
1721              case Op.RANGE:
1722              case Op.NRANGE:
1723                  if (dx > 0) {
1724                      if (offset >= con.limit)
1725                          return -1;
1726                      int ch = target [ offset ] ;
1727                      if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
1728                          ch = REUtil.composeFromSurrogates(ch, target [ ++offset ] );
1729                      RangeToken tok = op.getToken();
1730                      if (isSet(opts, IGNORE_CASE)) {
1731                          tok = tok.getCaseInsensitiveToken();
1732                          if (!tok.match(ch)) {
1733                              if (ch >= 0x10000) return -1;
1734                              char uch;
1735                              if (!tok.match(uch = Character.toUpperCase((char)ch))
1736                                  && !tok.match(Character.toLowerCase(uch)))
1737                                  return -1;
1738                          }
1739                      } else {
1740                          if (!tok.match(ch)) return -1;
1741                      }
1742                      offset ++;
1743                  } else {
1744                      int o1 = offset-1;
1745                      if (o1 >= con.limit || o1 < 0)
1746                          return -1;
1747                      int ch = target [ o1 ] ;
1748                      if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
1749                          ch = REUtil.composeFromSurrogates( target [ --o1 ] , ch);
1750                      RangeToken tok = op.getToken();
1751                      if (isSet(opts, IGNORE_CASE)) {
1752                          tok = tok.getCaseInsensitiveToken();
1753                          if (!tok.match(ch)) {
1754                              if (ch >= 0x10000) return -1;
1755                              char uch;
1756                              if (!tok.match(uch = Character.toUpperCase((char)ch))
1757                                  && !tok.match(Character.toLowerCase(uch)))
1758                                  return -1;
1759                          }
1760                      } else {
1761                          if (!tok.match(ch)) return -1;
1762                      }
1763                      offset = o1;
1764                  }
1765                  op = op.next;
1766                  break;
1767
1768              case Op.ANCHOR:
1769                  boolean go = false;
1770                  switch (op.getData()) {
1771                  case '^':
1772                      if (isSet(opts, MULTIPLE_LINES)) {
1773                          if (!(offset == con.start
1774                                || offset > con.start && isEOLChar( target [ offset-1 ] )))
1775                              return -1;
1776                      } else {
1777                          if (offset != con.start)
1778                              return -1;
1779                      }
1780                      break;
1781
1782                  case '@': // Internal use only.
1783
// The @ always matches line beginnings.
1784
if (!(offset == con.start
1785                            || offset > con.start && isEOLChar( target [ offset-1 ] )))
1786                          return -1;
1787                      break;
1788
1789                  case '$':
1790                      if (isSet(opts, MULTIPLE_LINES)) {
1791                          if (!(offset == con.limit
1792                                || offset < con.limit && isEOLChar( target [ offset ] )))
1793                              return -1;
1794                      } else {
1795                          if (!(offset == con.limit
1796                                || offset+1 == con.limit && isEOLChar( target [ offset ] )
1797                                || offset+2 == con.limit && target [ offset ] == CARRIAGE_RETURN
1798                                && target [ offset+1 ] == LINE_FEED))
1799                              return -1;
1800                      }
1801                      break;
1802
1803                  case 'A':
1804                      if (offset != con.start) return -1;
1805                      break;
1806
1807                  case 'Z':
1808                      if (!(offset == con.limit
1809                            || offset+1 == con.limit && isEOLChar( target [ offset ] )
1810                            || offset+2 == con.limit && target [ offset ] == CARRIAGE_RETURN
1811                            && target [ offset+1 ] == LINE_FEED))
1812                          return -1;
1813                      break;
1814
1815                  case 'z':
1816                      if (offset != con.limit) return -1;
1817                      break;
1818
1819                  case 'b':
1820                      if (con.length == 0) return -1;
1821                      {
1822                          int after = getWordType(target, con.start, con.limit, offset, opts);
1823                          if (after == WT_IGNORE) return -1;
1824                          int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
1825                          if (after == before) return -1;
1826                      }
1827                      break;
1828
1829                  case 'B':
1830                      if (con.length == 0)
1831                          go = true;
1832                      else {
1833                          int after = getWordType(target, con.start, con.limit, offset, opts);
1834                          go = after == WT_IGNORE
1835                               || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
1836                      }
1837                      if (!go) return -1;
1838                      break;
1839
1840                  case '<':
1841                      if (con.length == 0 || offset == con.limit) return -1;
1842                      if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
1843                          || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
1844                          return -1;
1845                      break;
1846
1847                  case '>':
1848                      if (con.length == 0 || offset == con.start) return -1;
1849                      if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
1850                          || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
1851                          return -1;
1852                      break;
1853                  } // switch anchor type
1854
op = op.next;
1855                  break;
1856
1857              case Op.BACKREFERENCE:
1858                  {
1859                      int refno = op.getData();
1860                      if (refno <= 0 || refno >= this.nofparen)
1861                          throw new RuntimeException JavaDoc("Internal Error: Reference number must be more than zero: "+refno);
1862                      if (con.match.getBeginning(refno) < 0
1863                          || con.match.getEnd(refno) < 0)
1864                          return -1; // ********
1865
int o2 = con.match.getBeginning(refno);
1866                      int literallen = con.match.getEnd(refno)-o2;
1867                      if (!isSet(opts, IGNORE_CASE)) {
1868                          if (dx > 0) {
1869                              if (!regionMatches(target, offset, con.limit, o2, literallen))
1870                                  return -1;
1871                              offset += literallen;
1872                          } else {
1873                              if (!regionMatches(target, offset-literallen, con.limit, o2, literallen))
1874                                  return -1;
1875                              offset -= literallen;
1876                          }
1877                      } else {
1878                          if (dx > 0) {
1879                              if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen))
1880                                  return -1;
1881                              offset += literallen;
1882                          } else {
1883                              if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
1884                                                           o2, literallen))
1885                                  return -1;
1886                              offset -= literallen;
1887                          }
1888                      }
1889                  }
1890                  op = op.next;
1891                  break;
1892              case Op.STRING:
1893                  {
1894                      String JavaDoc literal = op.getString();
1895                      int literallen = literal.length();
1896                      if (!isSet(opts, IGNORE_CASE)) {
1897                          if (dx > 0) {
1898                              if (!regionMatches(target, offset, con.limit, literal, literallen))
1899                                  return -1;
1900                              offset += literallen;
1901                          } else {
1902                              if (!regionMatches(target, offset-literallen, con.limit, literal, literallen))
1903                                  return -1;
1904                              offset -= literallen;
1905                          }
1906                      } else {
1907                          if (dx > 0) {
1908                              if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen))
1909                                  return -1;
1910                              offset += literallen;
1911                          } else {
1912                              if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
1913                                                           literal, literallen))
1914                                  return -1;
1915                              offset -= literallen;
1916                          }
1917                      }
1918                  }
1919                  op = op.next;
1920                  break;
1921
1922              case Op.CLOSURE:
1923                  {
1924                      /*
1925                       * Saves current position to avoid
1926                       * zero-width repeats.
1927                       */

1928                      int id = op.getData();
1929                      if (id >= 0) {
1930                          int previousOffset = con.offsets[id];
1931                          if (previousOffset < 0 || previousOffset != offset) {
1932                              con.offsets[id] = offset;
1933                          } else {
1934                              con.offsets[id] = -1;
1935                              op = op.next;
1936                              break;
1937                          }
1938                      }
1939
1940                      int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
1941                      if (id >= 0) con.offsets[id] = -1;
1942                      if (ret >= 0) return ret;
1943                      op = op.next;
1944                  }
1945                  break;
1946
1947              case Op.QUESTION:
1948                  {
1949                      int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
1950                      if (ret >= 0) return ret;
1951                      op = op.next;
1952                  }
1953                  break;
1954
1955              case Op.NONGREEDYCLOSURE:
1956              case Op.NONGREEDYQUESTION:
1957                  {
1958                      int ret = this. matchCharArray (con, op.next, offset, dx, opts);
1959                      if (ret >= 0) return ret;
1960                      op = op.getChild();
1961                  }
1962                  break;
1963
1964              case Op.UNION:
1965                  for (int i = 0; i < op.size(); i ++) {
1966                      int ret = this. matchCharArray (con, op.elementAt(i), offset, dx, opts);
1967                      if (DEBUG) {
1968                          System.err.println("UNION: "+i+", ret="+ret);
1969                      }
1970                      if (ret >= 0) return ret;
1971                  }
1972                  return -1;
1973
1974              case Op.CAPTURE:
1975                  int refno = op.getData();
1976                  if (con.match != null && refno > 0) {
1977                      int save = con.match.getBeginning(refno);
1978                      con.match.setBeginning(refno, offset);
1979                      int ret = this. matchCharArray (con, op.next, offset, dx, opts);
1980                      if (ret < 0) con.match.setBeginning(refno, save);
1981                      return ret;
1982                  } else if (con.match != null && refno < 0) {
1983                      int index = -refno;
1984                      int save = con.match.getEnd(index);
1985                      con.match.setEnd(index, offset);
1986                      int ret = this. matchCharArray (con, op.next, offset, dx, opts);
1987                      if (ret < 0) con.match.setEnd(index, save);
1988                      return ret;
1989                  }
1990                  op = op.next;
1991                  break;
1992
1993              case Op.LOOKAHEAD:
1994                  if (0 > this. matchCharArray (con, op.getChild(), offset, 1, opts)) return -1;
1995                  op = op.next;
1996                  break;
1997              case Op.NEGATIVELOOKAHEAD:
1998                  if (0 <= this. matchCharArray (con, op.getChild(), offset, 1, opts)) return -1;
1999                  op = op.next;
2000                  break;
2001              case Op.LOOKBEHIND:
2002                  if (0 > this. matchCharArray (con, op.getChild(), offset, -1, opts)) return -1;
2003                  op = op.next;
2004                  break;
2005              case Op.NEGATIVELOOKBEHIND:
2006                  if (0 <= this. matchCharArray (con, op.getChild(), offset, -1, opts)) return -1;
2007                  op = op.next;
2008                  break;
2009
2010              case Op.INDEPENDENT:
2011                  {
2012                      int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
2013                      if (ret < 0) return ret;
2014                      offset = ret;
2015                      op = op.next;
2016                  }
2017                  break;
2018
2019              case Op.MODIFIER:
2020                  {
2021                      int localopts = opts;
2022                      localopts |= op.getData();
2023                      localopts &= ~op.getData2();
2024                      //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16));
2025
int ret = this. matchCharArray (con, op.getChild(), offset, dx, localopts);
2026                      if (ret < 0) return ret;
2027                      offset = ret;
2028                      op = op.next;
2029                  }
2030                  break;
2031
2032              case Op.CONDITION:
2033                  {
2034                      Op.ConditionOp cop = (Op.ConditionOp)op;
2035                      boolean matchp = false;
2036                      if (cop.refNumber > 0) {
2037                          if (cop.refNumber >= this.nofparen)
2038                              throw new RuntimeException JavaDoc("Internal Error: Reference number must be more than zero: "+cop.refNumber);
2039                          matchp = con.match.getBeginning(cop.refNumber) >= 0
2040                                   && con.match.getEnd(cop.refNumber) >= 0;
2041                      } else {
2042                          matchp = 0 <= this. matchCharArray (con, cop.condition, offset, dx, opts);
2043                      }
2044
2045                      if (matchp) {
2046                          op = cop.yes;
2047                      } else if (cop.no != null) {
2048                          op = cop.no;
2049                      } else {
2050                          op = cop.next;
2051                      }
2052                  }
2053                  break;
2054
2055              default:
2056                  throw new RuntimeException JavaDoc("Unknown operation type: "+op.type);
2057              } // switch (op.type)
2058
} // while
2059
}
2060
2061      private static final int getPreviousWordType(char[] target, int begin, int end,
2062                                                   int offset, int opts) {
2063          int ret = getWordType(target, begin, end, --offset, opts);
2064          while (ret == WT_IGNORE)
2065              ret = getWordType(target, begin, end, --offset, opts);
2066          return ret;
2067      }
2068
2069      private static final int getWordType(char[] target, int begin, int end,
2070                                           int offset, int opts) {
2071          if (offset < begin || offset >= end) return WT_OTHER;
2072          return getWordType0( target [ offset ] , opts);
2073      }
2074
2075
2076
2077      private static final boolean regionMatches(char[] target, int offset, int limit,
2078                                                 String JavaDoc part, int partlen) {
2079          if (offset < 0) return false;
2080          if (limit-offset < partlen)
2081              return false;
2082          int i = 0;
2083          while (partlen-- > 0) {
2084              if ( target [ offset++ ] != part.charAt(i++))
2085                  return false;
2086          }
2087          return true;
2088      }
2089
2090      private static final boolean regionMatches(char[] target, int offset, int limit,
2091                                                 int offset2, int partlen) {
2092          if (offset < 0) return false;
2093          if (limit-offset < partlen)
2094              return false;
2095          int i = offset2;
2096          while (partlen-- > 0) {
2097              if ( target [ offset++ ] != target [ i++ ] )
2098                  return false;
2099          }
2100          return true;
2101      }
2102
2103  /**
2104   * @see java.lang.String#regionMatches
2105   */

2106      private static final boolean regionMatchesIgnoreCase(char[] target, int offset, int limit,
2107                                                           String JavaDoc part, int partlen) {
2108          if (offset < 0) return false;
2109          if (limit-offset < partlen)
2110              return false;
2111          int i = 0;
2112          while (partlen-- > 0) {
2113              char ch1 = target [ offset++ ] ;
2114              char ch2 = part.charAt(i++);
2115              if (ch1 == ch2)
2116                  continue;
2117              char uch1 = Character.toUpperCase(ch1);
2118              char uch2 = Character.toUpperCase(ch2);
2119              if (uch1 == uch2)
2120                  continue;
2121              if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
2122                  return false;
2123          }
2124          return true;
2125      }
2126
2127      private static final boolean regionMatchesIgnoreCase(char[] target, int offset, int limit,
2128                                                           int offset2, int partlen) {
2129          if (offset < 0) return false;
2130          if (limit-offset < partlen)
2131              return false;
2132          int i = offset2;
2133          while (partlen-- > 0) {
2134              char ch1 = target [ offset++ ] ;
2135              char ch2 = target [ i++ ] ;
2136              if (ch1 == ch2)
2137                  continue;
2138              char uch1 = Character.toUpperCase(ch1);
2139              char uch2 = Character.toUpperCase(ch2);
2140              if (uch1 == uch2)
2141                  continue;
2142              if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
2143                  return false;
2144          }
2145          return true;
2146      }
2147
2148
2149
2150
2151      /**
2152       * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
2153       *
2154       * @return true if the target is matched to this regular expression.
2155       */

2156      public boolean matches(String JavaDoc target) {
2157          return this.matches(target, 0, target .length() , (Match)null);
2158      }
2159
2160      /**
2161       * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
2162       * in specified range or not.
2163       *
2164       * @param start Start offset of the range.
2165       * @param end End offset +1 of the range.
2166       * @return true if the target is matched to this regular expression.
2167       */

2168      public boolean matches(String JavaDoc target, int start, int end) {
2169          return this.matches(target, start, end, (Match)null);
2170      }
2171
2172      /**
2173       * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
2174       *
2175       * @param match A Match instance for storing matching result.
2176       * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
2177       */

2178      public boolean matches(String JavaDoc target, Match match) {
2179          return this.matches(target, 0, target .length() , match);
2180      }
2181
2182      /**
2183       * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
2184       * in specified range or not.
2185       *
2186       * @param start Start offset of the range.
2187       * @param end End offset +1 of the range.
2188       * @param match A Match instance for storing matching result.
2189       * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
2190       */

2191      public boolean matches(String JavaDoc target, int start, int end, Match match) {
2192
2193          synchronized (this) {
2194              if (this.operations == null)
2195                  this.prepare();
2196              if (this.context == null)
2197                  this.context = new Context();
2198          }
2199          Context con = null;
2200          synchronized (this.context) {
2201              con = this.context.inuse ? new Context() : this.context;
2202              con.reset(target, start, end, this.numberOfClosures);
2203          }
2204          if (match != null) {
2205              match.setNumberOfGroups(this.nofparen);
2206              match.setSource(target);
2207          } else if (this.hasBackReferences) {
2208              match = new Match();
2209              match.setNumberOfGroups(this.nofparen);
2210              // Need not to call setSource() because
2211
// a caller can not access this match instance.
2212
}
2213          con.match = match;
2214
2215          if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
2216              if (DEBUG) {
2217                  System.err.println("target string="+target);
2218              }
2219              int matchEnd = this. matchString (con, this.operations, con.start, 1, this.options);
2220              if (DEBUG) {
2221                  System.err.println("matchEnd="+matchEnd);
2222                  System.err.println("con.limit="+con.limit);
2223              }
2224              if (matchEnd == con.limit) {
2225                  if (con.match != null) {
2226                      con.match.setBeginning(0, con.start);
2227                      con.match.setEnd(0, matchEnd);
2228                  }
2229                  con.inuse = false;
2230                  return true;
2231              }
2232              return false;
2233          }
2234
2235          /*
2236           * The pattern has only fixed string.
2237           * The engine uses Boyer-Moore.
2238           */

2239          if (this.fixedStringOnly) {
2240              //System.err.println("DEBUG: fixed-only: "+this.fixedString);
2241
int o = this.fixedStringTable.matches(target, con.start, con.limit);
2242              if (o >= 0) {
2243                  if (con.match != null) {
2244                      con.match.setBeginning(0, o);
2245                      con.match.setEnd(0, o+this.fixedString.length());
2246                  }
2247                  con.inuse = false;
2248                  return true;
2249              }
2250              con.inuse = false;
2251              return false;
2252          }
2253
2254          /*
2255           * The pattern contains a fixed string.
2256           * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
2257           * If not, it return with false.
2258           */

2259          if (this.fixedString != null) {
2260              int o = this.fixedStringTable.matches(target, con.start, con.limit);
2261              if (o < 0) {
2262                  //System.err.println("Non-match in fixed-string search.");
2263
con.inuse = false;
2264                  return false;
2265              }
2266          }
2267
2268          int limit = con.limit-this.minlength;
2269          int matchStart;
2270          int matchEnd = -1;
2271
2272          /*
2273           * Checks whether the expression starts with ".*".
2274           */

2275          if (this.operations != null
2276              && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
2277              if (isSet(this.options, SINGLE_LINE)) {
2278                  matchStart = con.start;
2279                  matchEnd = this. matchString (con, this.operations, con.start, 1, this.options);
2280              } else {
2281                  boolean previousIsEOL = true;
2282                  for (matchStart = con.start; matchStart <= limit; matchStart ++) {
2283                      int ch = target .charAt( matchStart ) ;
2284                      if (isEOLChar(ch)) {
2285                          previousIsEOL = true;
2286                      } else {
2287                          if (previousIsEOL) {
2288                              if (0 <= (matchEnd = this. matchString (con, this.operations,
2289                                                                      matchStart, 1, this.options)))
2290                                  break;
2291                          }
2292                          previousIsEOL = false;
2293                      }
2294                  }
2295              }
2296          }
2297
2298          /*
2299           * Optimization against the first character.
2300           */

2301          else if (this.firstChar != null) {
2302              //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
2303
RangeToken range = this.firstChar;
2304              if (RegularExpression.isSet(this.options, IGNORE_CASE)) {
2305                  range = this.firstChar.getCaseInsensitiveToken();
2306                  for (matchStart = con.start; matchStart <= limit; matchStart ++) {
2307                      int ch = target .charAt( matchStart ) ;
2308                      if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
2309                          ch = REUtil.composeFromSurrogates(ch, target .charAt( matchStart+1 ) );
2310                          if (!range.match(ch)) continue;
2311                      } else {
2312                          if (!range.match(ch)) {
2313                              char ch1 = Character.toUpperCase((char)ch);
2314                              if (!range.match(ch1))
2315                                  if (!range.match(Character.toLowerCase(ch1)))
2316                                      continue;
2317                          }
2318                      }
2319                      if (0 <= (matchEnd = this. matchString (con, this.operations,
2320                                                              matchStart, 1, this.options)))
2321                          break;
2322                  }
2323              } else {
2324                  for (matchStart = con.start; matchStart <= limit; matchStart ++) {
2325                      int ch = target .charAt( matchStart ) ;
2326                      if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
2327                          ch = REUtil.composeFromSurrogates(ch, target .charAt( matchStart+1 ) );
2328                      if (!range.match(ch)) continue;
2329                      if (0 <= (matchEnd = this. matchString (con, this.operations,
2330                                                              matchStart, 1, this.options)))
2331                          break;
2332                  }
2333              }
2334          }
2335
2336          /*
2337           * Straightforward matching.
2338           */

2339          else {
2340              for (matchStart = con.start; matchStart <= limit; matchStart ++) {
2341                  if (0 <= (matchEnd = this. matchString (con, this.operations, matchStart, 1, this.options)))
2342                      break;
2343              }
2344          }
2345
2346          if (matchEnd >= 0) {
2347              if (con.match != null) {
2348                  con.match.setBeginning(0, matchStart);
2349                  con.match.setEnd(0, matchEnd);
2350              }
2351              con.inuse = false;
2352              return true;
2353          } else {
2354              con.inuse = false;
2355              return false;
2356          }
2357      }
2358
2359      /**
2360       * @return -1 when not match; offset of the end of matched string when match.
2361       */

2362      private int matchString (Context con, Op op, int offset, int dx, int opts) {
2363
2364
2365
2366
2367          String JavaDoc target = con.strTarget;
2368
2369
2370
2371
2372          while (true) {
2373              if (op == null)
2374                  return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
2375              if (offset > con.limit || offset < con.start)
2376                  return -1;
2377              switch (op.type) {
2378              case Op.CHAR:
2379                  if (isSet(opts, IGNORE_CASE)) {
2380                      int ch = op.getData();
2381                      if (dx > 0) {
2382                          if (offset >= con.limit || !matchIgnoreCase(ch, target .charAt( offset ) ))
2383                              return -1;
2384                          offset ++;
2385                      } else {
2386                          int o1 = offset-1;
2387                          if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch, target .charAt( o1 ) ))
2388                              return -1;
2389                          offset = o1;
2390                      }
2391                  } else {
2392                      int ch = op.getData();
2393                      if (dx > 0) {
2394                          if (offset >= con.limit || ch != target .charAt( offset ) )
2395                              return -1;
2396                          offset ++;
2397                      } else {
2398                          int o1 = offset-1;
2399                          if (o1 >= con.limit || o1 < 0 || ch != target .charAt( o1 ) )
2400                              return -1;
2401                          offset = o1;
2402                      }
2403                  }
2404                  op = op.next;
2405                  break;
2406
2407              case Op.DOT:
2408                  if (dx > 0) {
2409                      if (offset >= con.limit)
2410                          return -1;
2411                      int ch = target .charAt( offset ) ;
2412                      if (isSet(opts, SINGLE_LINE)) {
2413                          if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
2414                              offset ++;
2415                      } else {
2416                          if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
2417                              ch = REUtil.composeFromSurrogates(ch, target .charAt( ++offset ) );
2418                          if (isEOLChar(ch))
2419                              return -1;
2420                      }
2421                      offset ++;
2422                  } else {
2423                      int o1 = offset-1;
2424                      if (o1 >= con.limit || o1 < 0)
2425                          return -1;
2426                      int ch = target .charAt( o1 ) ;
2427                      if (isSet(opts, SINGLE_LINE)) {
2428                          if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
2429                              o1 --;
2430                      } else {
2431                          if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
2432                              ch = REUtil.composeFromSurrogates( target .charAt( --o1 ) , ch);
2433                          if (!isEOLChar(ch))
2434                              return -1;
2435                      }
2436                      offset = o1;
2437                  }
2438                  op = op.next;
2439                  break;
2440
2441              case Op.RANGE:
2442              case Op.NRANGE:
2443                  if (dx > 0) {
2444                      if (offset >= con.limit)
2445                          return -1;
2446                      int ch = target .charAt( offset ) ;
2447                      if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
2448                          ch = REUtil.composeFromSurrogates(ch, target .charAt( ++offset ) );
2449                      RangeToken tok = op.getToken();
2450                      if (isSet(opts, IGNORE_CASE)) {
2451                          tok = tok.getCaseInsensitiveToken();
2452                          if (!tok.match(ch)) {
2453                              if (ch >= 0x10000) return -1;
2454                              char uch;
2455                              if (!tok.match(uch = Character.toUpperCase((char)ch))
2456                                  && !tok.match(Character.toLowerCase(uch)))
2457                                  return -1;
2458                          }
2459                      } else {
2460                          if (!tok.match(ch)) return -1;
2461                      }
2462                      offset ++;
2463                  } else {
2464                      int o1 = offset-1;
2465                      if (o1 >= con.limit || o1 < 0)
2466                          return -1;
2467                      int ch = target .charAt( o1 ) ;
2468                      if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
2469                          ch = REUtil.composeFromSurrogates( target .charAt( --o1 ) , ch);
2470                      RangeToken tok = op.getToken();
2471                      if (isSet(opts, IGNORE_CASE)) {
2472                          tok = tok.getCaseInsensitiveToken();
2473                          if (!tok.match(ch)) {
2474                              if (ch >= 0x10000) return -1;
2475                              char uch;
2476                              if (!tok.match(uch = Character.toUpperCase((char)ch))
2477                                  && !tok.match(Character.toLowerCase(uch)))
2478                                  return -1;
2479                          }
2480                      } else {
2481                          if (!tok.match(ch)) return -1;
2482                      }
2483                      offset = o1;
2484                  }
2485                  op = op.next;
2486                  break;
2487
2488              case Op.ANCHOR:
2489                  boolean go = false;
2490                  switch (op.getData()) {
2491                  case '^':
2492                      if (isSet(opts, MULTIPLE_LINES)) {
2493                          if (!(offset == con.start
2494                                || offset > con.start && isEOLChar( target .charAt( offset-1 ) )))
2495                              return -1;
2496                      } else {
2497                          if (offset != con.start)
2498                              return -1;
2499                      }
2500                      break;
2501
2502                  case '@': // Internal use only.
2503
// The @ always matches line beginnings.
2504
if (!(offset == con.start
2505                            || offset > con.start && isEOLChar( target .charAt( offset-1 ) )))
2506                          return -1;
2507                      break;
2508
2509                  case '$':
2510                      if (isSet(opts, MULTIPLE_LINES)) {
2511                          if (!(offset == con.limit
2512                                || offset < con.limit && isEOLChar( target .charAt( offset ) )))
2513                              return -1;
2514                      } else {
2515                          if (!(offset == con.limit
2516                                || offset+1 == con.limit && isEOLChar( target .charAt( offset ) )
2517                                || offset+2 == con.limit && target .charAt( offset ) == CARRIAGE_RETURN
2518                                && target .charAt( offset+1 ) == LINE_FEED))
2519                              return -1;
2520                      }
2521                      break;
2522
2523                  case 'A':
2524                      if (offset != con.start) return -1;
2525                      break;
2526
2527                  case 'Z':
2528                      if (!(offset == con.limit
2529                            || offset+1 == con.limit && isEOLChar( target .charAt( offset ) )
2530                            || offset+2 == con.limit && target .charAt( offset ) == CARRIAGE_RETURN
2531                            && target .charAt( offset+1 ) == LINE_FEED))
2532                          return -1;
2533                      break;
2534
2535                  case 'z':
2536                      if (offset != con.limit) return -1;
2537                      break;
2538
2539                  case 'b':
2540                      if (con.length == 0) return -1;
2541                      {
2542                          int after = getWordType(target, con.start, con.limit, offset, opts);
2543                          if (after == WT_IGNORE) return -1;
2544                          int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
2545                          if (after == before) return -1;
2546                      }
2547                      break;
2548
2549                  case 'B':
2550                      if (con.length == 0)
2551                          go = true;
2552                      else {
2553                          int after = getWordType(target, con.start, con.limit, offset, opts);
2554                          go = after == WT_IGNORE
2555                               || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
2556                      }
2557                      if (!go) return -1;
2558                      break;
2559
2560                  case '<':
2561                      if (con.length == 0 || offset == con.limit) return -1;
2562                      if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
2563                          || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
2564                          return -1;
2565                      break;
2566
2567                  case '>':
2568                      if (con.length == 0 || offset == con.start) return -1;
2569                      if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
2570                          || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
2571                          return -1;
2572                      break;
2573                  } // switch anchor type
2574
op = op.next;
2575                  break;
2576
2577              case Op.BACKREFERENCE:
2578                  {
2579                      int refno = op.getData();
2580                      if (refno <= 0 || refno >= this.nofparen)
2581                          throw new RuntimeException JavaDoc("Internal Error: Reference number must be more than zero: "+refno);
2582                      if (con.match.getBeginning(refno) < 0
2583                          || con.match.getEnd(refno) < 0)
2584                          return -1; // ********
2585
int o2 = con.match.getBeginning(refno);
2586                      int literallen = con.match.getEnd(refno)-o2;
2587                      if (!isSet(opts, IGNORE_CASE)) {
2588                          if (dx > 0) {
2589                              if (!regionMatches(target, offset, con.limit, o2, literallen))
2590                                  return -1;
2591                              offset += literallen;
2592                          } else {
2593                              if (!regionMatches(target, offset-literallen, con.limit, o2, literallen))
2594                                  return -1;
2595                              offset -= literallen;
2596                          }
2597                      } else {
2598                          if (dx > 0) {
2599                              if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen))
2600                                  return -1;
2601                              offset += literallen;
2602                          } else {
2603                              if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
2604                                                           o2, literallen))
2605                                  return -1;
2606                              offset -= literallen;
2607                          }
2608                      }
2609                  }
2610                  op = op.next;
2611                  break;
2612              case Op.STRING:
2613                  {
2614                      String JavaDoc literal = op.getString();
2615                      int literallen = literal.length();
2616                      if (!isSet(opts, IGNORE_CASE)) {
2617                          if (dx > 0) {
2618                              if (!regionMatches(target, offset, con.limit, literal, literallen))
2619                                  return -1;
2620                              offset += literallen;
2621                          } else {
2622                              if (!regionMatches(target, offset-literallen, con.limit, literal, literallen))
2623                                  return -1;
2624                              offset -= literallen;
2625                          }
2626                      } else {
2627                          if (dx > 0) {
2628                              if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen))
2629                                  return -1;
2630                              offset += literallen;
2631                          } else {
2632                              if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
2633                                                           literal, literallen))
2634                                  return -1;
2635                              offset -= literallen;
2636                          }
2637                      }
2638                  }
2639                  op = op.next;
2640                  break;
2641
2642              case Op.CLOSURE:
2643                  {
2644                      /*
2645                       * Saves current position to avoid
2646                       * zero-width repeats.
2647                       */

2648                      int id = op.getData();
2649                      if (id >= 0) {
2650                          int previousOffset = con.offsets[id];
2651                          if (previousOffset < 0 || previousOffset != offset) {
2652                              con.offsets[id] = offset;
2653                          } else {
2654                              con.offsets[id] = -1;
2655                              op = op.next;
2656                              break;
2657                          }
2658                      }
2659                      int ret = this. matchString (con, op.getChild(), offset, dx, opts);
2660                      if (id >= 0) con.offsets[id] = -1;
2661                      if (ret >= 0) return ret;
2662                      op = op.next;
2663                  }
2664                  break;
2665
2666              case Op.QUESTION:
2667                  {
2668                      int ret = this. matchString (con, op.getChild(), offset, dx, opts);
2669                      if (ret >= 0) return ret;
2670                      op = op.next;
2671                  }
2672                  break;
2673
2674              case Op.NONGREEDYCLOSURE:
2675              case Op.NONGREEDYQUESTION:
2676                  {
2677                      int ret = this. matchString (con, op.next, offset, dx, opts);
2678                      if (ret >= 0) return ret;
2679                      op = op.getChild();
2680                  }
2681                  break;
2682
2683              case Op.UNION:
2684                  for (int i = 0; i < op.size(); i ++) {
2685                      int ret = this. matchString (con, op.elementAt(i), offset, dx, opts);
2686                      if (DEBUG) {
2687                          System.err.println("UNION: "+i+", ret="+ret);
2688                      }
2689                      if (ret >= 0) return ret;
2690                  }
2691                  return -1;
2692
2693              case Op.CAPTURE:
2694                  int refno = op.getData();
2695                  if (con.match != null && refno > 0) {
2696                      int save = con.match.getBeginning(refno);
2697                      con.match.setBeginning(refno, offset);
2698                      int ret = this. matchString (con, op.next, offset, dx, opts);
2699                      if (ret < 0) con.match.setBeginning(refno, save);
2700                      return ret;
2701                  } else if (con.match != null && refno < 0) {
2702                      int index = -refno;
2703                      int save = con.match.getEnd(index);
2704                      con.match.setEnd(index, offset);
2705                      int ret = this. matchString (con, op.next, offset, dx, opts);
2706                      if (ret < 0) con.match.setEnd(index, save);
2707                      return ret;
2708                  }
2709                  op = op.next;
2710                  break;
2711
2712              case Op.LOOKAHEAD:
2713                  if (0 > this. matchString (con, op.getChild(), offset, 1, opts)) return -1;
2714                  op = op.next;
2715                  break;
2716              case Op.NEGATIVELOOKAHEAD:
2717                  if (0 <= this. matchString (con, op.getChild(), offset, 1, opts)) return -1;
2718                  op = op.next;
2719                  break;
2720              case Op.LOOKBEHIND:
2721                  if (0 > this. matchString (con, op.getChild(), offset, -1, opts)) return -1;
2722                  op = op.next;
2723                  break;
2724              case Op.NEGATIVELOOKBEHIND:
2725                  if (0 <= this. matchString (con, op.getChild(), offset, -1, opts)) return -1;
2726                  op = op.next;
2727                  break;
2728
2729              case Op.INDEPENDENT:
2730                  {
2731                      int ret = this. matchString (con, op.getChild(), offset, dx, opts);
2732                      if (ret < 0) return ret;
2733                      offset = ret;
2734                      op = op.next;
2735                  }
2736                  break;
2737
2738              case Op.MODIFIER:
2739                  {
2740                      int localopts = opts;
2741                      localopts |= op.getData();
2742                      localopts &= ~op.getData2();
2743                      //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16));
2744
int ret = this. matchString (con, op.getChild(), offset, dx, localopts);
2745                      if (ret < 0) return ret;
2746                      offset = ret;
2747                      op = op.next;
2748                  }
2749                  break;
2750
2751              case Op.CONDITION:
2752                  {
2753                      Op.ConditionOp cop = (Op.ConditionOp)op;
2754                      boolean matchp = false;
2755                      if (cop.refNumber > 0) {
2756                          if (cop.refNumber >= this.nofparen)
2757                              throw new RuntimeException JavaDoc("Internal Error: Reference number must be more than zero: "+cop.refNumber);
2758                          matchp = con.match.getBeginning(cop.refNumber) >= 0
2759                                   && con.match.getEnd(cop.refNumber) >= 0;
2760                      } else {
2761                          matchp = 0 <= this. matchString (con, cop.condition, offset, dx, opts);
2762                      }
2763
2764                      if (matchp) {
2765                          op = cop.yes;
2766                      } else if (cop.no != null) {
2767                          op = cop.no;
2768                      } else {
2769                          op = cop.next;
2770                      }
2771                  }
2772                  break;
2773
2774              default:
2775                  throw new RuntimeException JavaDoc("Unknown operation type: "+op.type);
2776              } // switch (op.type)
2777
} // while
2778
}
2779
2780      private static final int getPreviousWordType(String JavaDoc target, int begin, int end,
2781                                                   int offset, int opts) {
2782          int ret = getWordType(target, begin, end, --offset, opts);
2783          while (ret == WT_IGNORE)
2784              ret = getWordType(target, begin, end, --offset, opts);
2785          return ret;
2786      }
2787
2788      private static final int getWordType(String JavaDoc target, int begin, int end,
2789                                           int offset, int opts) {
2790          if (offset < begin || offset >= end) return WT_OTHER;
2791          return getWordType0( target .charAt( offset ) , opts);
2792      }
2793
2794
2795      private static final boolean regionMatches(String JavaDoc text, int offset, int limit,
2796                                                 String JavaDoc part, int partlen) {
2797          if (limit-offset < partlen) return false;
2798          return text.regionMatches(offset, part, 0, partlen);
2799      }
2800
2801      private static final boolean regionMatches(String JavaDoc text, int offset, int limit,
2802                                                 int offset2, int partlen) {
2803          if (limit-offset < partlen) return false;
2804          return text.regionMatches(offset, text, offset2, partlen);
2805      }
2806
2807      private static final boolean regionMatchesIgnoreCase(String JavaDoc text, int offset, int limit,
2808                                                           String JavaDoc part, int partlen) {
2809          return text.regionMatches(true, offset, part, 0, partlen);
2810      }
2811
2812      private static final boolean regionMatchesIgnoreCase(String JavaDoc text, int offset, int limit,
2813                                                           int offset2, int partlen) {
2814          if (limit-offset < partlen) return false;
2815          return text.regionMatches(true, offset, text, offset2, partlen);
2816      }
2817
2818
2819
2820
2821
2822
2823
2824      /**
2825       * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
2826       *
2827       * @return true if the target is matched to this regular expression.
2828       */

2829      public boolean matches(CharacterIterator JavaDoc target) {
2830          return this.matches(target, (Match)null);
2831      }
2832
2833
2834      /**
2835       * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
2836       *
2837       * @param match A Match instance for storing matching result.
2838       * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
2839       */

2840      public boolean matches(CharacterIterator JavaDoc target, Match match) {
2841          int start = target.getBeginIndex();
2842          int end = target.getEndIndex();
2843
2844
2845
2846          synchronized (this) {
2847              if (this.operations == null)
2848                  this.prepare();
2849              if (this.context == null)
2850                  this.context = new Context();
2851          }
2852          Context con = null;
2853          synchronized (this.context) {
2854              con = this.context.inuse ? new Context() : this.context;
2855              con.reset(target, start, end, this.numberOfClosures);
2856          }
2857          if (match != null) {
2858              match.setNumberOfGroups(this.nofparen);
2859              match.setSource(target);
2860          } else if (this.hasBackReferences) {
2861              match = new Match();
2862              match.setNumberOfGroups(this.nofparen);
2863              // Need not to call setSource() because
2864
// a caller can not access this match instance.
2865
}
2866          con.match = match;
2867
2868          if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
2869              int matchEnd = this. matchCharacterIterator (con, this.operations, con.start, 1, this.options);
2870              //System.err.println("DEBUG: matchEnd="+matchEnd);
2871
if (matchEnd == con.limit) {
2872                  if (con.match != null) {
2873                      con.match.setBeginning(0, con.start);
2874                      con.match.setEnd(0, matchEnd);
2875                  }
2876                  con.inuse = false;
2877                  return true;
2878              }
2879              return false;
2880          }
2881
2882          /*
2883           * The pattern has only fixed string.
2884           * The engine uses Boyer-Moore.
2885           */

2886          if (this.fixedStringOnly) {
2887              //System.err.println("DEBUG: fixed-only: "+this.fixedString);
2888
int o = this.fixedStringTable.matches(target, con.start, con.limit);
2889              if (o >= 0) {
2890                  if (con.match != null) {
2891                      con.match.setBeginning(0, o);
2892                      con.match.setEnd(0, o+this.fixedString.length());
2893                  }
2894                  con.inuse = false;
2895                  return true;
2896              }
2897              con.inuse = false;
2898              return false;
2899          }
2900
2901          /*
2902           * The pattern contains a fixed string.
2903           * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
2904           * If not, it return with false.
2905           */

2906          if (this.fixedString != null) {
2907              int o = this.fixedStringTable.matches(target, con.start, con.limit);
2908              if (o < 0) {
2909                  //System.err.println("Non-match in fixed-string search.");
2910
con.inuse = false;
2911                  return false;
2912              }
2913          }
2914
2915          int limit = con.limit-this.minlength;
2916          int matchStart;
2917          int matchEnd = -1;
2918
2919          /*
2920           * Checks whether the expression starts with ".*".
2921           */

2922          if (this.operations != null
2923              && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
2924              if (isSet(this.options, SINGLE_LINE)) {
2925                  matchStart = con.start;
2926                  matchEnd = this. matchCharacterIterator (con, this.operations, con.start, 1, this.options);
2927              } else {
2928                  boolean previousIsEOL = true;
2929                  for (matchStart = con.start; matchStart <= limit; matchStart ++) {
2930                      int ch = target .setIndex( matchStart ) ;
2931                      if (isEOLChar(ch)) {
2932                          previousIsEOL = true;
2933                      } else {
2934                          if (previousIsEOL) {
2935                              if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations,
2936                                                                                 matchStart, 1, this.options)))
2937                                  break;
2938                          }
2939                          previousIsEOL = false;
2940                      }
2941                  }
2942              }
2943          }
2944
2945          /*
2946           * Optimization against the first character.
2947           */

2948          else if (this.firstChar != null) {
2949              //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
2950
RangeToken range = this.firstChar;
2951              if (RegularExpression.isSet(this.options, IGNORE_CASE)) {
2952                  range = this.firstChar.getCaseInsensitiveToken();
2953                  for (matchStart = con.start; matchStart <= limit; matchStart ++) {
2954                      int ch = target .setIndex( matchStart ) ;
2955                      if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
2956                          ch = REUtil.composeFromSurrogates(ch, target .setIndex( matchStart+1 ) );
2957                          if (!range.match(ch)) continue;
2958                      } else {
2959                          if (!range.match(ch)) {
2960                              char ch1 = Character.toUpperCase((char)ch);
2961                              if (!range.match(ch1))
2962                                  if (!range.match(Character.toLowerCase(ch1)))
2963                                      continue;
2964                          }
2965                      }
2966                      if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations,
2967                                                                         matchStart, 1, this.options)))
2968                          break;
2969                  }
2970              } else {
2971                  for (matchStart = con.start; matchStart <= limit; matchStart ++) {
2972                      int ch = target .setIndex( matchStart ) ;
2973                      if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
2974                          ch = REUtil.composeFromSurrogates(ch, target .setIndex( matchStart+1 ) );
2975                      if (!range.match(ch)) continue;
2976                      if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations,
2977                                                                         matchStart, 1, this.options)))
2978                          break;
2979                  }
2980              }
2981          }
2982
2983          /*
2984           * Straightforward matching.
2985           */

2986          else {
2987              for (matchStart = con.start; matchStart <= limit; matchStart ++) {
2988                  if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations, matchStart, 1, this.options)))
2989                      break;
2990              }
2991          }
2992
2993          if (matchEnd >= 0) {
2994              if (con.match != null) {
2995                  con.match.setBeginning(0, matchStart);
2996                  con.match.setEnd(0, matchEnd);
2997              }
2998              con.inuse = false;
2999              return true;
3000          } else {
3001              con.inuse = false;
3002              return false;
3003          }
3004      }
3005
3006      /**
3007       * @return -1 when not match; offset of the end of matched string when match.
3008       */

3009      private int matchCharacterIterator (Context con, Op op, int offset, int dx, int opts) {
3010
3011
3012          CharacterIterator JavaDoc target = con.ciTarget;
3013
3014
3015
3016
3017
3018
3019          while (true) {
3020              if (op == null)
3021                  return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
3022              if (offset > con.limit || offset < con.start)
3023                  return -1;
3024              switch (op.type) {
3025              case Op.CHAR:
3026                  if (isSet(opts, IGNORE_CASE)) {
3027                      int ch = op.getData();
3028                      if (dx > 0) {
3029                          if (offset >= con.limit || !matchIgnoreCase(ch, target .setIndex( offset ) ))
3030                              return -1;
3031                          offset ++;
3032                      } else {
3033                          int o1 = offset-1;
3034                          if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch, target .setIndex( o1 ) ))
3035                              return -1;
3036                          offset = o1;
3037                      }
3038                  } else {
3039                      int ch = op.getData();
3040                      if (dx > 0) {
3041                          if (offset >= con.limit || ch != target .setIndex( offset ) )
3042                              return -1;
3043                          offset ++;
3044                      } else {
3045                          int o1 = offset-1;
3046                          if (o1 >= con.limit || o1 < 0 || ch != target .setIndex( o1 ) )
3047                              return -1;
3048                          offset = o1;
3049                      }
3050                  }
3051                  op = op.next;
3052                  break;
3053
3054              case Op.DOT:
3055                  if (dx > 0) {
3056                      if (offset >= con.limit)
3057                          return -1;
3058                      int ch = target .setIndex( offset ) ;
3059                      if (isSet(opts, SINGLE_LINE)) {
3060                          if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
3061                              offset ++;
3062                      } else {
3063                          if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
3064                              ch = REUtil.composeFromSurrogates(ch, target .setIndex( ++offset ) );
3065                          if (isEOLChar(ch))
3066                              return -1;
3067                      }
3068                      offset ++;
3069                  } else {
3070                      int o1 = offset-1;
3071                      if (o1 >= con.limit || o1 < 0)
3072                          return -1;
3073                      int ch = target .setIndex( o1 ) ;
3074                      if (isSet(opts, SINGLE_LINE)) {
3075                          if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
3076                              o1 --;
3077                      } else {
3078                          if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
3079                              ch = REUtil.composeFromSurrogates( target .setIndex( --o1 ) , ch);
3080                          if (!isEOLChar(ch))
3081                              return -1;
3082                      }
3083                      offset = o1;
3084                  }
3085                  op = op.next;
3086                  break;
3087
3088              case Op.RANGE:
3089              case Op.NRANGE:
3090                  if (dx > 0) {
3091                      if (offset >= con.limit)
3092                          return -1;
3093                      int ch = target .setIndex( offset ) ;
3094                      if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
3095                          ch = REUtil.composeFromSurrogates(ch, target .setIndex( ++offset ) );
3096                      RangeToken tok = op.getToken();
3097                      if (isSet(opts, IGNORE_CASE)) {
3098                          tok = tok.getCaseInsensitiveToken();
3099                          if (!tok.match(ch)) {
3100                              if (ch >= 0x10000) return -1;
3101                              char uch;
3102                              if (!tok.match(uch = Character.toUpperCase((char)ch))
3103                                  && !tok.match(Character.toLowerCase(uch)))
3104                                  return -1;
3105                          }
3106                      } else {
3107                          if (!tok.match(ch)) return -1;
3108                      }
3109                      offset ++;
3110                  } else {
3111                      int o1 = offset-1;
3112                      if (o1 >= con.limit || o1 < 0)
3113                          return -1;
3114                      int ch = target .setIndex( o1 ) ;
3115                      if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
3116                          ch = REUtil.composeFromSurrogates( target .setIndex( --o1 ) , ch);
3117                      RangeToken tok = op.getToken();
3118                      if (isSet(opts, IGNORE_CASE)) {
3119                          tok = tok.getCaseInsensitiveToken();
3120                          if (!tok.match(ch)) {
3121                              if (ch >= 0x10000) return -1;
3122                              char uch;
3123                              if (!tok.match(uch = Character.toUpperCase((char)ch))
3124                                  && !tok.match(Character.toLowerCase(uch)))
3125                                  return -1;
3126                          }
3127                      } else {
3128                          if (!tok.match(ch)) return -1;
3129                      }
3130                      offset = o1;
3131                  }
3132                  op = op.next;
3133                  break;
3134
3135              case Op.ANCHOR:
3136                  boolean go = false;
3137                  switch (op.getData()) {
3138                  case '^':
3139                      if (isSet(opts, MULTIPLE_LINES)) {
3140                          if (!(offset == con.start
3141                                || offset > con.start && isEOLChar( target .setIndex( offset-1 ) )))
3142                              return -1;
3143                      } else {
3144                          if (offset != con.start)
3145                              return -1;
3146                      }
3147                      break;
3148
3149                  case '@': // Internal use only.
3150
// The @ always matches line beginnings.
3151
if (!(offset == con.start
3152                            || offset > con.start && isEOLChar( target .setIndex( offset-1 ) )))
3153                          return -1;
3154                      break;
3155
3156                  case '$':
3157                      if (isSet(opts, MULTIPLE_LINES)) {
3158                          if (!(offset == con.limit
3159                                || offset < con.limit && isEOLChar( target .setIndex( offset ) )))
3160                              return -1;
3161                      } else {
3162                          if (!(offset == con.limit
3163                                || offset+1 == con.limit && isEOLChar( target .setIndex( offset ) )
3164                                || offset+2 == con.limit && target .setIndex( offset ) == CARRIAGE_RETURN
3165                                && target .setIndex( offset+1 ) == LINE_FEED))
3166                              return -1;
3167                      }
3168                      break;
3169
3170                  case 'A':
3171                      if (offset != con.start) return -1;
3172                      break;
3173
3174                  case 'Z':
3175                      if (!(offset == con.limit
3176                            || offset+1 == con.limit && isEOLChar( target .setIndex( offset ) )
3177                            || offset+2 == con.limit && target .setIndex( offset ) == CARRIAGE_RETURN
3178                            && target .setIndex( offset+1 ) == LINE_FEED))
3179                          return -1;
3180                      break;
3181
3182                  case 'z':
3183                      if (offset != con.limit) return -1;
3184                      break;
3185
3186                  case 'b':
3187                      if (con.length == 0) return -1;
3188                      {
3189                          int after = getWordType(target, con.start, con.limit, offset, opts);
3190                          if (after == WT_IGNORE) return -1;
3191                          int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
3192                          if (after == before) return -1;
3193                      }
3194                      break;
3195
3196                  case 'B':
3197                      if (con.length == 0)
3198                          go = true;
3199                      else {
3200                          int after = getWordType(target, con.start, con.limit, offset, opts);
3201                          go = after == WT_IGNORE
3202                               || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
3203                      }
3204                      if (!go) return -1;
3205                      break;
3206
3207                  case '<':
3208                      if (con.length == 0 || offset == con.limit) return -1;
3209                      if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
3210                          || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
3211                          return -1;
3212                      break;
3213
3214                  case '>':
3215                      if (con.length == 0 || offset == con.start) return -1;
3216                      if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
3217                          || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
3218                          return -1;
3219                      break;
3220                  } // switch anchor type
3221
op = op.next;
3222                  break;
3223
3224              case Op.BACKREFERENCE:
3225                  {
3226                      int refno = op.getData();
3227                      if (refno <= 0 || refno >= this.nofparen)
3228                          throw new RuntimeException JavaDoc("Internal Error: Reference number must be more than zero: "+refno);
3229                      if (con.match.getBeginning(refno) < 0
3230                          || con.match.getEnd(refno) < 0)
3231                          return -1; // ********
3232
int o2 = con.match.getBeginning(refno);
3233                      int literallen = con.match.getEnd(refno)-o2;
3234                      if (!isSet(opts, IGNORE_CASE)) {
3235                          if (dx > 0) {
3236                              if (!regionMatches(target, offset, con.limit, o2, literallen))
3237                                  return -1;
3238                              offset += literallen;
3239                          } else {
3240                              if (!regionMatches(target, offset-literallen, con.limit, o2, literallen))
3241                                  return -1;
3242                              offset -= literallen;
3243                          }
3244                      } else {
3245                          if (dx > 0) {
3246                              if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen))
3247                                  return -1;
3248                              offset += literallen;
3249                          } else {
3250                              if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
3251                                                           o2, literallen))
3252                                  return -1;
3253                              offset -= literallen;
3254                          }
3255                      }
3256                  }
3257                  op = op.next;
3258                  break;
3259              case Op.STRING:
3260                  {
3261                      String JavaDoc literal = op.getString();
3262                      int literallen = literal.length();
3263                      if (!isSet(opts, IGNORE_CASE)) {
3264                          if (dx > 0) {
3265                              if (!regionMatches(target, offset, con.limit, literal, literallen))
3266                                  return -1;
3267                              offset += literallen;
3268                          } else {
3269                              if (!regionMatches(target, offset-literallen, con.limit, literal, literallen))
3270                                  return -1;
3271                              offset -= literallen;
3272                          }
3273                      } else {
3274                          if (dx > 0) {
3275                              if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen))
3276                                  return -1;
3277                              offset += literallen;
3278                          } else {
3279                              if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
3280                                                           literal, literallen))
3281                                  return -1;
3282                              offset -= literallen;
3283                          }
3284                      }
3285                  }
3286                  op = op.next;
3287                  break;
3288
3289              case Op.CLOSURE:
3290                  {
3291                      /*
3292                       * Saves current position to avoid
3293                       * zero-width repeats.
3294                       */

3295                      int id = op.getData();
3296                      if (id >= 0) {
3297                          int previousOffset = con.offsets[id];
3298                          if (previousOffset < 0 || previousOffset != offset) {
3299                              con.offsets[id] = offset;
3300                          } else {
3301                              con.offsets[id] = -1;
3302                              op = op.next;
3303                              break;
3304                          }
3305                      }
3306                      
3307                      int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts);
3308                      if (id >= 0) con.offsets[id] = -1;
3309                      if (ret >= 0) return ret;
3310                      op = op.next;
3311                  }
3312                  break;
3313
3314              case Op.QUESTION:
3315                  {
3316                      int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts);
3317                      if (ret >= 0) return ret;
3318                      op = op.next;
3319                  }
3320                  break;
3321
3322              case Op.NONGREEDYCLOSURE:
3323              case Op.NONGREEDYQUESTION:
3324                  {
3325                      int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts);
3326                      if (ret >= 0) return ret;
3327                      op = op.getChild();
3328                  }
3329                  break;
3330
3331              case Op.UNION:
3332                  for (int i = 0; i < op.size(); i ++) {
3333                      int ret = this. matchCharacterIterator (con, op.elementAt(i), offset, dx, opts);
3334                      if (DEBUG) {
3335                          System.err.println("UNION: "+i+", ret="+ret);
3336                      }
3337                      if (ret >= 0) return ret;
3338                  }
3339                  return -1;
3340
3341              case Op.CAPTURE:
3342                  int refno = op.getData();
3343                  if (con.match != null && refno > 0) {
3344                      int save = con.match.getBeginning(refno);
3345                      con.match.setBeginning(refno, offset);
3346                      int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts);
3347                      if (ret < 0) con.match.setBeginning(refno, save);
3348                      return ret;
3349                  } else if (con.match != null && refno < 0) {
3350                      int index = -refno;
3351                      int save = con.match.getEnd(index);
3352                      con.match.setEnd(index, offset);
3353                      int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts);
3354                      if (ret < 0) con.match.setEnd(index, save);
3355                      return ret;
3356                  }
3357                  op = op.next;
3358                  break;
3359
3360              case Op.LOOKAHEAD:
3361                  if (0 > this. matchCharacterIterator (con, op.getChild(), offset, 1, opts)) return -1;
3362                  op = op.next;
3363                  break;
3364              case Op.NEGATIVELOOKAHEAD:
3365                  if (0 <= this. matchCharacterIterator (con, op.getChild(), offset, 1, opts)) return -1;
3366                  op = op.next;
3367                  break;
3368              case Op.LOOKBEHIND:
3369                  if (0 > this. matchCharacterIterator (con, op.getChild(), offset, -1, opts)) return -1;
3370                  op = op.next;
3371                  break;
3372              case Op.NEGATIVELOOKBEHIND:
3373                  if (0 <= this. matchCharacterIterator (con, op.getChild(), offset, -1, opts)) return -1;
3374                  op = op.next;
3375                  break;
3376
3377              case Op.INDEPENDENT:
3378                  {
3379                      int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts);
3380                      if (ret < 0) return ret;
3381                      offset = ret;
3382                      op = op.next;
3383                  }
3384                  break;
3385
3386              case Op.MODIFIER:
3387                  {
3388                      int localopts = opts;
3389                      localopts |= op.getData();
3390                      localopts &= ~op.getData2();
3391                      //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16));
3392
int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, localopts);
3393                      if (ret < 0) return ret;
3394                      offset = ret;
3395                      op = op.next;
3396                  }
3397                  break;
3398
3399              case Op.CONDITION:
3400                  {
3401                      Op.ConditionOp cop = (Op.ConditionOp)op;
3402                      boolean matchp = false;
3403                      if (cop.refNumber > 0) {
3404                          if (cop.refNumber >= this.nofparen)
3405                              throw new RuntimeException JavaDoc("Internal Error: Reference number must be more than zero: "+cop.refNumber);
3406                          matchp = con.match.getBeginning(cop.refNumber) >= 0
3407                                   && con.match.getEnd(cop.refNumber) >= 0;
3408                      } else {
3409                          matchp = 0 <= this. matchCharacterIterator (con, cop.condition, offset, dx, opts);
3410                      }
3411
3412                      if (matchp) {
3413                          op = cop.yes;
3414                      } else if (cop.no != null) {
3415                          op = cop.no;
3416                      } else {
3417                          op = cop.next;
3418                      }
3419                  }
3420                  break;
3421
3422              default:
3423                  throw new RuntimeException JavaDoc("Unknown operation type: "+op.type);
3424              } // switch (op.type)
3425
} // while
3426
}
3427
3428      private static final int getPreviousWordType(CharacterIterator JavaDoc target, int begin, int end,
3429                                                   int offset, int opts) {
3430          int ret = getWordType(target, begin, end, --offset, opts);
3431          while (ret == WT_IGNORE)
3432              ret = getWordType(target, begin, end, --offset, opts);
3433          return ret;
3434      }
3435
3436      private static final int getWordType(CharacterIterator JavaDoc target, int begin, int end,
3437                                           int offset, int opts) {
3438          if (offset < begin || offset >= end) return WT_OTHER;
3439          return getWordType0( target .setIndex( offset ) , opts);
3440      }
3441
3442
3443
3444      private static final boolean regionMatches(CharacterIterator JavaDoc target, int offset, int limit,
3445                                                 String JavaDoc part, int partlen) {
3446          if (offset < 0) return false;
3447          if (limit-offset < partlen)
3448              return false;
3449          int i = 0;
3450          while (partlen-- > 0) {
3451              if ( target .setIndex( offset++ ) != part.charAt(i++))
3452                  return false;
3453          }
3454          return true;
3455      }
3456
3457      private static final boolean regionMatches(CharacterIterator JavaDoc target, int offset, int limit,
3458                                                 int offset2, int partlen) {
3459          if (offset < 0) return false;
3460          if (limit-offset < partlen)
3461              return false;
3462          int i = offset2;
3463          while (partlen-- > 0) {
3464              if ( target .setIndex( offset++ ) != target .setIndex( i++ ) )
3465                  return false;
3466          }
3467          return true;
3468      }
3469
3470      /**
3471       * @see java.lang.String#regionMatches
3472       */

3473      private static final boolean regionMatchesIgnoreCase(CharacterIterator JavaDoc target, int offset, int limit,
3474                                                           String JavaDoc part, int partlen) {
3475          if (offset < 0) return false;
3476          if (limit-offset < partlen)
3477              return false;
3478          int i = 0;
3479          while (partlen-- > 0) {
3480              char ch1 = target .setIndex( offset++ ) ;
3481              char ch2 = part.charAt(i++);
3482              if (ch1 == ch2)
3483                  continue;
3484              char uch1 = Character.toUpperCase(ch1);
3485              char uch2 = Character.toUpperCase(ch2);
3486              if (uch1 == uch2)
3487                  continue;
3488              if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
3489                  return false;
3490          }
3491          return true;
3492      }
3493
3494      private static final boolean regionMatchesIgnoreCase(CharacterIterator JavaDoc target, int offset, int limit,
3495                                                           int offset2, int partlen) {
3496          if (offset < 0) return false;
3497          if (limit-offset < partlen)
3498              return false;
3499          int i = offset2;
3500          while (partlen-- > 0) {
3501              char ch1 = target .setIndex( offset++ ) ;
3502              char ch2 = target .setIndex( i++ ) ;
3503              if (ch1 == ch2)
3504                  continue;
3505              char uch1 = Character.toUpperCase(ch1);
3506              char uch2 = Character.toUpperCase(ch2);
3507              if (uch1 == uch2)
3508                  continue;
3509              if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
3510                  return false;
3511          }
3512          return true;
3513      }
3514
3515
3516
3517
3518      // ================================================================
3519

3520      /**
3521       * A regular expression.
3522       * @serial
3523       */

3524      String JavaDoc regex;
3525      /**
3526       * @serial
3527       */

3528      int options;
3529
3530      /**
3531       * The number of parenthesis in the regular expression.
3532       * @serial
3533       */

3534      int nofparen;
3535      /**
3536       * Internal representation of the regular expression.
3537       * @serial
3538       */

3539      Token tokentree;
3540
3541      boolean hasBackReferences = false;
3542
3543      transient int minlength;
3544      transient Op operations = null;
3545      transient int numberOfClosures;
3546      transient Context context = null;
3547      transient RangeToken firstChar = null;
3548
3549      transient String JavaDoc fixedString = null;
3550      transient int fixedStringOptions;
3551      transient BMPattern fixedStringTable = null;
3552      transient boolean fixedStringOnly = false;
3553
3554
3555      static final class Context {
3556          CharacterIterator JavaDoc ciTarget;
3557          String JavaDoc strTarget;
3558          char[] charTarget;
3559          int start;
3560          int limit;
3561          int length;
3562          Match match;
3563          boolean inuse = false;
3564          int[] offsets;
3565
3566          Context() {
3567          }
3568
3569          private void resetCommon(int nofclosures) {
3570              this.length = this.limit-this.start;
3571              this.inuse = true;
3572              this.match = null;
3573              if (this.offsets == null || this.offsets.length != nofclosures)
3574                  this.offsets = new int[nofclosures];
3575              for (int i = 0; i < nofclosures; i ++) this.offsets[i] = -1;
3576          }
3577          void reset(CharacterIterator JavaDoc target, int start, int limit, int nofclosures) {
3578              this.ciTarget = target;
3579              this.start = start;
3580              this.limit = limit;
3581              this.resetCommon(nofclosures);
3582          }
3583          void reset(String JavaDoc target, int start, int limit, int nofclosures) {
3584              this.strTarget = target;
3585              this.start = start;
3586              this.limit = limit;
3587              this.resetCommon(nofclosures);
3588          }
3589          void reset(char[] target, int start, int limit, int nofclosures) {
3590              this.charTarget = target;
3591              this.start = start;
3592              this.limit = limit;
3593              this.resetCommon(nofclosures);
3594          }
3595      }
3596
3597      /**
3598       * Prepares for matching. This method is called just before starting matching.
3599       */

3600      void prepare() {
3601          if (Op.COUNT) Op.nofinstances = 0;
3602          this.compile(this.tokentree);
3603          /*
3604          if (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { // .*
3605              Op anchor = Op.createAnchor(isSet(this.options, SINGLE_LINE) ? 'A' : '@');
3606              anchor.next = this.operations;
3607              this.operations = anchor;
3608          }
3609          */

3610          if (Op.COUNT) System.err.println("DEBUG: The number of operations: "+Op.nofinstances);
3611
3612          this.minlength = this.tokentree.getMinLength();
3613
3614          this.firstChar = null;
3615          if (!isSet(this.options, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION)
3616              && !isSet(this.options, XMLSCHEMA_MODE)) {
3617              RangeToken firstChar = Token.createRange();
3618              int fresult = this.tokentree.analyzeFirstCharacter(firstChar, this.options);
3619              if (fresult == Token.FC_TERMINAL) {
3620                  firstChar.compactRanges();
3621                  this.firstChar = firstChar;
3622                  if (DEBUG)
3623                      System.err.println("DEBUG: Use the first character optimization: "+firstChar);
3624              }
3625          }
3626
3627          if (this.operations != null
3628              && (this.operations.type == Op.STRING || this.operations.type == Op.CHAR)
3629              && this.operations.next == null) {
3630              if (DEBUG)
3631                  System.err.print(" *** Only fixed string! *** ");
3632              this.fixedStringOnly = true;
3633              if (this.operations.type == Op.STRING)
3634                  this.fixedString = this.operations.getString();
3635              else if (this.operations.getData() >= 0x10000) { // Op.CHAR
3636
this.fixedString = REUtil.decomposeToSurrogates(this.operations.getData());
3637              } else {
3638                  char[] ac = new char[1];
3639                  ac[0] = (char)this.operations.getData();
3640                  this.fixedString = new String JavaDoc(ac);
3641              }
3642              this.fixedStringOptions = this.options;
3643              this.fixedStringTable = new BMPattern(this.fixedString, 256,
3644                                                    isSet(this.fixedStringOptions, IGNORE_CASE));
3645          } else if (!isSet(this.options, PROHIBIT_FIXED_STRING_OPTIMIZATION)
3646                     && !isSet(this.options, XMLSCHEMA_MODE)) {
3647              Token.FixedStringContainer container = new Token.FixedStringContainer();
3648              this.tokentree.findFixedString(container, this.options);
3649              this.fixedString = container.token == null ? null : container.token.getString();
3650              this.fixedStringOptions = container.options;
3651              if (this.fixedString != null && this.fixedString.length() < 2)
3652                  this.fixedString = null;
3653              // This pattern has a fixed string of which length is more than one.
3654
if (this.fixedString != null) {
3655                  this.fixedStringTable = new BMPattern(this.fixedString, 256,
3656                                                        isSet(this.fixedStringOptions, IGNORE_CASE));
3657                  if (DEBUG) {
3658                      System.err.println("DEBUG: The longest fixed string: "+this.fixedString.length()
3659                                         +"/" //+this.fixedString
3660
+"/"+REUtil.createOptionString(this.fixedStringOptions));
3661                      System.err.print("String: ");
3662                      REUtil.dumpString(this.fixedString);
3663                  }
3664              }
3665          }
3666      }
3667
3668      /**
3669       * An option.
3670       * If you specify this option, <span class="REGEX"><kbd>(</kbd><var>X</var><kbd>)</kbd></span>
3671       * captures matched text, and <span class="REGEX"><kbd>(:?</kbd><var>X</var><kbd>)</kbd></span>
3672       * does not capture.
3673       *
3674       * @see #RegularExpression(java.lang.String,int)
3675       * @see #setPattern(java.lang.String,int)
3676      static final int MARK_PARENS = 1<<0;
3677       */

3678
3679      /**
3680       * "i"
3681       */

3682      static final int IGNORE_CASE = 1<<1;
3683
3684      /**
3685       * "s"
3686       */

3687      static final int SINGLE_LINE = 1<<2;
3688
3689      /**
3690       * "m"
3691       */

3692      static final int MULTIPLE_LINES = 1<<3;
3693
3694      /**
3695       * "x"
3696       */

3697      static final int EXTENDED_COMMENT = 1<<4;
3698
3699      /**
3700       * This option redefines <span class="REGEX"><kbd>\d \D \w \W \s \S</kbd></span>.
3701       *
3702       * @see #RegularExpression(java.lang.String,int)
3703       * @see #setPattern(java.lang.String,int)
3704       * @see #UNICODE_WORD_BOUNDARY
3705       */

3706      static final int USE_UNICODE_CATEGORY = 1<<5; // "u"
3707

3708      /**
3709       * An option.
3710       * This enables to process locale-independent word boundary for <span class="REGEX"><kbd>\b \B \&lt; \></kbd></span>.
3711       * <p>By default, the engine considers a position between a word character
3712       * (<span class="REGEX"><Kbd>\w</kbd></span>) and a non word character
3713       * is a word boundary.
3714       * <p>By this option, the engine checks word boundaries with the method of
3715       * 'Unicode Regular Expression Guidelines' Revision 4.
3716       *
3717       * @see #RegularExpression(java.lang.String,int)
3718       * @see #setPattern(java.lang.String,int)
3719       */

3720      static final int UNICODE_WORD_BOUNDARY = 1<<6; // "w"
3721

3722      /**
3723       * "H"
3724       */

3725      static final int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 1<<7;
3726      /**
3727       * "F"
3728       */

3729      static final int PROHIBIT_FIXED_STRING_OPTIMIZATION = 1<<8;
3730      /**
3731       * "X". XML Schema mode.
3732       */

3733      static final int XMLSCHEMA_MODE = 1<<9;
3734      /**
3735       * ",".
3736       */

3737      static final int SPECIAL_COMMA = 1<<10;
3738
3739
3740      private static final boolean isSet(int options, int flag) {
3741          return (options & flag) == flag;
3742      }
3743
3744      /**
3745       * Creates a new RegularExpression instance.
3746       *
3747       * @param regex A regular expression
3748       * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
3749       */

3750      public RegularExpression(String JavaDoc regex) throws ParseException {
3751          this.setPattern(regex, null);
3752      }
3753
3754      /**
3755       * Creates a new RegularExpression instance with options.
3756       *
3757       * @param regex A regular expression
3758       * @param options A String consisted of "i" "m" "s" "u" "w" "," "X"
3759       * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
3760       */

3761      public RegularExpression(String JavaDoc regex, String JavaDoc options) throws ParseException {
3762          this.setPattern(regex, options);
3763      }
3764
3765      RegularExpression(String JavaDoc regex, Token tok, int parens, boolean hasBackReferences, int options) {
3766          this.regex = regex;
3767          this.tokentree = tok;
3768          this.nofparen = parens;
3769          this.options = options;
3770          this.hasBackReferences = hasBackReferences;
3771      }
3772
3773      /**
3774       *
3775       */

3776      public void setPattern(String JavaDoc newPattern) throws ParseException {
3777          this.setPattern(newPattern, this.options);
3778      }
3779
3780      private void setPattern(String JavaDoc newPattern, int options) throws ParseException {
3781          this.regex = newPattern;
3782          this.options = options;
3783          RegexParser rp = RegularExpression.isSet(this.options, RegularExpression.XMLSCHEMA_MODE)
3784                           ? new ParserForXMLSchema() : new RegexParser();
3785          this.tokentree = rp.parse(this.regex, this.options);
3786          this.nofparen = rp.parennumber;
3787          this.hasBackReferences = rp.hasBackReferences;
3788
3789          this.operations = null;
3790          this.context = null;
3791      }
3792      /**
3793       *
3794       */

3795      public void setPattern(String JavaDoc newPattern, String JavaDoc options) throws ParseException {
3796          this.setPattern(newPattern, REUtil.parseOptions(options));
3797      }
3798
3799      /**
3800       *
3801       */

3802      public String JavaDoc getPattern() {
3803          return this.regex;
3804      }
3805
3806      /**
3807       * Represents this instence in String.
3808       */

3809      public String JavaDoc toString() {
3810          return this.tokentree.toString(this.options);
3811      }
3812
3813      /**
3814       * Returns a option string.
3815       * The order of letters in it may be different from a string specified
3816       * in a constructor or <code>setPattern()</code>.
3817       *
3818       * @see #RegularExpression(String, String)
3819       * @see #setPattern(java.lang.String,java.lang.String)
3820       */

3821      public String JavaDoc getOptions() {
3822          return REUtil.createOptionString(this.options);
3823      }
3824
3825      /**
3826       * Return true if patterns are the same and the options are equivalent.
3827       */

3828      public boolean equals(Object JavaDoc obj) {
3829          if (obj == null) return false;
3830          if (!(obj instanceof RegularExpression))
3831              return false;
3832          RegularExpression r = (RegularExpression)obj;
3833          return this.regex.equals(r.regex) && this.options == r.options;
3834      }
3835
3836      boolean equals(String JavaDoc pattern, int options) {
3837          return this.regex.equals(pattern) && this.options == options;
3838      }
3839
3840      /**
3841       *
3842       */

3843      public int hashCode() {
3844          return (this.regex+"/"+this.getOptions()).hashCode();
3845      }
3846
3847      /**
3848       * Return the number of regular expression groups.
3849       * This method returns 1 when the regular expression has no capturing-parenthesis.
3850       *
3851       */

3852      public int getNumberOfGroups() {
3853          return this.nofparen;
3854      }
3855
3856      // ================================================================
3857

3858      private static final int WT_IGNORE = 0;
3859      private static final int WT_LETTER = 1;
3860      private static final int WT_OTHER = 2;
3861      private static final int getWordType0(char ch, int opts) {
3862          if (!isSet(opts, UNICODE_WORD_BOUNDARY)) {
3863              if (isSet(opts, USE_UNICODE_CATEGORY)) {
3864                  return (Token.getRange("IsWord", true).match(ch)) ? WT_LETTER : WT_OTHER;
3865              }
3866              return isWordChar(ch) ? WT_LETTER : WT_OTHER;
3867          }
3868
3869          switch (Character.getType(ch)) {
3870          case Character.UPPERCASE_LETTER: // L
3871
case Character.LOWERCASE_LETTER: // L
3872
case Character.TITLECASE_LETTER: // L
3873
case Character.MODIFIER_LETTER: // L
3874
case Character.OTHER_LETTER: // L
3875
case Character.LETTER_NUMBER: // N
3876
case Character.DECIMAL_DIGIT_NUMBER: // N
3877
case Character.OTHER_NUMBER: // N
3878
case Character.COMBINING_SPACING_MARK: // Mc
3879
return WT_LETTER;
3880
3881          case Character.FORMAT: // Cf
3882
case Character.NON_SPACING_MARK: // Mn
3883
case Character.ENCLOSING_MARK: // Mc
3884
return WT_IGNORE;
3885
3886          case Character.CONTROL: // Cc
3887
switch (ch) {
3888              case '\t':
3889              case '\n':
3890              case '\u000B':
3891              case '\f':
3892              case '\r':
3893                  return WT_OTHER;
3894              default:
3895                  return WT_IGNORE;
3896              }
3897
3898          default:
3899              return WT_OTHER;
3900          }
3901      }
3902
3903      // ================================================================
3904

3905      static final int LINE_FEED = 0x000A;
3906      static final int CARRIAGE_RETURN = 0x000D;
3907      static final int LINE_SEPARATOR = 0x2028;
3908      static final int PARAGRAPH_SEPARATOR = 0x2029;
3909
3910      private static final boolean isEOLChar(int ch) {
3911          return ch == LINE_FEED || ch == CARRIAGE_RETURN || ch == LINE_SEPARATOR
3912          || ch == PARAGRAPH_SEPARATOR;
3913      }
3914
3915      private static final boolean isWordChar(int ch) { // Legacy word characters
3916
if (ch == '_') return true;
3917          if (ch < '0') return false;
3918          if (ch > 'z') return false;
3919          if (ch <= '9') return true;
3920          if (ch < 'A') return false;
3921          if (ch <= 'Z') return true;
3922          if (ch < 'a') return false;
3923          return true;
3924      }
3925
3926      private static final boolean matchIgnoreCase(int chardata, int ch) {
3927          if (chardata == ch) return true;
3928          if (chardata > 0xffff || ch > 0xffff) return false;
3929          char uch1 = Character.toUpperCase((char)chardata);
3930          char uch2 = Character.toUpperCase((char)ch);
3931          if (uch1 == uch2) return true;
3932          return Character.toLowerCase(uch1) == Character.toLowerCase(uch2);
3933      }
3934  }
3935  
3936  public static class ParseException extends RuntimeException JavaDoc {
3937    int location;
3938
3939    /*
3940    public ParseException(String mes) {
3941        this(mes, -1);
3942    }
3943    */

3944    /**
3945     *
3946     */

3947    public ParseException(String JavaDoc mes, int location) {
3948        super(mes);
3949        this.location = location;
3950    }
3951
3952    /**
3953     *
3954     * @return -1 if location information is not available.
3955     */

3956    public int getLocation() {
3957        return this.location;
3958    }
3959}
3960
3961  static class Op {
3962    static final int DOT = 0;
3963    static final int CHAR = 1; // Single character
3964
static final int RANGE = 3; // [a-zA-Z]
3965
static final int NRANGE = 4; // [^a-zA-Z]
3966
static final int ANCHOR = 5; // ^ $ ...
3967
static final int STRING = 6; // literal String
3968
static final int CLOSURE = 7; // X*
3969
static final int NONGREEDYCLOSURE = 8; // X*?
3970
static final int QUESTION = 9; // X?
3971
static final int NONGREEDYQUESTION = 10; // X??
3972
static final int UNION = 11; // X|Y
3973
static final int CAPTURE = 15; // ( and )
3974
static final int BACKREFERENCE = 16; // \1 \2 ...
3975
static final int LOOKAHEAD = 20; // (?=...)
3976
static final int NEGATIVELOOKAHEAD = 21; // (?!...)
3977
static final int LOOKBEHIND = 22; // (?<=...)
3978
static final int NEGATIVELOOKBEHIND = 23; // (?<!...)
3979
static final int INDEPENDENT = 24; // (?>...)
3980
static final int MODIFIER = 25; // (?ims-ims:...)
3981
static final int CONDITION = 26; // (?(..)yes|no)
3982

3983    static int nofinstances = 0;
3984    static final boolean COUNT = false;
3985
3986    static Op createDot() {
3987        if (Op.COUNT) Op.nofinstances ++;
3988        return new Op(Op.DOT);
3989    }
3990    static CharOp createChar(int data) {
3991        if (Op.COUNT) Op.nofinstances ++;
3992        return new CharOp(Op.CHAR, data);
3993    }
3994    static CharOp createAnchor(int data) {
3995        if (Op.COUNT) Op.nofinstances ++;
3996        return new CharOp(Op.ANCHOR, data);
3997    }
3998    static CharOp createCapture(int number, Op next) {
3999        if (Op.COUNT) Op.nofinstances ++;
4000        CharOp op = new CharOp(Op.CAPTURE, number);
4001        op.next = next;
4002        return op;
4003    }
4004    static UnionOp createUnion(int size) {
4005        if (Op.COUNT) Op.nofinstances ++;
4006        //System.err.println("Creates UnionOp");
4007
return new UnionOp(Op.UNION, size);
4008    }
4009    static ChildOp createClosure(int id) {
4010        if (Op.COUNT) Op.nofinstances ++;
4011        return new ModifierOp(Op.CLOSURE, id, -1);
4012    }
4013    static ChildOp createNonGreedyClosure() {
4014        if (Op.COUNT) Op.nofinstances ++;
4015        return new ChildOp(Op.NONGREEDYCLOSURE);
4016    }
4017    static ChildOp createQuestion(boolean nongreedy) {
4018        if (Op.COUNT) Op.nofinstances ++;
4019        return new ChildOp(nongreedy ? Op.NONGREEDYQUESTION : Op.QUESTION);
4020    }
4021    static RangeOp createRange(Token tok) {
4022        if (Op.COUNT) Op.nofinstances ++;
4023        return new RangeOp(Op.RANGE, tok);
4024    }
4025    static ChildOp createLook(int type, Op next, Op branch) {
4026        if (Op.COUNT) Op.nofinstances ++;
4027        ChildOp op = new ChildOp(type);
4028        op.setChild(branch);
4029        op.next = next;
4030        return op;
4031    }
4032    static CharOp createBackReference(int refno) {
4033        if (Op.COUNT) Op.nofinstances ++;
4034        return new CharOp(Op.BACKREFERENCE, refno);
4035    }
4036    static StringOp createString(String JavaDoc literal) {
4037        if (Op.COUNT) Op.nofinstances ++;
4038        return new StringOp(Op.STRING, literal);
4039    }
4040    static ChildOp createIndependent(Op next, Op branch) {
4041        if (Op.COUNT) Op.nofinstances ++;
4042        ChildOp op = new ChildOp(Op.INDEPENDENT);
4043        op.setChild(branch);
4044        op.next = next;
4045        return op;
4046    }
4047    static ModifierOp createModifier(Op next, Op branch, int add, int mask) {
4048        if (Op.COUNT) Op.nofinstances ++;
4049        ModifierOp op = new ModifierOp(Op.MODIFIER, add, mask);
4050        op.setChild(branch);
4051        op.next = next;
4052        return op;
4053    }
4054    static ConditionOp createCondition(Op next, int ref, Op conditionflow, Op yesflow, Op noflow) {
4055        if (Op.COUNT) Op.nofinstances ++;
4056        ConditionOp op = new ConditionOp(Op.CONDITION, ref, conditionflow, yesflow, noflow);
4057        op.next = next;
4058        return op;
4059    }
4060
4061    int type;
4062    Op next = null;
4063
4064    protected Op(int type) {
4065        this.type = type;
4066    }
4067
4068    int size() { // for UNION
4069
return 0;
4070    }
4071    Op elementAt(int index) { // for UNIoN
4072
throw new RuntimeException JavaDoc("Internal Error: type="+this.type);
4073    }
4074    Op getChild() { // for CLOSURE, QUESTION
4075
throw new RuntimeException JavaDoc("Internal Error: type="+this.type);
4076    }
4077                                                // ModifierOp
4078
int getData() { // CharOp for CHAR, BACKREFERENCE, CAPTURE, ANCHOR,
4079
throw new RuntimeException JavaDoc("Internal Error: type="+this.type);
4080    }
4081    int getData2() { // ModifierOp
4082
throw new RuntimeException JavaDoc("Internal Error: type="+this.type);
4083    }
4084    RangeToken getToken() { // RANGE, NRANGE
4085
throw new RuntimeException JavaDoc("Internal Error: type="+this.type);
4086    }
4087    String JavaDoc getString() { // STRING
4088
throw new RuntimeException JavaDoc("Internal Error: type="+this.type);
4089    }
4090
4091    // ================================================================
4092
static class CharOp extends Op {
4093        int charData;
4094        CharOp(int type, int data) {
4095            super(type);
4096            this.charData = data;
4097        }
4098        int getData() {
4099            return this.charData;
4100        }
4101    }
4102
4103    // ================================================================
4104
static class UnionOp extends Op {
4105        Vector JavaDoc branches;
4106        UnionOp(int type, int size) {
4107            super(type);
4108            this.branches = new Vector JavaDoc(size);
4109        }
4110        void addElement(Op op) {
4111            this.branches.addElement(op);
4112        }
4113        int size() {
4114            return this.branches.size();
4115        }
4116        Op elementAt(int index) {
4117            return (Op)this.branches.elementAt(index);
4118        }
4119    }
4120
4121    // ================================================================
4122
static class ChildOp extends Op {
4123        Op child;
4124        ChildOp(int type) {
4125            super(type);
4126        }
4127        void setChild(Op child) {
4128            this.child = child;
4129        }
4130        Op getChild() {
4131            return this.child;
4132        }
4133    }
4134    // ================================================================
4135
static class ModifierOp extends ChildOp {
4136        int v1;
4137        int v2;
4138        ModifierOp(int type, int v1, int v2) {
4139            super(type);
4140            this.v1 = v1;
4141            this.v2 = v2;
4142        }
4143        int getData() {
4144            return this.v1;
4145        }
4146        int getData2() {
4147            return this.v2;
4148        }
4149    }
4150    // ================================================================
4151
static class RangeOp extends Op {
4152        Token tok;
4153        RangeOp(int type, Token tok) {
4154            super(type);
4155            this.tok = tok;
4156        }
4157        RangeToken getToken() {
4158            return (RangeToken)this.tok;
4159        }
4160    }
4161    // ================================================================
4162
static class StringOp extends Op {
4163        String JavaDoc string;
4164        StringOp(int type, String JavaDoc literal) {
4165            super(type);
4166            this.string = literal;
4167        }
4168        String JavaDoc getString() {
4169            return this.string;
4170        }
4171    }
4172    // ================================================================
4173
static class ConditionOp extends Op {
4174        int refNumber;
4175        Op condition;
4176        Op yes;
4177        Op no;
4178        ConditionOp(int type, int refno, Op conditionflow, Op yesflow, Op noflow) {
4179            super(type);
4180            this.refNumber = refno;
4181            this.condition = conditionflow;
4182            this.yes = yesflow;
4183            this.no = noflow;
4184        }
4185    }
4186}
4187
4188  final static class RangeToken extends Token implements java.io.Serializable JavaDoc {
4189
4190    int[] ranges;
4191    boolean sorted;
4192    boolean compacted;
4193    RangeToken icaseCache = null;
4194    int[] map = null;
4195    int nonMapIndex;
4196
4197    RangeToken(int type) {
4198        super(type);
4199        this.setSorted(false);
4200    }
4201
4202                                                // for RANGE or NRANGE
4203
protected void addRange(int start, int end) {
4204        this.icaseCache = null;
4205        //System.err.println("Token#addRange(): "+start+" "+end);
4206
int r1, r2;
4207        if (start <= end) {
4208            r1 = start;
4209            r2 = end;
4210        } else {
4211            r1 = end;
4212            r2 = start;
4213        }
4214
4215        int pos = 0;
4216        if (this.ranges == null) {
4217            this.ranges = new int[2];
4218            this.ranges[0] = r1;
4219            this.ranges[1] = r2;
4220            this.setSorted(true);
4221        } else {
4222            pos = this.ranges.length;
4223            if (this.ranges[pos-1]+1 == r1) {
4224                this.ranges[pos-1] = r2;
4225                return;
4226            }
4227            int[] temp = new int[pos+2];
4228            System.arraycopy(this.ranges, 0, temp, 0, pos);
4229            this.ranges = temp;
4230            if (this.ranges[pos-1] >= r1)
4231                this.setSorted(false);
4232            this.ranges[pos++] = r1;
4233            this.ranges[pos] = r2;
4234            if (!this.sorted)
4235                this.sortRanges();
4236        }
4237    }
4238
4239    private final boolean isSorted() {
4240        return this.sorted;
4241    }
4242    private final void setSorted(boolean sort) {
4243        this.sorted = sort;
4244        if (!sort) this.compacted = false;
4245    }
4246    private final boolean isCompacted() {
4247        return this.compacted;
4248    }
4249    private final void setCompacted() {
4250        this.compacted = true;
4251    }
4252
4253    protected void sortRanges() {
4254        if (this.isSorted())
4255            return;
4256        if (this.ranges == null)
4257            return;
4258        //System.err.println("Do sorting: "+this.ranges.length);
4259

4260                                                // Bubble sort
4261
// Why? -- In many cases,
4262
// this.ranges has few elements.
4263
for (int i = this.ranges.length-4; i >= 0; i -= 2) {
4264            for (int j = 0; j <= i; j += 2) {
4265                if (this.ranges[j] > this.ranges[j+2]
4266                    || this.ranges[j] == this.ranges[j+2] && this.ranges[j+1] > this.ranges[j+3]) {
4267                    int tmp;
4268                    tmp = this.ranges[j+2];
4269                    this.ranges[j+2] = this.ranges[j];
4270                    this.ranges[j] = tmp;
4271                    tmp = this.ranges[j+3];
4272                    this.ranges[j+3] = this.ranges[j+1];
4273                    this.ranges[j+1] = tmp;
4274                }
4275            }
4276        }
4277        this.setSorted(true);
4278    }
4279
4280    /**
4281     * this.ranges is sorted.
4282     */

4283    protected void compactRanges() {
4284        boolean DEBUG = false;
4285        if (this.ranges == null || this.ranges.length <= 2)
4286            return;
4287        if (this.isCompacted())
4288            return;
4289        int base = 0; // Index of writing point
4290
int target = 0; // Index of processing point
4291

4292        while (target < this.ranges.length) {
4293            if (base != target) {
4294                this.ranges[base] = this.ranges[target++];
4295                this.ranges[base+1] = this.ranges[target++];
4296            } else
4297                target += 2;
4298            int baseend = this.ranges[base+1];
4299            while (target < this.ranges.length) {
4300                if (baseend+1 < this.ranges[target])
4301                    break;
4302                if (baseend+1 == this.ranges[target]) {
4303                    if (DEBUG)
4304                        System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
4305                                           +", "+this.ranges[base+1]
4306                                           +"], ["+this.ranges[target]
4307                                           +", "+this.ranges[target+1]
4308                                           +"] -> ["+this.ranges[base]
4309                                           +", "+this.ranges[target+1]
4310                                           +"]");
4311                    this.ranges[base+1] = this.ranges[target+1];
4312                    baseend = this.ranges[base+1];
4313                    target += 2;
4314                } else if (baseend >= this.ranges[target+1]) {
4315                    if (DEBUG)
4316                        System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
4317                                           +", "+this.ranges[base+1]
4318                                           +"], ["+this.ranges[target]
4319                                           +", "+this.ranges[target+1]
4320                                           +"] -> ["+this.ranges[base]
4321                                           +", "+this.ranges[base+1]
4322                                           +"]");
4323                    target += 2;
4324                } else if (baseend < this.ranges[target+1]) {
4325                    if (DEBUG)
4326                        System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
4327                                           +", "+this.ranges[base+1]
4328                                           +"], ["+this.ranges[target]
4329                                           +", "+this.ranges[target+1]
4330                                           +"] -> ["+this.ranges[base]
4331                                           +", "+this.ranges[target+1]
4332                                           +"]");
4333                    this.ranges[base+1] = this.ranges[target+1];
4334                    baseend = this.ranges[base+1];
4335                    target += 2;
4336                } else {
4337                    throw new RuntimeException JavaDoc("Token#compactRanges(): Internel Error: ["
4338                                               +this.ranges[base]
4339                                               +","+this.ranges[base+1]
4340                                               +"] ["+this.ranges[target]
4341                                               +","+this.ranges[target+1]+"]");
4342                }
4343            } // while
4344
base += 2;
4345        }
4346
4347        if (base != this.ranges.length) {
4348            int[] result = new int[base];
4349            System.arraycopy(this.ranges, 0, result, 0, base);
4350            this.ranges = result;
4351        }
4352        this.setCompacted();
4353    }
4354
4355    protected void mergeRanges(Token token) {
4356        RangeToken tok = (RangeToken)token;
4357        this.sortRanges();
4358        tok.sortRanges();
4359        if (tok.ranges == null)
4360            return;
4361        this.icaseCache = null;
4362        this.setSorted(true);
4363        if (this.ranges == null) {
4364            this.ranges = new int[tok.ranges.length];
4365            System.arraycopy(tok.ranges, 0, this.ranges, 0, tok.ranges.length);
4366            return;
4367        }
4368        int[] result = new int[this.ranges.length+tok.ranges.length];
4369        for (int i = 0, j = 0, k = 0; i < this.ranges.length || j < tok.ranges.length;) {
4370            if (i >= this.ranges.length) {
4371                result[k++] = tok.ranges[j++];
4372                result[k++] = tok.ranges[j++];
4373            } else if (j >= tok.ranges.length) {
4374                result[k++] = this.ranges[i++];
4375                result[k++] = this.ranges[i++];
4376            } else if (tok.ranges[j] < this.ranges[i]
4377                       || tok.ranges[j] == this.ranges[i] && tok.ranges[j+1] < this.ranges[i+1]) {
4378                result[k++] = tok.ranges[j++];
4379                result[k++] = tok.ranges[j++];
4380            } else {
4381                result[k++] = this.ranges[i++];
4382                result[k++] = this.ranges[i++];
4383            }
4384        }
4385        this.ranges = result;
4386    }
4387
4388    protected void subtractRanges(Token token) {
4389        if (token.type == NRANGE) {
4390            this.intersectRanges(token);
4391            return;
4392        }
4393        RangeToken tok = (RangeToken)token;
4394        if (tok.ranges == null || this.ranges == null)
4395            return;
4396        this.icaseCache = null;
4397        this.sortRanges();
4398        this.compactRanges();
4399        tok.sortRanges();
4400        tok.compactRanges();
4401
4402        //System.err.println("Token#substractRanges(): Entry: "+this.ranges.length+", "+tok.ranges.length);
4403

4404        int[] result = new int[this.ranges.length+tok.ranges.length];
4405        int wp = 0, src = 0, sub = 0;
4406        while (src < this.ranges.length && sub < tok.ranges.length) {
4407            int srcbegin = this.ranges[src];
4408            int srcend = this.ranges[src+1];
4409            int subbegin = tok.ranges[sub];
4410            int subend = tok.ranges[sub+1];
4411            if (srcend < subbegin) { // Not overlapped
4412
// src: o-----o
4413
// sub: o-----o
4414
// res: o-----o
4415
// Reuse sub
4416
result[wp++] = this.ranges[src++];
4417                result[wp++] = this.ranges[src++];
4418            } else if (srcend >= subbegin
4419                       && srcbegin <= subend) { // Overlapped
4420
// src: o--------o
4421
// sub: o----o
4422
// sub: o----o
4423
// sub: o----o
4424
// sub: o------------o
4425
if (subbegin <= srcbegin && srcend <= subend) {
4426                                                // src: o--------o
4427
// sub: o------------o
4428
// res: empty
4429
// Reuse sub
4430
src += 2;
4431                } else if (subbegin <= srcbegin) {
4432                                                // src: o--------o
4433
// sub: o----o
4434
// res: o-----o
4435
// Reuse src(=res)
4436
this.ranges[src] = subend+1;
4437                    sub += 2;
4438                } else if (srcend <= subend) {
4439                                                // src: o--------o
4440
// sub: o----o
4441
// res: o-----o
4442
// Reuse sub
4443
result[wp++] = srcbegin;
4444                    result[wp++] = subbegin-1;
4445                    src += 2;
4446                } else {
4447                                                // src: o--------o
4448
// sub: o----o
4449
// res: o-o o-o
4450
// Reuse src(=right res)
4451
result[wp++] = srcbegin;
4452                    result[wp++] = subbegin-1;
4453                    this.ranges[src] = subend+1;
4454                    sub += 2;
4455                }
4456            } else if (subend < srcbegin) {
4457                                                // Not overlapped
4458
// src: o-----o
4459
// sub: o----o
4460
sub += 2;
4461            } else {
4462                throw new RuntimeException JavaDoc("Token#subtractRanges(): Internal Error: ["+this.ranges[src]
4463                                           +","+this.ranges[src+1]
4464                                           +"] - ["+tok.ranges[sub]
4465                                           +","+tok.ranges[sub+1]
4466                                           +"]");
4467            }
4468        }
4469        while (src < this.ranges.length) {
4470            result[wp++] = this.ranges[src++];
4471            result[wp++] = this.ranges[src++];
4472        }
4473        this.ranges = new int[wp];
4474        System.arraycopy(result, 0, this.ranges, 0, wp);
4475                                                // this.ranges is sorted and compacted.
4476
}
4477
4478    /**
4479     * @param tok Ignore whether it is NRANGE or not.
4480     */

4481    protected void intersectRanges(Token token) {
4482        RangeToken tok = (RangeToken)token;
4483        if (tok.ranges == null || this.ranges == null)
4484            return;
4485        this.icaseCache = null;
4486        this.sortRanges();
4487        this.compactRanges();
4488        tok.sortRanges();
4489        tok.compactRanges();
4490
4491        int[] result = new int[this.ranges.length+tok.ranges.length];
4492        int wp = 0, src1 = 0, src2 = 0;
4493        while (src1 < this.ranges.length && src2 < tok.ranges.length) {
4494            int src1begin = this.ranges[src1];
4495            int src1end = this.ranges[src1+1];
4496            int src2begin = tok.ranges[src2];
4497            int src2end = tok.ranges[src2+1];
4498            if (src1end < src2begin) { // Not overlapped
4499
// src1: o-----o
4500
// src2: o-----o
4501
// res: empty
4502
// Reuse src2
4503
src1 += 2;
4504            } else if (src1end >= src2begin
4505                       && src1begin <= src2end) { // Overlapped
4506
// src1: o--------o
4507
// src2: o----o
4508
// src2: o----o
4509
// src2: o----o
4510
// src2: o------------o
4511
if (src2begin <= src2begin && src1end <= src2end) {
4512                                                // src1: o--------o
4513
// src2: o------------o
4514
// res: o--------o
4515
// Reuse src2
4516
result[wp++] = src1begin;
4517                    result[wp++] = src1end;
4518                    src1 += 2;
4519                } else if (src2begin <= src1begin) {
4520                                                // src1: o--------o
4521
// src2: o----o
4522
// res: o--o
4523
// Reuse the rest of src1
4524
result[wp++] = src1begin;
4525                    result[wp++] = src2end;
4526                    this.ranges[src1] = src2end+1;
4527                    src2 += 2;
4528                } else if (src1end <= src2end) {
4529                                                // src1: o--------o
4530
// src2: o----o
4531
// res: o--o
4532
// Reuse src2
4533
result[wp++] = src2begin;
4534                    result[wp++] = src1end;
4535                    src1 += 2;
4536                } else {
4537                                                // src1: o--------o
4538
// src2: o----o
4539
// res: o----o
4540
// Reuse the rest of src1
4541
result[wp++] = src2begin;
4542                    result[wp++] = src2end;
4543                    this.ranges[src1] = src2end+1;
4544                }
4545            } else if (src2end < src1begin) {
4546                                                // Not overlapped
4547
// src1: o-----o
4548
// src2: o----o
4549
src2 += 2;
4550            } else {
4551                throw new RuntimeException JavaDoc("Token#intersectRanges(): Internal Error: ["
4552                                           +this.ranges[src1]
4553                                           +","+this.ranges[src1+1]
4554                                           +"] & ["+tok.ranges[src2]
4555                                           +","+tok.ranges[src2+1]
4556                                           +"]");
4557            }
4558        }
4559        while (src1 < this.ranges.length) {
4560            result[wp++] = this.ranges[src1++];
4561            result[wp++] = this.ranges[src1++];
4562        }
4563        this.ranges = new int[wp];
4564        System.arraycopy(result, 0, this.ranges, 0, wp);
4565                                                // this.ranges is sorted and compacted.
4566
}
4567
4568    /**
4569     * for RANGE: Creates complement.
4570     * for NRANGE: Creates the same meaning RANGE.
4571     */

4572    static Token complementRanges(Token token) {
4573        if (token.type != RANGE && token.type != NRANGE)
4574            throw new IllegalArgumentException JavaDoc("Token#complementRanges(): must be RANGE: "+token.type);
4575        RangeToken tok = (RangeToken)token;
4576        tok.sortRanges();
4577        tok.compactRanges();
4578        int len = tok.ranges.length+2;
4579        if (tok.ranges[0] == 0)
4580            len -= 2;
4581        int last = tok.ranges[tok.ranges.length-1];
4582        if (last == UTF16_MAX)
4583            len -= 2;
4584        RangeToken ret = Token.createRange();
4585        ret.ranges = new int[len];
4586        int wp = 0;
4587        if (tok.ranges[0] > 0) {
4588            ret.ranges[wp++] = 0;
4589            ret.ranges[wp++] = tok.ranges[0]-1;
4590        }
4591        for (int i = 1; i < tok.ranges.length-2; i += 2) {
4592            ret.ranges[wp++] = tok.ranges[i]+1;
4593            ret.ranges[wp++] = tok.ranges[i+1]-1;
4594        }
4595        if (last != UTF16_MAX) {
4596            ret.ranges[wp++] = last+1;
4597            ret.ranges[wp] = UTF16_MAX;
4598        }
4599        ret.setCompacted();
4600        return ret;
4601    }
4602
4603    synchronized RangeToken getCaseInsensitiveToken() {
4604        if (this.icaseCache != null)
4605            return this.icaseCache;
4606            
4607        RangeToken uppers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange();
4608        for (int i = 0; i < this.ranges.length; i += 2) {
4609            for (int ch = this.ranges[i]; ch <= this.ranges[i+1]; ch ++) {
4610                if (ch > 0xffff)
4611                    uppers.addRange(ch, ch);
4612                else {
4613                    char uch = Character.toUpperCase((char)ch);
4614                    uppers.addRange(uch, uch);
4615                }
4616            }
4617        }
4618        RangeToken lowers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange();
4619        for (int i = 0; i < uppers.ranges.length; i += 2) {
4620            for (int ch = uppers.ranges[i]; ch <= uppers.ranges[i+1]; ch ++) {
4621                if (ch > 0xffff)
4622                    lowers.addRange(ch, ch);
4623                else {
4624                    char uch = Character.toUpperCase((char)ch);
4625                    lowers.addRange(uch, uch);
4626                }
4627            }
4628        }
4629        lowers.mergeRanges(uppers);
4630        lowers.mergeRanges(this);
4631        lowers.compactRanges();
4632
4633        this.icaseCache = lowers;
4634        return lowers;
4635    }
4636
4637    void dumpRanges() {
4638        System.err.print("RANGE: ");
4639        if (this.ranges == null)
4640            System.err.println(" NULL");
4641        for (int i = 0; i < this.ranges.length; i += 2) {
4642            System.err.print("["+this.ranges[i]+","+this.ranges[i+1]+"] ");
4643        }
4644        System.err.println("");
4645    }
4646
4647    boolean match(int ch) {
4648        if (this.map == null) this.createMap();
4649        boolean ret;
4650        if (this.type == RANGE) {
4651            if (ch < MAPSIZE)
4652                return (this.map[ch/32] & (1<<(ch&0x1f))) != 0;
4653            ret = false;
4654            for (int i = this.nonMapIndex; i < this.ranges.length; i += 2) {
4655                if (this.ranges[i] <= ch && ch <= this.ranges[i+1])
4656                    return true;
4657            }
4658        } else {
4659            if (ch < MAPSIZE)
4660                return (this.map[ch/32] & (1<<(ch&0x1f))) == 0;
4661            ret = true;
4662            for (int i = this.nonMapIndex; i < this.ranges.length; i += 2) {
4663                if (this.ranges[i] <= ch && ch <= this.ranges[i+1])
4664                    return false;
4665            }
4666        }
4667        return ret;
4668    }
4669
4670    private static final int MAPSIZE = 256;
4671    private void createMap() {
4672        int asize = MAPSIZE/32; // 32 is the number of bits in `int'.
4673
this.map = new int[asize];
4674        this.nonMapIndex = this.ranges.length;
4675        for (int i = 0; i < asize; i ++) this.map[i] = 0;
4676        for (int i = 0; i < this.ranges.length; i += 2) {
4677            int s = this.ranges[i];
4678            int e = this.ranges[i+1];
4679            if (s < MAPSIZE) {
4680                for (int j = s; j <= e && j < MAPSIZE; j ++)
4681                    this.map[j/32] |= 1<<(j&0x1f); // s&0x1f : 0-31
4682
} else {
4683                this.nonMapIndex = i;
4684                break;
4685            }
4686            if (e >= MAPSIZE) {
4687                this.nonMapIndex = i;
4688                break;
4689            }
4690        }
4691        //for (int i = 0; i < asize; i ++) System.err.println("Map: "+Integer.toString(this.map[i], 16));
4692
}
4693
4694    public String JavaDoc toString(int options) {
4695        String JavaDoc ret;
4696        if (this.type == RANGE) {
4697            if (this == Token.token_dot)
4698                ret = ".";
4699            else if (this == Token.token_0to9)
4700                ret = "\\d";
4701            else if (this == Token.token_wordchars)
4702                ret = "\\w";
4703            else if (this == Token.token_spaces)
4704                ret = "\\s";
4705            else {
4706                StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
4707                sb.append("[");
4708                for (int i = 0; i < this.ranges.length; i += 2) {
4709                    if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0) sb.append(",");
4710                    if (this.ranges[i] == this.ranges[i+1]) {
4711                        sb.append(escapeCharInCharClass(this.ranges[i]));
4712                    } else {
4713                        sb.append(escapeCharInCharClass(this.ranges[i]));
4714                        sb.append('-');
4715                        sb.append(escapeCharInCharClass(this.ranges[i+1]));
4716                    }
4717                }
4718                sb.append("]");
4719                ret = sb.toString();
4720            }
4721        } else {
4722            if (this == Token.token_not_0to9)
4723                ret = "\\D";
4724            else if (this == Token.token_not_wordchars)
4725                ret = "\\W";
4726            else if (this == Token.token_not_spaces)
4727                ret = "\\S";
4728            else {
4729                StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
4730                sb.append("[^");
4731                for (int i = 0; i < this.ranges.length; i += 2) {
4732                    if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0) sb.append(",");
4733                    if (this.ranges[i] == this.ranges[i+1]) {
4734                        sb.append(escapeCharInCharClass(this.ranges[i]));
4735                    } else {
4736                        sb.append(escapeCharInCharClass(this.ranges[i]));
4737                        sb.append('-');
4738                        sb.append(escapeCharInCharClass(this.ranges[i+1]));
4739                    }
4740                }
4741                sb.append("]");
4742                ret = sb.toString();
4743            }
4744        }
4745        return ret;
4746    }
4747
4748    private static String JavaDoc escapeCharInCharClass(int ch) {
4749        String JavaDoc ret;
4750        switch (ch) {
4751          case '[': case ']': case '-': case '^':
4752          case ',': case '\\':
4753            ret = "\\"+(char)ch;
4754            break;
4755          case '\f': ret = "\\f"; break;
4756          case '\n': ret = "\\n"; break;
4757          case '\r': ret = "\\r"; break;
4758          case '\t': ret = "\\t"; break;
4759          case 0x1b: ret = "\\e"; break;
4760          //case 0x0b: ret = "\\v"; break;
4761
default:
4762            if (ch < 0x20) {
4763                String JavaDoc pre = "0"+Integer.toHexString(ch);
4764                ret = "\\x"+pre.substring(pre.length()-2, pre.length());
4765            } else if (ch >= 0x10000) {
4766                String JavaDoc pre = "0"+Integer.toHexString(ch);
4767                ret = "\\v"+pre.substring(pre.length()-6, pre.length());
4768            } else
4769                ret = ""+(char)ch;
4770        }
4771        return ret;
4772    }
4773
4774}
4775
4776  static class RegexParser {
4777      static final int T_CHAR = 0;
4778      static final int T_EOF = 1;
4779      static final int T_OR = 2; // '|'
4780
static final int T_STAR = 3; // '*'
4781
static final int T_PLUS = 4; // '+'
4782
static final int T_QUESTION = 5; // '?'
4783
static final int T_LPAREN = 6; // '('
4784
static final int T_RPAREN = 7; // ')'
4785
static final int T_DOT = 8; // '.'
4786
static final int T_LBRACKET = 9; // '['
4787
static final int T_BACKSOLIDUS = 10; // '\'
4788
static final int T_CARET = 11; // '^'
4789
static final int T_DOLLAR = 12; // '$'
4790
static final int T_LPAREN2 = 13; // '(?:'
4791
static final int T_LOOKAHEAD = 14; // '(?='
4792
static final int T_NEGATIVELOOKAHEAD = 15; // '(?!'
4793
static final int T_LOOKBEHIND = 16; // '(?<='
4794
static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!'
4795
static final int T_INDEPENDENT = 18; // '(?>'
4796
static final int T_SET_OPERATIONS = 19; // '(?['
4797
static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class
4798
static final int T_COMMENT = 21; // '(?#'
4799
static final int T_MODIFIERS = 22; // '(?' [\-,a-z,A-Z]
4800
static final int T_CONDITION = 23; // '(?('
4801
static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class
4802

4803      static class ReferencePosition {
4804          int refNumber;
4805          int position;
4806          ReferencePosition(int n, int pos) {
4807              this.refNumber = n;
4808              this.position = pos;
4809          }
4810      }
4811
4812      int offset;
4813      String JavaDoc regex;
4814      int regexlen;
4815      int options;
4816      ResourceBundle JavaDoc resources;
4817      int chardata;
4818      int nexttoken;
4819      static protected final int S_NORMAL = 0;
4820      static protected final int S_INBRACKETS = 1;
4821      static protected final int S_INXBRACKETS = 2;
4822      int context = S_NORMAL;
4823      int parennumber = 1;
4824      boolean hasBackReferences;
4825      Vector JavaDoc references = null;
4826
4827      public RegexParser() {
4828          //this.setLocale(Locale.getDefault());
4829
}
4830      public RegexParser(Locale JavaDoc locale) {
4831          //this.setLocale(locale);
4832
}
4833
4834      public void setLocale(Locale JavaDoc locale) {
4835      }
4836
4837      final ParseException ex(String JavaDoc key, int loc) {
4838          return new ParseException(EcorePlugin.INSTANCE.getString(key), loc);
4839      }
4840
4841      private final boolean isSet(int flag) {
4842          return (this.options & flag) == flag;
4843      }
4844
4845      synchronized Token parse(String JavaDoc regex, int options) throws ParseException {
4846          this.options = options;
4847          this.offset = 0;
4848          this.setContext(S_NORMAL);
4849          this.parennumber = 1;
4850          this.hasBackReferences = false;
4851          this.regex = regex;
4852          if (this.isSet(RegularExpression.EXTENDED_COMMENT))
4853              this.regex = REUtil.stripExtendedComment(this.regex);
4854          this.regexlen = this.regex.length();
4855
4856
4857          this.next();
4858          Token ret = this.parseRegex();
4859          if (this.offset != this.regexlen)
4860              throw ex("parser.parse.1", this.offset);
4861          if (this.references != null) {
4862              for (int i = 0; i < this.references.size(); i ++) {
4863                  ReferencePosition position = (ReferencePosition)this.references.elementAt(i);
4864                  if (this.parennumber <= position.refNumber)
4865                      throw ex("parser.parse.2", position.position);
4866              }
4867              this.references.removeAllElements();
4868          }
4869          return ret;
4870      }
4871
4872      /*
4873      public RegularExpression createRegex(String regex, int options) throws ParseException {
4874          Token tok = this.parse(regex, options);
4875          return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options);
4876      }
4877      */

4878
4879      protected final void setContext(int con) {
4880          this.context = con;
4881      }
4882
4883      final int read() {
4884          return this.nexttoken;
4885      }
4886
4887      final void next() {
4888          if (this.offset >= this.regexlen) {
4889              this.chardata = -1;
4890              this.nexttoken = T_EOF;
4891              return;
4892          }
4893
4894          int ret;
4895          int ch = this.regex.charAt(this.offset++);
4896          this.chardata = ch;
4897
4898          if (this.context == S_INBRACKETS) {
4899              // In a character class, this.chardata has one character, that is to say,
4900
// a pair of surrogates is composed and stored to this.chardata.
4901
switch (ch) {
4902                case '\\':
4903                  ret = T_BACKSOLIDUS;
4904                  if (this.offset >= this.regexlen)
4905                      throw ex("parser.next.1", this.offset-1);
4906                  this.chardata = this.regex.charAt(this.offset++);
4907                  break;
4908
4909                case '-':
4910                  if (this.isSet(RegularExpression.XMLSCHEMA_MODE)
4911                      && this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') {
4912                      this.offset++;
4913                      ret = T_XMLSCHEMA_CC_SUBTRACTION;
4914                  } else
4915                      ret = T_CHAR;
4916                  break;
4917
4918                case '[':
4919                  if (!this.isSet(RegularExpression.XMLSCHEMA_MODE)
4920                      && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') {
4921                      this.offset++;
4922                      ret = T_POSIX_CHARCLASS_START;
4923                      break;
4924                  } // Through down
4925
default:
4926                  if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) {
4927                      int low = this.regex.charAt(this.offset);
4928                      if (REUtil.isLowSurrogate(low)) {
4929                          this.chardata = REUtil.composeFromSurrogates(ch, low);
4930                          this.offset ++;
4931                      }
4932                  }
4933                  ret = T_CHAR;
4934              }
4935              this.nexttoken = ret;
4936              return;
4937          }
4938
4939          switch (ch) {
4940            case '|': ret = T_OR; break;
4941            case '*': ret = T_STAR; break;
4942            case '+': ret = T_PLUS; break;
4943            case '?': ret = T_QUESTION; break;
4944            case ')': ret = T_RPAREN; break;
4945            case '.': ret = T_DOT; break;
4946            case '[': ret = T_LBRACKET; break;
4947            case '^': ret = T_CARET; break;
4948            case '$': ret = T_DOLLAR; break;
4949            case '(':
4950              ret = T_LPAREN;
4951              if (this.offset >= this.regexlen)
4952                  break;
4953              if (this.regex.charAt(this.offset) != '?')
4954                  break;
4955              if (++this.offset >= this.regexlen)
4956                  throw ex("parser.next.2", this.offset-1);
4957              ch = this.regex.charAt(this.offset++);
4958              switch (ch) {
4959                case ':': ret = T_LPAREN2; break;
4960                case '=': ret = T_LOOKAHEAD; break;
4961                case '!': ret = T_NEGATIVELOOKAHEAD; break;
4962                case '[': ret = T_SET_OPERATIONS; break;
4963                case '>': ret = T_INDEPENDENT; break;
4964                case '<':
4965                  if (this.offset >= this.regexlen)
4966                      throw ex("parser.next.2", this.offset-3);
4967                  ch = this.regex.charAt(this.offset++);
4968                  if (ch == '=') {
4969                      ret = T_LOOKBEHIND;
4970                  } else if (ch == '!') {
4971                      ret = T_NEGATIVELOOKBEHIND;
4972                  } else
4973                      throw ex("parser.next.3", this.offset-3);
4974                  break;
4975                case '#':
4976                  while (this.offset < this.regexlen) {
4977                      ch = this.regex.charAt(this.offset++);
4978                      if (ch == ')') break;
4979                  }
4980                  if (ch != ')')
4981                      throw ex("parser.next.4", this.offset-1);
4982                  ret = T_COMMENT;
4983                  break;
4984                default:
4985                  if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') {// Options
4986
this.offset --;
4987                      ret = T_MODIFIERS;
4988                      break;
4989                  } else if (ch == '(') { // conditional
4990
ret = T_CONDITION; // this.offsets points the next of '('.
4991
break;
4992                  }
4993                  throw ex("parser.next.2", this.offset-2);
4994              }
4995              break;
4996              
4997            case '\\':
4998              ret = T_BACKSOLIDUS;
4999              if (this.offset >= this.regexlen)
5000                  throw ex("parser.next.1", this.offset-1);
5001              this.chardata = this.regex.charAt(this.offset++);
5002              break;
5003
5004            default:
5005              ret = T_CHAR;
5006          }
5007          this.nexttoken = ret;
5008      }
5009
5010      /**
5011       * regex ::= term (`|` term)*
5012       * term ::= factor+
5013       * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
5014       * | atom (('*' | '+' | '?' | minmax ) '?'? )?)
5015       * | '(?=' regex ')' | '(?!' regex ')' | '(?&lt;=' regex ')' | '(?&lt;!' regex ')'
5016       * atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
5017       * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
5018       */

5019      Token parseRegex() throws ParseException {
5020          Token tok = this.parseTerm();
5021          Token parent = null;
5022          while (this.read() == T_OR) {
5023              this.next(); // '|'
5024
if (parent == null) {
5025                  parent = Token.createUnion();
5026                  parent.addChild(tok);
5027                  tok = parent;
5028              }
5029              tok.addChild(this.parseTerm());
5030          }
5031          return tok;
5032      }
5033
5034      /**
5035       * term ::= factor+
5036       */

5037      Token parseTerm() throws ParseException {
5038          int ch = this.read();
5039          if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) {
5040              return Token.createEmpty();
5041          } else {
5042              Token tok = this.parseFactor();
5043              Token concat = null;
5044              while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) {
5045                  if (concat == null) {
5046                      concat = Token.createConcat();
5047                      concat.addChild(tok);
5048                      tok = concat;
5049                  }
5050                  concat.addChild(this.parseFactor());
5051                  //tok = Token.createConcat(tok, this.parseFactor());
5052
}
5053              return tok;
5054          }
5055      }
5056
5057      // ----------------------------------------------------------------
5058

5059      Token processCaret() throws ParseException {
5060          this.next();
5061          return Token.token_linebeginning;
5062      }
5063      Token processDollar() throws ParseException {
5064          this.next();
5065          return Token.token_lineend;
5066      }
5067      Token processLookahead() throws ParseException {
5068          this.next();
5069          Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex());
5070          if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
5071          this.next(); // ')'
5072
return tok;
5073      }
5074      Token processNegativelookahead() throws ParseException {
5075          this.next();
5076          Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex());
5077          if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
5078          this.next(); // ')'
5079
return tok;
5080      }
5081      Token processLookbehind() throws ParseException {
5082          this.next();
5083          Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex());
5084          if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
5085          this.next(); // ')'
5086
return tok;
5087      }
5088      Token processNegativelookbehind() throws ParseException {
5089          this.next();
5090          Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex());
5091          if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
5092          this.next(); // ')'
5093
return tok;
5094      }
5095      Token processBacksolidus_A() throws ParseException {
5096          this.next();
5097          return Token.token_stringbeginning;
5098      }
5099      Token processBacksolidus_Z() throws ParseException {
5100          this.next();
5101          return Token.token_stringend2;
5102      }
5103      Token processBacksolidus_z() throws ParseException {
5104          this.next();
5105          return Token.token_stringend;
5106      }
5107      Token processBacksolidus_b() throws ParseException {
5108          this.next();
5109          return Token.token_wordedge;
5110      }
5111      Token processBacksolidus_B() throws ParseException {
5112          this.next();
5113          return Token.token_not_wordedge;
5114      }
5115      Token processBacksolidus_lt() throws ParseException {
5116          this.next();
5117          return Token.token_wordbeginning;
5118      }
5119      Token processBacksolidus_gt() throws ParseException {
5120          this.next();
5121          return Token.token_wordend;
5122      }
5123      Token processStar(Token tok) throws ParseException {
5124          this.next();
5125          if (this.read() == T_QUESTION) {
5126              this.next();
5127              return Token.createNGClosure(tok);
5128          } else
5129              return Token.createClosure(tok);
5130      }
5131      Token processPlus(Token tok) throws ParseException {
5132          // X+ -> XX*
5133
this.next();
5134          if (this.read() == T_QUESTION) {
5135              this.next();
5136              return Token.createConcat(tok, Token.createNGClosure(tok));
5137          } else
5138              return Token.createConcat(tok, Token.createClosure(tok));
5139      }
5140      Token processQuestion(Token tok) throws ParseException {
5141          // X? -> X|
5142
this.next();
5143          Token par = Token.createUnion();
5144          if (this.read() == T_QUESTION) {
5145              this.next();
5146              par.addChild(Token.createEmpty());
5147              par.addChild(tok);
5148          } else {
5149              par.addChild(tok);
5150              par.addChild(Token.createEmpty());
5151          }
5152          return par;
5153      }
5154      boolean checkQuestion(int off) {
5155          return off < this.regexlen && this.regex.charAt(off) == '?';
5156      }
5157      Token processParen() throws ParseException {
5158          this.next();
5159          int p = this.parennumber++;
5160          Token tok = Token.createParen(this.parseRegex(), p);
5161          if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
5162          this.next(); // Skips ')'
5163
return tok;
5164      }
5165      Token processParen2() throws ParseException {
5166          this.next();
5167          Token tok = Token.createParen(this.parseRegex(), 0);
5168          if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
5169          this.next(); // Skips ')'
5170
return tok;
5171      }
5172      Token processCondition() throws ParseException {
5173                                                  // this.offset points the next of '('
5174
if (this.offset+1 >= this.regexlen) throw ex("parser.factor.4", this.offset);
5175                                                  // Parses a condition.
5176
int refno = -1;
5177          Token condition = null;
5178          int ch = this.regex.charAt(this.offset);
5179          if ('1' <= ch && ch <= '9') {
5180              refno = ch-'0';
5181              this.hasBackReferences = true;
5182              if (this.references == null) this.references = new Vector JavaDoc();
5183              this.references.addElement(new ReferencePosition(refno, this.offset));
5184              this.offset ++;
5185              if (this.regex.charAt(this.offset) != ')') throw ex("parser.factor.1", this.offset);
5186              this.offset ++;
5187          } else {
5188              if (ch == '?') this.offset --; // Points '('.
5189
this.next();
5190              condition = this.parseFactor();
5191              switch (condition.type) {
5192                case Token.LOOKAHEAD:
5193                case Token.NEGATIVELOOKAHEAD:
5194                case Token.LOOKBEHIND:
5195                case Token.NEGATIVELOOKBEHIND:
5196                  break;
5197                case Token.ANCHOR:
5198                  if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
5199                  break;
5200                default:
5201                  throw ex("parser.factor.5", this.offset);
5202              }
5203          }
5204                                                  // Parses yes/no-patterns.
5205
this.next();
5206          Token yesPattern = this.parseRegex();
5207          Token noPattern = null;
5208          if (yesPattern.type == Token.UNION) {
5209              if (yesPattern.size() != 2) throw ex("parser.factor.6", this.offset);
5210              noPattern = yesPattern.getChild(1);
5211              yesPattern = yesPattern.getChild(0);
5212          }
5213          if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
5214          this.next();
5215          return Token.createCondition(refno, condition, yesPattern, noPattern);
5216      }
5217      Token processModifiers() throws ParseException {
5218                                                  // this.offset points the next of '?'.
5219
// modifiers ::= [imsw]* ('-' [imsw]*)? ':'
5220
int add = 0, mask = 0, ch = -1;
5221          while (this.offset < this.regexlen) {
5222              ch = this.regex.charAt(this.offset);
5223              int v = REUtil.getOptionValue(ch);
5224              if (v == 0) break; // '-' or ':'?
5225
add |= v;
5226              this.offset ++;
5227          }
5228          if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1);
5229          if (ch == '-') {
5230              this.offset ++;
5231              while (this.offset < this.regexlen) {
5232                  ch = this.regex.charAt(this.offset);
5233                  int v = REUtil.getOptionValue(ch);
5234                  if (v == 0) break; // ':'?
5235
mask |= v;
5236                  this.offset ++;
5237              }
5238              if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1);
5239          }
5240          Token tok;
5241          if (ch == ':') {
5242              this.offset ++;
5243              this.next();
5244              tok = Token.createModifierGroup(this.parseRegex(), add, mask);
5245              if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
5246              this.next();
5247          } else if (ch == ')') { // such as (?-i)
5248
this.offset ++;
5249              this.next();
5250              tok = Token.createModifierGroup(this.parseRegex(), add, mask);
5251          } else
5252              throw ex("parser.factor.3", this.offset);
5253
5254          return tok;
5255      }
5256      Token processIndependent() throws ParseException {
5257          this.next();
5258          Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex());
5259          if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
5260          this.next(); // Skips ')'
5261
return tok;
5262      }
5263      Token processBacksolidus_c() throws ParseException {
5264          int ch2; // Must be in 0x0040-0x005f
5265
if (this.offset >= this.regexlen
5266              || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040)
5267              throw ex("parser.atom.1", this.offset-1);
5268          this.next();
5269          return Token.createChar(ch2-0x40);
5270      }
5271      Token processBacksolidus_C() throws ParseException {
5272          throw ex("parser.process.1", this.offset);
5273      }
5274      Token processBacksolidus_i() throws ParseException {
5275          Token tok = Token.createChar('i');
5276          this.next();
5277          return tok;
5278      }
5279      Token processBacksolidus_I() throws ParseException {
5280          throw ex("parser.process.1", this.offset);
5281      }
5282      Token processBacksolidus_g() throws ParseException {
5283          this.next();
5284          return Token.getGraphemePattern();
5285      }
5286      Token processBacksolidus_X() throws ParseException {
5287          this.next();
5288          return Token.getCombiningCharacterSequence();
5289      }
5290      Token processBackreference() throws ParseException {
5291          int refnum = this.chardata-'0';
5292          Token tok = Token.createBackReference(refnum);
5293          this.hasBackReferences = true;
5294          if (this.references == null) this.references = new Vector JavaDoc();
5295          this.references.addElement(new ReferencePosition(refnum, this.offset-2));
5296          this.next();
5297          return tok;
5298      }
5299
5300      // ----------------------------------------------------------------
5301

5302      /**
5303       * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
5304       * | atom (('*' | '+' | '?' | minmax ) '?'? )?)
5305       * | '(?=' regex ')' | '(?!' regex ')' | '(?&lt;=' regex ')' | '(?&lt;!' regex ')'
5306       * | '(?#' [^)]* ')'
5307       * minmax ::= '{' min (',' max?)? '}'
5308       * min ::= [0-9]+
5309       * max ::= [0-9]+
5310       */

5311      Token parseFactor() throws ParseException {
5312          int ch = this.read();
5313          Token tok;
5314          switch (ch) {
5315            case T_CARET: return this.processCaret();
5316            case T_DOLLAR: return this.processDollar();
5317            case T_LOOKAHEAD: return this.processLookahead();
5318            case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead();
5319            case T_LOOKBEHIND: return this.processLookbehind();
5320            case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind();
5321
5322            case T_COMMENT:
5323              this.next();
5324              return Token.createEmpty();
5325
5326            case T_BACKSOLIDUS:
5327              switch (this.chardata) {
5328                case 'A': return this.processBacksolidus_A();
5329                case 'Z': return this.processBacksolidus_Z();
5330                case 'z': return this.processBacksolidus_z();
5331                case 'b': return this.processBacksolidus_b();
5332                case 'B': return this.processBacksolidus_B();
5333                case '<': return this.processBacksolidus_lt();
5334                case '>': return this.processBacksolidus_gt();
5335              }
5336                                                  // through down
5337
}
5338          tok = this.parseAtom();
5339          ch = this.read();
5340          switch (ch) {
5341            case T_STAR: return this.processStar(tok);
5342            case T_PLUS: return this.processPlus(tok);
5343            case T_QUESTION: return this.processQuestion(tok);
5344            case T_CHAR:
5345              if (this.chardata == '{' && this.offset < this.regexlen) {
5346
5347                  int off = this.offset; // this.offset -> next of '{'
5348
int min = 0, max = -1;
5349
5350                  if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
5351
5352                      min = ch -'0';
5353                      while (off < this.regexlen
5354                             && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
5355                          min = min*10 +ch-'0';
5356                          if (min < 0)
5357                              throw ex("parser.quantifier.5", this.offset);
5358                      }
5359                  }
5360                  else {
5361                      throw ex("parser.quantifier.1", this.offset);
5362                  }
5363
5364                  max = min;
5365                  if (ch == ',') {
5366
5367                     if (off >= this.regexlen) {
5368                         throw ex("parser.quantifier.3", this.offset);
5369                     }
5370                     else if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
5371
5372                          max = ch -'0'; // {min,max}
5373
while (off < this.regexlen
5374                                 && (ch = this.regex.charAt(off++)) >= '0'
5375                                 && ch <= '9') {
5376                              max = max*10 +ch-'0';
5377                              if (max < 0)
5378                                  throw ex("parser.quantifier.5", this.offset);
5379                          }
5380
5381                          if (min > max)
5382                              throw ex("parser.quantifier.4", this.offset);
5383                     }
5384                     else { // assume {min,}
5385
max = -1;
5386                      }
5387                  }
5388
5389                 if (ch != '}')
5390                     throw ex("parser.quantifier.2", this.offset);
5391
5392                 if (this.checkQuestion(off)) { // off -> next of '}'
5393
tok = Token.createNGClosure(tok);
5394                      this.offset = off+1;
5395                  } else {
5396                      tok = Token.createClosure(tok);
5397                      this.offset = off;
5398                  }
5399
5400                  tok.setMin(min);
5401                  tok.setMax(max);
5402                  //System.err.println("CLOSURE: "+min+", "+max);
5403
this.next();
5404              }
5405          }
5406          return tok;
5407      }
5408
5409      /**
5410       * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
5411       * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
5412       * | '(?>' regex ')'
5413       * char ::= '\\' | '\' [efnrt] | bmp-code | character-1
5414       */

5415      Token parseAtom() throws ParseException {
5416          int ch = this.read();
5417          Token tok = null;
5418          switch (ch) {
5419            case T_LPAREN: return this.processParen();
5420            case T_LPAREN2: return this.processParen2(); // '(?:'
5421
case T_CONDITION: return this.processCondition(); // '(?('
5422
case T_MODIFIERS: return this.processModifiers(); // (?modifiers ... )
5423
case T_INDEPENDENT: return this.processIndependent();
5424            case T_DOT:
5425              this.next(); // Skips '.'
5426
tok = Token.token_dot;
5427              break;
5428
5429              /**
5430               * char-class ::= '[' ( '^'? range ','?)+ ']'
5431               * range ::= '\d' | '\w' | '\s' | category-block | range-char
5432               * | range-char '-' range-char
5433               * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
5434               * bmp-char ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
5435               */

5436            case T_LBRACKET: return this.parseCharacterClass(true);
5437            case T_SET_OPERATIONS: return this.parseSetOperations();
5438
5439            case T_BACKSOLIDUS:
5440              switch (this.chardata) {
5441                case 'd': case 'D':
5442                case 'w': case 'W':
5443                case 's': case 'S':
5444                  tok = this.getTokenForShorthand(this.chardata);
5445                  this.next();
5446                  return tok;
5447
5448                case 'e': case 'f': case 'n': case 'r':
5449                case 't': case 'u': case 'v': case 'x':
5450                  {
5451                      int ch2 = this.decodeEscaped();
5452                      if (ch2 < 0x10000) {
5453                          tok = Token.createChar(ch2);
5454                      } else {
5455                          tok = Token.createString(REUtil.decomposeToSurrogates(ch2));
5456                      }
5457                  }
5458                  break;
5459
5460                case 'c': return this.processBacksolidus_c();
5461                case 'C': return this.processBacksolidus_C();
5462                case 'i': return this.processBacksolidus_i();
5463                case 'I': return this.processBacksolidus_I();
5464                case 'g': return this.processBacksolidus_g();
5465                case 'X': return this.processBacksolidus_X();
5466                case '1': case '2': case '3': case '4':
5467                case '5': case '6': case '7': case '8': case '9':
5468                  return this.processBackreference();
5469
5470                case 'P':
5471                case 'p':
5472                  int pstart = this.offset;
5473                  tok = processBacksolidus_pP(this.chardata);
5474                  if (tok == null) throw this.ex("parser.atom.5", pstart);
5475                  break;
5476
5477                default:
5478                  tok = Token.createChar(this.chardata);
5479              }
5480              this.next();
5481              break;
5482
5483            case T_CHAR:
5484              if (this.chardata == ']' || this.chardata == '{' || this.chardata == '}')
5485                  throw this.ex("parser.atom.4", this.offset-1);
5486              tok = Token.createChar(this.chardata);
5487              int high = this.chardata;
5488              this.next();
5489              if (REUtil.isHighSurrogate(high)
5490                  && this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) {
5491                  char[] sur = new char[2];
5492                  sur[0] = (char)high;
5493                  sur[1] = (char)this.chardata;
5494                  tok = Token.createParen(Token.createString(new String JavaDoc(sur)), 0);
5495                  this.next();
5496              }
5497              break;
5498
5499            default:
5500              throw this.ex("parser.atom.4", this.offset-1);
5501          }
5502          return tok;
5503      }
5504
5505      protected RangeToken processBacksolidus_pP(int c) throws ParseException {
5506
5507          this.next();
5508          if (this.read() != T_CHAR || this.chardata != '{')
5509              throw this.ex("parser.atom.2", this.offset-1);
5510
5511          // handle category escape
5512
boolean positive = c == 'p';
5513          int namestart = this.offset;
5514          int nameend = this.regex.indexOf('}', namestart);
5515
5516          if (nameend < 0)
5517              throw this.ex("parser.atom.3", this.offset);
5518
5519          String JavaDoc pname = this.regex.substring(namestart, nameend);
5520          this.offset = nameend+1;
5521
5522          return Token.getRange(pname, positive, this.isSet(RegularExpression.XMLSCHEMA_MODE));
5523      }
5524
5525      int processCIinCharacterClass(RangeToken tok, int c) {
5526          return this.decodeEscaped();
5527      }
5528
5529      /**
5530       * char-class ::= '[' ( '^'? range ','?)+ ']'
5531       * range ::= '\d' | '\w' | '\s' | category-block | range-char
5532       * | range-char '-' range-char
5533       * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
5534       * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
5535       */

5536      protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
5537          this.setContext(S_INBRACKETS);
5538          this.next(); // '['
5539
boolean nrange = false;
5540          RangeToken base = null;
5541          RangeToken tok;
5542          if (this.read() == T_CHAR && this.chardata == '^') {
5543              nrange = true;
5544              this.next(); // '^'
5545
if (useNrange) {
5546                  tok = Token.createNRange();
5547              } else {
5548                  base = Token.createRange();
5549                  base.addRange(0, Token.UTF16_MAX);
5550                  tok = Token.createRange();
5551              }
5552          } else {
5553              tok = Token.createRange();
5554          }
5555          int type;
5556          boolean firstloop = true;
5557          while ((type = this.read()) != T_EOF) {
5558              if (type == T_CHAR && this.chardata == ']' && !firstloop)
5559                  break;
5560              firstloop = false;
5561              int c = this.chardata;
5562              boolean end = false;
5563              if (type == T_BACKSOLIDUS) {
5564                  switch (c) {
5565                    case 'd': case 'D':
5566                    case 'w': case 'W':
5567                    case 's': case 'S':
5568                      tok.mergeRanges(this.getTokenForShorthand(c));
5569                      end = true;
5570                      break;
5571
5572                    case 'i': case 'I':
5573                    case 'c': case 'C':
5574                      c = this.processCIinCharacterClass(tok, c);
5575                      if (c < 0) end = true;
5576                      break;
5577                      
5578                    case 'p':
5579                    case 'P':
5580                      int pstart = this.offset;
5581                      RangeToken tok2 = this.processBacksolidus_pP(c);
5582                      if (tok2 == null) throw this.ex("parser.atom.5", pstart);
5583                      tok.mergeRanges(tok2);
5584                      end = true;
5585                      break;
5586
5587                    default:
5588                      c = this.decodeEscaped();
5589                  } // \ + c
5590
} // backsolidus
5591
// POSIX Character class such as [:alnum:]
5592
else if (type == T_POSIX_CHARCLASS_START) {
5593                  int nameend = this.regex.indexOf(':', this.offset);
5594                  if (nameend < 0) throw this.ex("parser.cc.1", this.offset);
5595                  boolean positive = true;
5596                  if (this.regex.charAt(this.offset) == '^') {
5597                      this.offset ++;
5598                      positive = false;
5599                  }
5600                  String JavaDoc name = this.regex.substring(this.offset, nameend);
5601                  RangeToken range = Token.getRange(name, positive,
5602                                                    this.isSet(RegularExpression.XMLSCHEMA_MODE));
5603                  if (range == null) throw this.ex("parser.cc.3", this.offset);
5604                  tok.mergeRanges(range);
5605                  end = true;
5606                  if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']')
5607                      throw this.ex("parser.cc.1", nameend);
5608                  this.offset = nameend+2;
5609              }
5610              this.next();
5611              if (!end) { // if not shorthands...
5612
if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'.
5613
tok.addRange(c, c);
5614                  } else {
5615                      this.next(); // Skips '-'
5616
if ((type = this.read()) == T_EOF) throw this.ex("parser.cc.2", this.offset);
5617                      if (type == T_CHAR && this.chardata == ']') {
5618                          tok.addRange(c, c);
5619                          tok.addRange('-', '-');
5620                      } else {
5621                          int rangeend = this.chardata;
5622                          if (type == T_BACKSOLIDUS)
5623                              rangeend = this.decodeEscaped();
5624                          this.next();
5625                          tok.addRange(c, rangeend);
5626                      }
5627                  }
5628              }
5629              if (this.isSet(RegularExpression.SPECIAL_COMMA)
5630                  && this.read() == T_CHAR && this.chardata == ',')
5631                  this.next();
5632          }
5633          if (this.read() == T_EOF)
5634              throw this.ex("parser.cc.2", this.offset);
5635          if (!useNrange && nrange) {
5636              base.subtractRanges(tok);
5637              tok = base;
5638          }
5639          tok.sortRanges();
5640          tok.compactRanges();
5641          //tok.dumpRanges();
5642
/*
5643          if (this.isSet(RegularExpression.IGNORE_CASE))
5644              tok = RangeToken.createCaseInsensitiveToken(tok);
5645          */

5646          this.setContext(S_NORMAL);
5647          this.next(); // Skips ']'
5648

5649          return tok;
5650      }
5651
5652      /**
5653       * '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')'
5654       */

5655      protected RangeToken parseSetOperations() throws ParseException {
5656          RangeToken tok = this.parseCharacterClass(false);
5657          int type;
5658          while ((type = this.read()) != T_RPAREN) {
5659              int ch = this.chardata;
5660              if (type == T_CHAR && (ch == '-' || ch == '&')
5661                  || type == T_PLUS) {
5662                  this.next();
5663                  if (this.read() != T_LBRACKET) throw ex("parser.ope.1", this.offset-1);
5664                  RangeToken t2 = this.parseCharacterClass(false);
5665                  if (type == T_PLUS)
5666                      tok.mergeRanges(t2);
5667                  else if (ch == '-')
5668                      tok.subtractRanges(t2);
5669                  else if (ch == '&')
5670                      tok.intersectRanges(t2);
5671                  else