KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > eclipse > emf > ecore > xml > type > internal > RegEx


1 /**
2  * <copyright>
3  *
4  * Copyright (c) 2004-2005 IBM Corporation and others.
5  * All rights reserved. This program and the accompanying materials
6  * are made available under the terms of the Eclipse Public License v1.0
7  * which accompanies this distribution, and is available at
8  * http://www.eclipse.org/legal/epl-v10.html
9  *
10  * Contributors:
11  * IBM - Initial API and implementation
12  *
13  * </copyright>
14  *
15  * $Id: RegEx.java,v 1.8 2005/06/12 13:29:22 emerks Exp $
16  *
17  * ---------------------------------------------------------------------
18  *
19  * The Apache Software License, Version 1.1
20  *
21  *
22  * Copyright (c) 1999-2004 The Apache Software Foundation. All rights
23  * reserved.
24  *
25  * Redistribution and use in source and binary forms, with or without
26  * modification, are permitted provided that the following conditions
27  * are met:
28  *
29  * 1. Redistributions of source code must retain the above copyright
30  * notice, this list of conditions and the following disclaimer.
31  *
32  * 2. Redistributions in binary form must reproduce the above copyright
33  * notice, this list of conditions and the following disclaimer in
34  * the documentation and/or other materials provided with the
35  * distribution.
36  *
37  * 3. The end-user documentation included with the redistribution,
38  * if any, must include the following acknowledgment:
39  * "This product includes software developed by the
40  * Apache Software Foundation (http://www.apache.org/)."
41  * Alternately, this acknowledgment may appear in the software itself,
42  * if and wherever such third-party acknowledgments normally appear.
43  *
44  * 4. The names "Xerces" and "Apache Software Foundation" must
45  * not be used to endorse or promote products derived from this
46  * software without prior written permission. For written
47  * permission, please contact apache@apache.org.
48  *
49  * 5. Products derived from this software may not be called "Apache",
50  * nor may "Apache" appear in their name, without prior written
51  * permission of the Apache Software Foundation.
52  *
53  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
54  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
55  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
56  * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
57  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
58  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
59  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
60  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
61  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
62  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
63  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64  * SUCH DAMAGE.
65  * ====================================================================
66  *
67  * This software consists of voluntary contributions made by many
68  * individuals on behalf of the Apache Software Foundation and was
69  * originally based on software copyright (c) 1999-2003, International
70  * Business Machines, Inc., http://www.apache.org. For more
71  * information on the Apache Software Foundation, please see
72  * <http://www.apache.org/>.
73  */

74
75 package org.eclipse.emf.ecore.xml.type.internal;
76
77
78 import java.text.CharacterIterator JavaDoc;
79 import java.util.Hashtable JavaDoc;
80 import java.util.Locale JavaDoc;
81 import java.util.ResourceBundle JavaDoc;
82 import java.util.Vector JavaDoc;
83
84 import org.eclipse.emf.ecore.plugin.EcorePlugin;
85
86 /**
87  * NOTE: this class is for internal use only.
88  */

89 public final class RegEx
90 {
91   static class BMPattern
92   {
93     char[] pattern;
94
95     int[] shiftTable;
96
97     boolean ignoreCase;
98
99     public BMPattern(String JavaDoc pat, boolean ignoreCase)
100     {
101       this(pat, 256, ignoreCase);
102     }
103
104     public BMPattern(String JavaDoc pat, int tableSize, boolean ignoreCase)
105     {
106       this.pattern = pat.toCharArray();
107       this.shiftTable = new int [tableSize];
108       this.ignoreCase = ignoreCase;
109       int length = pattern.length;
110       for (int i = 0; i < this.shiftTable.length; i++)
111         this.shiftTable[i] = length;
112       for (int i = 0; i < length; i++)
113       {
114         char ch = this.pattern[i];
115         int diff = length - i - 1;
116         int index = ch % this.shiftTable.length;
117         if (diff < this.shiftTable[index])
118           this.shiftTable[index] = diff;
119         if (this.ignoreCase)
120         {
121           ch = Character.toUpperCase(ch);
122           index = ch % this.shiftTable.length;
123           if (diff < this.shiftTable[index])
124             this.shiftTable[index] = diff;
125           ch = Character.toLowerCase(ch);
126           index = ch % this.shiftTable.length;
127           if (diff < this.shiftTable[index])
128             this.shiftTable[index] = diff;
129         }
130       }
131     }
132
133     /**
134      *
135      * @return -1 if <var>iterator</var> does not contain this pattern.
136      */

137     public int matches(CharacterIterator JavaDoc iterator, int start, int limit)
138     {
139       if (this.ignoreCase)
140         return this.matchesIgnoreCase(iterator, start, limit);
141       int plength = this.pattern.length;
142       if (plength == 0)
143         return start;
144       int index = start + plength;
145       while (index <= limit)
146       {
147         int pindex = plength;
148         int nindex = index + 1;
149         char ch;
150         do
151         {
152           if ((ch = iterator.setIndex(--index)) != this.pattern[--pindex])
153             break;
154           if (pindex == 0)
155             return index;
156         }
157         while (pindex > 0);
158         index += this.shiftTable[ch % this.shiftTable.length] + 1;
159         if (index < nindex)
160           index = nindex;
161       }
162       return -1;
163     }
164
165     /**
166      *
167      * @return -1 if <var>str</var> does not contain this pattern.
168      */

169     public int matches(String JavaDoc str, int start, int limit)
170     {
171       if (this.ignoreCase)
172         return this.matchesIgnoreCase(str, start, limit);
173       int plength = this.pattern.length;
174       if (plength == 0)
175         return start;
176       int index = start + plength;
177       while (index <= limit)
178       {
179         //System.err.println("Starts at "+index);
180
int pindex = plength;
181         int nindex = index + 1;
182         char ch;
183         do
184         {
185           if ((ch = str.charAt(--index)) != this.pattern[--pindex])
186             break;
187           if (pindex == 0)
188             return index;
189         }
190         while (pindex > 0);
191         index += this.shiftTable[ch % this.shiftTable.length] + 1;
192         if (index < nindex)
193           index = nindex;
194       }
195       return -1;
196     }
197
198     /**
199      *
200      * @return -1 if <var>chars</char> does not contain this pattern.
201      */

202     public int matches(char[] chars, int start, int limit)
203     {
204       if (this.ignoreCase)
205         return this.matchesIgnoreCase(chars, start, limit);
206       int plength = this.pattern.length;
207       if (plength == 0)
208         return start;
209       int index = start + plength;
210       while (index <= limit)
211       {
212         //System.err.println("Starts at "+index);
213
int pindex = plength;
214         int nindex = index + 1;
215         char ch;
216         do
217         {
218           if ((ch = chars[--index]) != this.pattern[--pindex])
219             break;
220           if (pindex == 0)
221             return index;
222         }
223         while (pindex > 0);
224         index += this.shiftTable[ch % this.shiftTable.length] + 1;
225         if (index < nindex)
226           index = nindex;
227       }
228       return -1;
229     }
230
231     int matchesIgnoreCase(CharacterIterator JavaDoc iterator, int start, int limit)
232     {
233       int plength = this.pattern.length;
234       if (plength == 0)
235         return start;
236       int index = start + plength;
237       while (index <= limit)
238       {
239         int pindex = plength;
240         int nindex = index + 1;
241         char ch;
242         do
243         {
244           char ch1 = ch = iterator.setIndex(--index);
245           char ch2 = this.pattern[--pindex];
246           if (ch1 != ch2)
247           {
248             ch1 = Character.toUpperCase(ch1);
249             ch2 = Character.toUpperCase(ch2);
250             if (ch1 != ch2 && Character.toLowerCase(ch1) != Character.toLowerCase(ch2))
251               break;
252           }
253           if (pindex == 0)
254             return index;
255         }
256         while (pindex > 0);
257         index += this.shiftTable[ch % this.shiftTable.length] + 1;
258         if (index < nindex)
259           index = nindex;
260       }
261       return -1;
262     }
263
264     int matchesIgnoreCase(String JavaDoc text, int start, int limit)
265     {
266       int plength = this.pattern.length;
267       if (plength == 0)
268         return start;
269       int index = start + plength;
270       while (index <= limit)
271       {
272         int pindex = plength;
273         int nindex = index + 1;
274         char ch;
275         do
276         {
277           char ch1 = ch = text.charAt(--index);
278           char ch2 = this.pattern[--pindex];
279           if (ch1 != ch2)
280           {
281             ch1 = Character.toUpperCase(ch1);
282             ch2 = Character.toUpperCase(ch2);
283             if (ch1 != ch2 && Character.toLowerCase(ch1) != Character.toLowerCase(ch2))
284               break;
285           }
286           if (pindex == 0)
287             return index;
288         }
289         while (pindex > 0);
290         index += this.shiftTable[ch % this.shiftTable.length] + 1;
291         if (index < nindex)
292           index = nindex;
293       }
294       return -1;
295     }
296
297     int matchesIgnoreCase(char[] chars, int start, int limit)
298     {
299       int plength = this.pattern.length;
300       if (plength == 0)
301         return start;
302       int index = start + plength;
303       while (index <= limit)
304       {
305         int pindex = plength;
306         int nindex = index + 1;
307         char ch;
308         do
309         {
310           char ch1 = ch = chars[--index];
311           char ch2 = this.pattern[--pindex];
312           if (ch1 != ch2)
313           {
314             ch1 = Character.toUpperCase(ch1);
315             ch2 = Character.toUpperCase(ch2);
316             if (ch1 != ch2 && Character.toLowerCase(ch1) != Character.toLowerCase(ch2))
317               break;
318           }
319           if (pindex == 0)
320             return index;
321         }
322         while (pindex > 0);
323         index += this.shiftTable[ch % this.shiftTable.length] + 1;
324         if (index < nindex)
325           index = nindex;
326       }
327       return -1;
328     }
329     /*
330      public static void main(String[] argv) {
331      try {
332      int[] shiftTable = new int[256];
333      initializeBoyerMoore(argv[0], shiftTable, true);
334      int o = -1;
335      CharacterIterator ite = new java.text.StringCharacterIterator(argv[1]);
336      long start = System.currentTimeMillis();
337      //for (int i = 0; i < 10000; i ++)
338      o = searchIgnoreCasesWithBoyerMoore(ite, 0, argv[0], shiftTable);
339      start = System.currentTimeMillis()-start;
340      System.out.println("Result: "+o+", Elapsed: "+start);
341      } catch (Exception ex) {
342      ex.printStackTrace();
343      }
344      }*/

345   }
346   
347   public static class Match implements Cloneable JavaDoc {
348     int[] beginpos = null;
349     int[] endpos = null;
350     int nofgroups = 0;
351
352     CharacterIterator JavaDoc ciSource = null;
353     String JavaDoc strSource = null;
354     char[] charSource = null;
355
356     /**
357      * Creates an instance.
358      */

359     public Match() {
360     }
361
362     /**
363      *
364      */

365     public synchronized Object JavaDoc clone() {
366         Match ma = new Match();
367         if (this.nofgroups > 0) {
368             ma.setNumberOfGroups(this.nofgroups);
369             if (this.ciSource != null) ma.setSource(this.ciSource);
370             if (this.strSource != null) ma.setSource(this.strSource);
371             for (int i = 0; i < this.nofgroups; i ++) {
372                 ma.setBeginning(i, this.getBeginning(i));
373                 ma.setEnd(i, this.getEnd(i));
374             }
375         }
376         return ma;
377     }
378
379     /**
380      *
381      */

382     protected void setNumberOfGroups(int n) {
383         int oldn = this.nofgroups;
384         this.nofgroups = n;
385         if (oldn <= 0
386             || oldn < n || n*2 < oldn) {
387             this.beginpos = new int[n];
388             this.endpos = new int[n];
389         }
390         for (int i = 0; i < n; i ++) {
391             this.beginpos[i] = -1;
392             this.endpos[i] = -1;
393         }
394     }
395
396     /**
397      *
398      */

399     protected void setSource(CharacterIterator JavaDoc ci) {
400         this.ciSource = ci;
401         this.strSource = null;
402         this.charSource = null;
403     }
404     /**
405      *
406      */

407     protected void setSource(String JavaDoc str) {
408         this.ciSource = null;
409         this.strSource = str;
410         this.charSource = null;
411     }
412     /**
413      *
414      */

415     protected void setSource(char[] chars) {
416         this.ciSource = null;
417         this.strSource = null;
418         this.charSource = chars;
419     }
420
421     /**
422      *
423      */

424     protected void setBeginning(int index, int v) {
425         this.beginpos[index] = v;
426     }
427
428     /**
429      *
430      */

431     protected void setEnd(int index, int v) {
432         this.endpos[index] = v;
433     }
434
435     /**
436      * Return the number of regular expression groups.
437      * This method returns 1 when the regular expression has no capturing-parenthesis.
438      */

439     public int getNumberOfGroups() {
440         if (this.nofgroups <= 0)
441             throw new IllegalStateException JavaDoc("A result is not set.");
442         return this.nofgroups;
443     }
444
445     /**
446      * Return a start position in the target text matched to specified regular expression group.
447      *
448      * @param index Less than <code>getNumberOfGroups()</code>.
449      */

450     public int getBeginning(int index) {
451         if (this.beginpos == null)
452             throw new IllegalStateException JavaDoc("A result is not set.");
453         if (index < 0 || this.nofgroups <= index)
454             throw new IllegalArgumentException JavaDoc("The parameter must be less than "
455                                                +this.nofgroups+": "+index);
456         return this.beginpos[index];
457     }
458
459     /**
460      * Return an end position in the target text matched to specified regular expression group.
461      *
462      * @param index Less than <code>getNumberOfGroups()</code>.
463      */

464     public int getEnd(int index) {
465         if (this.endpos == null)
466             throw new IllegalStateException JavaDoc("A result is not set.");
467         if (index < 0 || this.nofgroups <= index)
468             throw new IllegalArgumentException JavaDoc("The parameter must be less than "
469                                                +this.nofgroups+": "+index);
470         return this.endpos[index];
471     }
472
473     /**
474      * Return an substring of the target text matched to specified regular expression group.
475      *
476      * @param index Less than <code>getNumberOfGroups()</code>.
477      */

478     public String JavaDoc getCapturedText(int index) {
479         if (this.beginpos == null)
480             throw new IllegalStateException JavaDoc("match() has never been called.");
481         if (index < 0 || this.nofgroups <= index)
482             throw new IllegalArgumentException JavaDoc("The parameter must be less than "
483                                                +this.nofgroups+": "+index);
484         String JavaDoc ret;
485         int begin = this.beginpos[index], end = this.endpos[index];
486         if (begin < 0 || end < 0) return null;
487         if (this.ciSource != null) {
488             ret = REUtil.substring(this.ciSource, begin, end);
489         } else if (this.strSource != null) {
490             ret = this.strSource.substring(begin, end);
491         } else {
492             ret = new String JavaDoc(this.charSource, begin, end-begin);
493         }
494         return ret;
495     }
496   }
497   
498   public final static class REUtil {
499     private REUtil() {
500     }
501
502     static final int composeFromSurrogates(int high, int low) {
503         return 0x10000 + ((high-0xd800)<<10) + low-0xdc00;
504     }
505
506     static final boolean isLowSurrogate(int ch) {
507         return (ch & 0xfc00) == 0xdc00;
508     }
509
510     static final boolean isHighSurrogate(int ch) {
511         return (ch & 0xfc00) == 0xd800;
512     }
513
514     static final String JavaDoc decomposeToSurrogates(int ch) {
515         char[] chs = new char[2];
516         ch -= 0x10000;
517         chs[0] = (char)((ch>>10)+0xd800);
518         chs[1] = (char)((ch&0x3ff)+0xdc00);
519         return new String JavaDoc(chs);
520     }
521
522     static final String JavaDoc substring(CharacterIterator JavaDoc iterator, int begin, int end) {
523         char[] src = new char[end-begin];
524         for (int i = 0; i < src.length; i ++)
525             src[i] = iterator.setIndex(i+begin);
526         return new String JavaDoc(src);
527     }
528
529     // ================================================================
530

531     static final int getOptionValue(int ch) {
532         int ret = 0;
533         switch (ch) {
534           case 'i':
535             ret = RegularExpression.IGNORE_CASE;
536             break;
537           case 'm':
538             ret = RegularExpression.MULTIPLE_LINES;
539             break;
540           case 's':
541             ret = RegularExpression.SINGLE_LINE;
542             break;
543           case 'x':
544             ret = RegularExpression.EXTENDED_COMMENT;
545             break;
546           case 'u':
547             ret = RegularExpression.USE_UNICODE_CATEGORY;
548             break;
549           case 'w':
550             ret = RegularExpression.UNICODE_WORD_BOUNDARY;
551             break;
552           case 'F':
553             ret = RegularExpression.PROHIBIT_FIXED_STRING_OPTIMIZATION;
554             break;
555           case 'H':
556             ret = RegularExpression.PROHIBIT_HEAD_CHARACTER_OPTIMIZATION;
557             break;
558           case 'X':
559             ret = RegularExpression.XMLSCHEMA_MODE;
560             break;
561           case ',':
562             ret = RegularExpression.SPECIAL_COMMA;
563             break;
564           default:
565         }
566         return ret;
567     }
568
569     static final int parseOptions(String JavaDoc opts) throws ParseException {
570         if (opts == null) return 0;
571         int options = 0;
572         for (int i = 0; i < opts.length(); i ++) {
573             int v = getOptionValue(opts.charAt(i));
574             if (v == 0)
575                 throw new ParseException("Unknown Option: "+opts.substring(i), -1);
576             options |= v;
577         }
578         return options;
579     }
580
581     static final String JavaDoc createOptionString(int options) {
582         StringBuffer JavaDoc sb = new StringBuffer JavaDoc(9);
583         if ((options & RegularExpression.PROHIBIT_FIXED_STRING_OPTIMIZATION) != 0)
584             sb.append('F');
585         if ((options & RegularExpression.PROHIBIT_HEAD_CHARACTER_OPTIMIZATION) != 0)
586             sb.append('H');
587         if ((options & RegularExpression.XMLSCHEMA_MODE) != 0)
588             sb.append('X');
589         if ((options & RegularExpression.IGNORE_CASE) != 0)
590             sb.append('i');
591         if ((options & RegularExpression.MULTIPLE_LINES) != 0)
592             sb.append('m');
593         if ((options & RegularExpression.SINGLE_LINE) != 0)
594             sb.append('s');
595         if ((options & RegularExpression.USE_UNICODE_CATEGORY) != 0)
596             sb.append('u');
597         if ((options & RegularExpression.UNICODE_WORD_BOUNDARY) != 0)
598             sb.append('w');
599         if ((options & RegularExpression.EXTENDED_COMMENT) != 0)
600             sb.append('x');
601         if ((options & RegularExpression.SPECIAL_COMMA) != 0)
602             sb.append(',');
603         return sb.toString().intern();
604     }
605
606     // ================================================================
607

608     static String JavaDoc stripExtendedComment(String JavaDoc regex) {
609         int len = regex.length();
610         StringBuffer JavaDoc buffer = new StringBuffer JavaDoc(len);
611         int offset = 0;
612         while (offset < len) {
613             int ch = regex.charAt(offset++);
614                                                 // Skips a white space.
615
if (ch == '\t' || ch == '\n' || ch == '\f' || ch == '\r' || ch == ' ')
616                 continue;
617
618             if (ch == '#') { // Skips chracters between '#' and a line end.
619
while (offset < len) {
620                     ch = regex.charAt(offset++);
621                     if (ch == '\r' || ch == '\n')
622                         break;
623                 }
624                 continue;
625             }
626
627             int next; // Strips an escaped white space.
628
if (ch == '\\' && offset < len) {
629                 if ((next = regex.charAt(offset)) == '#'
630                     || next == '\t' || next == '\n' || next == '\f'
631                     || next == '\r' || next == ' ') {
632                     buffer.append((char)next);
633                     offset ++;
634                 } else { // Other escaped character.
635
buffer.append('\\');
636                     buffer.append((char)next);
637                     offset ++;
638                 }
639             } else // As is.
640
buffer.append((char)ch);
641         }
642         return buffer.toString();
643     }
644
645     // ================================================================
646

647     /**
648      * Sample entry.
649      * <div>Usage: <KBD>org.apache.xerces.utils.regex.REUtil &lt;regex&gt; &lt;string&gt;</KBD></div>
650      */

651     public static void main(String JavaDoc[] argv) {
652         String JavaDoc pattern = null;
653         try {
654             String JavaDoc options = "";
655             String JavaDoc target = null;
656             if( argv.length == 0 ) {
657                 System.out.println( "Error:Usage: java REUtil -i|-m|-s|-u|-w|-X regularExpression String" );
658                 System.exit( 0 );
659             }
660             for (int i = 0; i < argv.length; i ++) {
661                 if (argv[i].length() == 0 || argv[i].charAt(0) != '-') {
662                     if (pattern == null)
663                         pattern = argv[i];
664                     else if (target == null)
665                         target = argv[i];
666                     else
667                         System.err.println("Unnecessary: "+argv[i]);
668                 } else if (argv[i].equals("-i")) {
669                     options += "i";
670                 } else if (argv[i].equals("-m")) {
671                     options += "m";
672                 } else if (argv[i].equals("-s")) {
673                     options += "s";
674                 } else if (argv[i].equals("-u")) {
675                     options += "u";
676                 } else if (argv[i].equals("-w")) {
677                     options += "w";
678                 } else if (argv[i].equals("-X")) {
679                     options += "X";
680                 } else {
681                     System.err.println("Unknown option: "+argv[i]);
682                 }
683             }
684             RegularExpression reg = new RegularExpression(pattern, options);
685             System.out.println("RegularExpression: "+reg);
686             Match match = new Match();
687             reg.matches(target, match);
688             for (int i = 0; i < match.getNumberOfGroups(); i ++) {
689                 if (i == 0 ) System.out.print("Matched range for the whole pattern: ");
690                 else System.out.print("["+i+"]: ");
691                 if (match.getBeginning(i) < 0)
692                     System.out.println("-1");
693                 else {
694                     System.out.print(match.getBeginning(i)+", "+match.getEnd(i)+", ");
695                     System.out.println("\""+match.getCapturedText(i)+"\"");
696                 }
697             }
698         } catch (ParseException pe) {
699             if (pattern == null) {
700                 pe.printStackTrace();
701             } else {
702                 System.err.println("org.apache.xerces.utils.regex.ParseException: "+pe.getMessage());
703                 String JavaDoc indent = " ";
704                 System.err.println(indent+pattern);
705                 int loc = pe.getLocation();
706                 if (loc >= 0) {
707                     System.err.print(indent);
708                     for (int i = 0; i < loc; i ++) System.err.print("-");
709                     System.err.println("^");
710                 }
711             }
712         } catch (Exception JavaDoc e) {
713             e.printStackTrace();
714         }
715     }
716
717     static final int CACHESIZE = 20;
718     static final RegularExpression[] regexCache = new RegularExpression[CACHESIZE];
719     /**
720      * Creates a RegularExpression instance.
721      * This method caches created instances.
722      *
723      * @see RegularExpression#RegularExpression(String, String)
724      */

725     public static RegularExpression createRegex(String JavaDoc pattern, String JavaDoc options)
726         throws ParseException {
727         RegularExpression re = null;
728         int intOptions = REUtil.parseOptions(options);
729         synchronized (REUtil.regexCache) {
730             int i;
731             for (i = 0; i < REUtil.CACHESIZE; i ++) {
732                 RegularExpression cached = REUtil.regexCache[i];
733                 if (cached == null) {
734                     i = -1;
735                     break;
736                 }
737                 if (cached.equals(pattern, intOptions)) {
738                     re = cached;
739                     break;
740                 }
741             }
742             if (re != null) {
743                 if (i != 0) {
744                     System.arraycopy(REUtil.regexCache, 0, REUtil.regexCache, 1, i);
745                     REUtil.regexCache[0] = re;
746                 }
747             } else {
748                 re = new RegularExpression(pattern, options);
749                 System.arraycopy(REUtil.regexCache, 0, REUtil.regexCache, 1, REUtil.CACHESIZE-1);
750                 REUtil.regexCache[0] = re;
751             }
752         }
753         return re;
754     }
755
756     /**
757      *
758      * @see RegularExpression#matches(String)
759      */

760     public static boolean matches(String JavaDoc regex, String JavaDoc target) throws ParseException {
761         return REUtil.createRegex(regex, null).matches(target);
762     }
763
764     /**
765      *
766      * @see RegularExpression#matches(String)
767      */

768     public static boolean matches(String JavaDoc regex, String JavaDoc options, String JavaDoc target) throws ParseException {
769         return REUtil.createRegex(regex, options).matches(target);
770     }
771
772     // ================================================================
773

774     /**
775      *
776      */

777     public static String JavaDoc quoteMeta(String JavaDoc literal) {
778         int len = literal.length();
779         StringBuffer JavaDoc buffer = null;
780         for (int i = 0; i < len; i ++) {
781             int ch = literal.charAt(i);
782             if (".*+?{[()|\\^$".indexOf(ch) >= 0) {
783                 if (buffer == null) {
784                     buffer = new StringBuffer JavaDoc(i+(len-i)*2);
785                     if (i > 0) buffer.append(literal.substring(0, i));
786                 }
787                 buffer.append('\\');
788                 buffer.append((char)ch);
789             } else if (buffer != null)
790                 buffer.append((char)ch);
791         }
792         return buffer != null ? buffer.toString() : literal;
793     }
794
795     // ================================================================
796

797     static void dumpString(String JavaDoc v) {
798         for (int i = 0; i < v.length(); i ++) {
799             System.out.print(Integer.toHexString(v.charAt(i)));
800             System.out.print(" ");
801         }
802         System.out.println();
803     }
804 }
805   
806
807   /**
808    * A regular expression matching engine using Non-deterministic Finite Automaton (NFA).
809    * This engine does not conform to the POSIX regular expression.
810    *
811    * <hr width="50%">
812    * <h3>How to use</h3>
813    *
814    * <dl>
815    * <dt>A. Standard way
816    * <dd>
817    * <pre>
818    * RegularExpression re = new RegularExpression(<var>regex</var>);
819    * if (re.matches(text)) { ... }
820    * </pre>
821    *
822    * <dt>B. Capturing groups
823    * <dd>
824    * <pre>
825    * RegularExpression re = new RegularExpression(<var>regex</var>);
826    * Match match = new Match();
827    * if (re.matches(text, match)) {
828    * ... // You can refer captured texts with methods of the <code>Match</code> class.
829    * }
830    * </pre>
831    *
832    * </dl>
833    *
834    * <h4>Case-insensitive matching</h4>
835    * <pre>
836    * RegularExpression re = new RegularExpression(<var>regex</var>, "i");
837    * if (re.matches(text) >= 0) { ...}
838    * </pre>
839    *
840    * <h4>Options</h4>
841    * <p>You can specify options to <a HREF="#RegularExpression(java.lang.String, java.lang.String)"><code>RegularExpression(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>
842    * or <a HREF="#setPattern(java.lang.String, java.lang.String)"><code>setPattern(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>.
843    * This <var>options</var> parameter consists of the following characters.
844    * </p>
845    * <dl>
846    * <dt><a name="I_OPTION"><code>"i"</code></a>
847    * <dd>This option indicates case-insensitive matching.
848    * <dt><a name="M_OPTION"><code>"m"</code></a>
849    * <dd class="REGEX"><kbd>^</kbd> and <kbd>$</kbd> consider the EOL characters within the text.
850    * <dt><a name="S_OPTION"><code>"s"</code></a>
851    * <dd class="REGEX"><kbd>.</kbd> matches any one character.
852    * <dt><a name="U_OPTION"><code>"u"</code></a>
853    * <dd class="REGEX">Redefines <Kbd>\d \D \w \W \s \S \b \B \&lt; \></kbd> as becoming to Unicode.
854    * <dt><a name="W_OPTION"><code>"w"</code></a>
855    * <dd class="REGEX">By this option, <kbd>\b \B \&lt; \></kbd> are processed with the method of
856    * 'Unicode Regular Expression Guidelines' Revision 4.
857    * When "w" and "u" are specified at the same time,
858    * <kbd>\b \B \&lt; \></kbd> are processed for the "w" option.
859    * <dt><a name="COMMA_OPTION"><code>","</code></a>
860    * <dd>The parser treats a comma in a character class as a range separator.
861    * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option.
862    * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option.
863    *
864    * <dt><a name="X_OPTION"><code>"X"</code></a>
865    * <dd class="REGEX">
866    * By this option, the engine confoms to <a HREF="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>.
867    * The <code>match()</code> method does not do subsring matching
868    * but entire string matching.
869    *
870    * </dl>
871    *
872    * <hr width="50%">
873    * <h3>Syntax</h3>
874    * <table border="1" bgcolor="#ddeeff">
875    * <tr>
876    * <td>
877    * <h4>Differences from the Perl 5 regular expression</h4>
878    * <ul>
879    * <li>There is 6-digit hexadecimal character representation (<kbd>\v</kbd><var>HHHHHH</var>.)
880    * <li>Supports subtraction, union, and intersection operations for character classes.
881    * <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations),
882    * <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>,
883    * <kbd>\ u</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>,
884    * <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>,
885    * <Kbd>(?{<kbd><var>code</var><kbd>})</kbd>, <Kbd>(??{<kbd><var>code</var><kbd>})</kbd>
886    * </ul>
887    * </td>
888    * </tr>
889    * </table>
890    *
891    * <P>Meta characters are `<KBD>. * + ? { [ ( ) | \ ^ $</KBD>'.</P>
892    * <ul>
893    * <li>Character
894    * <dl>
895    * <dt class="REGEX"><kbd>.</kbd> (A period)
896    * <dd>Matches any one character except the following characters.
897    * <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D),
898    * PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028)
899    * <dd>This expression matches one code point in Unicode. It can match a pair of surrogates.
900    * <dd>When <a HREF="#S_OPTION">the "s" option</a> is specified,
901    * it matches any character including the above four characters.
902    *
903    * <dt class="REGEX"><Kbd>\e \f \n \r \t</kbd>
904    * <dd>Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A),
905    * CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009)
906    *
907    * <dt class="REGEX"><kbd>\c</kbd><var>C</var>
908    * <dd>Matches a control character.
909    * The <var>C</var> must be one of '<kbd>@</kbd>', '<kbd>A</kbd>'-'<kbd>Z</kbd>',
910    * '<kbd>[</kbd>', '<kbd>\</kbd>', '<kbd>]</kbd>', '<kbd>^</kbd>', '<kbd>_</kbd>'.
911    * It matches a control character of which the character code is less than
912    * the character code of the <var>C</var> by 0x0040.
913    * <dd class="REGEX">For example, a <kbd>\cJ</kbd> matches a LINE FEED (U+000A),
914    * and a <kbd>\c[</kbd> matches an ESCAPE (U+001B).
915    *
916    * <dt class="REGEX">a non-meta character
917    * <dd>Matches the character.
918    *
919    * <dt class="REGEX"><KBD>\</KBD> + a meta character
920    * <dd>Matches the meta character.
921    *
922    * <dt class="REGEX"><kbd>\x</kbd><var>HH</var> <kbd>\x{</kbd><var>HHHH</var><kbd>}</kbd>
923    * <dd>Matches a character of which code point is <var>HH</var> (Hexadecimal) in Unicode.
924    * You can write just 2 digits for <kbd>\x</kbd><var>HH</var>, and
925    * variable length digits for <kbd>\x{</kbd><var>HHHH</var><kbd>}</kbd>.
926    *
927    * <!--
928    * <dt class="REGEX"><kbd>\ u</kbd><var>HHHH</var>
929    * <dd>Matches a character of which code point is <var>HHHH</var> (Hexadecimal) in Unicode.
930    * -->
931    *
932    * <dt class="REGEX"><kbd>\v</kbd><var>HHHHHH</var>
933    * <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode.
934    *
935    * <dt class="REGEX"><kbd>\g</kbd>
936    * <dd>Matches a grapheme.
937    * <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd>
938    *
939    * <dt class="REGEX"><kbd>\X</kbd>
940    * <dd class="REGEX">Matches a combining character sequence.
941    * It is equivalent to <kbd>(?:\PM\pM*)</kbd>
942    * </dl>
943    * </li>
944    *
945    * <li>Character class
946    * <dl>
947   + * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without <a HREF="#COMMA_OPTION">"," option</a>)
948   + * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with <a HREF="#COMMA_OPTION">"," option</a>)
949    * <dd>Positive character class. It matches a character in ranges.
950    * <dd><var>R<sub>n</sub></var>:
951    * <ul>
952    * <li class="REGEX">A character (including <Kbd>\e \f \n \r \t</kbd> <kbd>\x</kbd><var>HH</var> <kbd>\x{</kbd><var>HHHH</var><kbd>}</kbd> <!--kbd>\ u</kbd><var>HHHH</var--> <kbd>\v</kbd><var>HHHHHH</var>)
953    * <p>This range matches the character.
954    * <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var>
955    * <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and &lt;= <var>C<sub>2</sub></var>'s code point.
956   + * <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>,
957   + * and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd>
958    * <p>...
959    * <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd>
960    * <p>These expressions specifies the same ranges as the following expressions.
961    * </ul>
962    * <p class="REGEX">Enumerated ranges are merged (union operation).
963    * <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd>
964    *
965    * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a HREF="#COMMA_OPTION">"," option</a>)
966    * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a HREF="#COMMA_OPTION">"," option</a>)
967    * <dd>Negative character class. It matches a character not in ranges.
968    *
969    * <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd>
970    * (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.)
971    * <dd>Subtraction or union or intersection for character classes.
972    * <dd class="REGEX">For exmaple, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>.
973    * <dd>The result of this operations is a <u>positive character class</u>
974    * even if an expression includes any negative character classes.
975    * You have to take care on this in case-insensitive matching.
976    * For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>,
977    * which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching.
978    * But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because
979    * it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>'
980    * though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>.
981    *
982    * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a HREF="#X_OPTION">"X" option</a>)</dt>
983    * <dd>Character class subtraction for the XML Schema.
984    * You can use this syntax when you specify an <a HREF="#X_OPTION">"X" option</a>.
985    *
986    * <dt class="REGEX"><kbd>\d</kbd>
987    * <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>.
988    * <dd>When <a HREF="#U_OPTION">a "u" option</a> is set, it is equivalent to
989    * <span class="REGEX"><kbd>\p{Nd}</kbd></span>.
990    *
991    * <dt class="REGEX"><kbd>\D</kbd>
992    * <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd>
993    * <dd>When <a HREF="#U_OPTION">a "u" option</a> is set, it is equivalent to
994    * <span class="REGEX"><kbd>\P{Nd}</kbd></span>.
995    *
996    * <dt class="REGEX"><kbd>\s</kbd>
997    * <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd>
998    * <dd>When <a HREF="#U_OPTION">a "u" option</a> is set, it is equivalent to
999    * <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>.
1000   *
1001   * <dt class="REGEX"><kbd>\S</kbd>
1002   * <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd>
1003   * <dd>When <a HREF="#U_OPTION">a "u" option</a> is set, it is equivalent to
1004   * <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>.
1005   *
1006   * <dt class="REGEX"><kbd>\w</kbd>
1007   * <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd>
1008   * <dd>When <a HREF="#U_OPTION">a "u" option</a> is set, it is equivalent to
1009   * <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
1010   *
1011   * <dt class="REGEX"><kbd>\W</kbd>
1012   * <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd>
1013   * <dd>When <a HREF="#U_OPTION">a "u" option</a> is set, it is equivalent to
1014   * <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
1015   *
1016   * <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd>
1017   * <dd>Matches one character in the specified General Category (the second field in <a HREF="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a HREF="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>.
1018   * The following names are available:
1019   * <dl>
1020   * <dt>Unicode General Categories:
1021   * <dd><kbd>
1022   * L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs, Zl, Zp,
1023   * Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So,
1024   * </kbd>
1025   * <dd>(Currently the Cn category includes U+10000-U+10FFFF characters)
1026   * <dt>Unicode Blocks:
1027   * <dd><kbd>
1028   * Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B,
1029   * IPA Extensions, Spacing Modifier Letters, Combining Diacritical Marks, Greek,
1030   * Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi, Gujarati,
1031   * Oriya, Tamil, Telugu, Kannada, Malayalam, Thai, Lao, Tibetan, Georgian,
1032   * Hangul Jamo, Latin Extended Additional, Greek Extended, General Punctuation,
1033   * Superscripts and Subscripts, Currency Symbols, Combining Marks for Symbols,
1034   * Letterlike Symbols, Number Forms, Arrows, Mathematical Operators,
1035   * Miscellaneous Technical, Control Pictures, Optical Character Recognition,
1036   * Enclosed Alphanumerics, Box Drawing, Block Elements, Geometric Shapes,
1037   * Miscellaneous Symbols, Dingbats, CJK Symbols and Punctuation, Hiragana,
1038   * Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun,
1039   * Enclosed CJK Letters and Months, CJK Compatibility, CJK Unified Ideographs,
1040   * Hangul Syllables, High Surrogates, High Private Use Surrogates, Low Surrogates,
1041   * Private Use, CJK Compatibility Ideographs, Alphabetic Presentation Forms,
1042   * Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility Forms,
1043   * Small Form Variants, Arabic Presentation Forms-B, Specials,
1044   * Halfwidth and Fullwidth Forms
1045   * </kbd>
1046   * <dt>Others:
1047   * <dd><kbd>ALL</kbd> (Equivalent to <kbd>[\u0000-\v10FFFF]</kbd>)
1048   * <dd><kbd>ASSGINED</kbd> (<kbd>\p{ASSIGNED}</kbd> is equivalent to <kbd>\P{Cn}</kbd>)
1049   * <dd><kbd>UNASSGINED</kbd>
1050   * (<kbd>\p{UNASSIGNED}</kbd> is equivalent to <kbd>\p{Cn}</kbd>)
1051   * </dl>
1052   *
1053   * <dt class="REGEX"><kbd>\P{</kbd><var>name</var><kbd>}</kbd>
1054   * <dd>Matches one character not in the specified General Category or the specified Block.
1055   * </dl>
1056   * </li>
1057   *
1058   * <li>Selection and Quantifier
1059   * <dl>
1060   * <dt class="REGEX"><VAR>X</VAR><kbd>|</kbd><VAR>Y</VAR>
1061   * <dd>...
1062   *
1063   * <dt class="REGEX"><VAR>X</VAR><kbd>*</KBD>
1064   * <dd>Matches 0 or more <var>X</var>.
1065   *
1066   * <dt class="REGEX"><VAR>X</VAR><kbd>+</KBD>
1067   * <dd>Matches 1 or more <var>X</var>.
1068   *
1069   * <dt class="REGEX"><VAR>X</VAR><kbd>?</KBD>
1070   * <dd>Matches 0 or 1 <var>X</var>.
1071   *
1072   * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>number</var><kbd>}</kbd>
1073   * <dd>Matches <var>number</var> times.
1074   *
1075   * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}</kbd>
1076   * <dd>...
1077   *
1078   * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}</kbd>
1079   * <dd>...
1080   *
1081   * <dt class="REGEX"><VAR>X</VAR><kbd>*?</kbd>
1082   * <dt class="REGEX"><VAR>X</VAR><kbd>+?</kbd>
1083   * <dt class="REGEX"><VAR>X</VAR><kbd>??</kbd>
1084   * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}?</kbd>
1085   * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}?</kbd>
1086   * <dd>Non-greedy matching.
1087   * </dl>
1088   * </li>
1089   *
1090   * <li>Grouping, Capturing, and Back-reference
1091   * <dl>
1092   * <dt class="REGEX"><KBD>(?:</kbd><VAR>X</VAR><kbd>)</KBD>
1093   * <dd>Grouping. "<KBD>foo+</KBD>" matches "<KBD>foo</KBD>" or "<KBD>foooo</KBD>".
1094   * If you want it matches "<KBD>foofoo</KBD>" or "<KBD>foofoofoo</KBD>",
1095   * you have to write "<KBD>(?:foo)+</KBD>".
1096   *
1097   * <dt class="REGEX"><KBD>(</kbd><VAR>X</VAR><kbd>)</KBD>
1098   * <dd>Grouping with capturing.
1099   * It make a group and applications can know
1100   * where in target text a group matched with methods of a <code>Match</code> instance
1101   * after <code><a HREF="#matches(java.lang.String, org.apache.xerces.utils.regex.Match)">matches(String,Match)</a></code>.
1102   * The 0th group means whole of this regular expression.
1103   * The <VAR>N</VAR>th gorup is the inside of the <VAR>N</VAR>th left parenthesis.
1104   *
1105   * <p>For instance, a regular expression is
1106   * "<FONT color=blue><KBD> *([^&lt;:]*) +&lt;([^&gt;]*)&gt; *</KBD></FONT>"
1107   * and target text is
1108   * "<FONT color=red><KBD>From: TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>":
1109   * <ul>
1110   * <li><code>Match.getCapturedText(0)</code>:
1111   * "<FONT color=red><KBD> TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>"
1112   * <li><code>Match.getCapturedText(1)</code>: "<FONT color=red><KBD>TAMURA Kent</KBD></FONT>"
1113   * <li><code>Match.getCapturedText(2)</code>: "<FONT color=red><KBD>kent@trl.ibm.co.jp</KBD></FONT>"
1114   * </ul>
1115   *
1116   * <dt class="REGEX"><kbd>\1 \2 \3 \4 \5 \6 \7 \8 \9</kbd>
1117   * <dd>
1118   *
1119   * <dt class="REGEX"><kbd>(?></kbd><var>X</var><kbd>)</kbd>
1120   * <dd>Independent expression group. ................
1121   *
1122   * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
1123   * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
1124   * <dd>............................
1125   * <dd>The <var>options</var> or the <var>options2</var> consists of 'i' 'm' 's' 'w'.
1126   * Note that it can not contain 'u'.
1127   *
1128   * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>)</kbd>
1129   * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>)</kbd>
1130   * <dd>......
1131   * <dd>These expressions must be at the beginning of a group.
1132   * </dl>
1133   * </li>
1134   *
1135   * <li>Anchor
1136   * <dl>
1137   * <dt class="REGEX"><kbd>\A</kbd>
1138   * <dd>Matches the beginnig of the text.
1139   *
1140   * <dt class="REGEX"><kbd>\Z</kbd>
1141   * <dd>Matches the end of the text, or before an EOL character at the end of the text,
1142   * or CARRIAGE RETURN + LINE FEED at the end of the text.
1143   *
1144   * <dt class="REGEX"><kbd>\z</kbd>
1145   * <dd>Matches the end of the text.
1146   *
1147   * <dt class="REGEX"><kbd>^</kbd>
1148   * <dd>Matches the beginning of the text. It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>.
1149   * <dd>When <a HREF="#M_OPTION">a "m" option</a> is set,
1150   * it matches the beginning of the text, or after one of EOL characters (
1151   * LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028),
1152   * PARAGRAPH SEPARATOR (U+2029).)
1153   *
1154   * <dt class="REGEX"><kbd>$</kbd>
1155   * <dd>Matches the end of the text, or before an EOL character at the end of the text,
1156   * or CARRIAGE RETURN + LINE FEED at the end of the text.
1157   * <dd>When <a HREF="#M_OPTION">a "m" option</a> is set,
1158   * it matches the end of the text, or before an EOL character.
1159   *
1160   * <dt class="REGEX"><kbd>\b</kbd>
1161   * <dd>Matches word boundary.
1162   * (See <a HREF="#W_OPTION">a "w" option</a>)
1163   *
1164   * <dt class="REGEX"><kbd>\B</kbd>
1165   * <dd>Matches non word boundary.
1166   * (See <a HREF="#W_OPTION">a "w" option</a>)
1167   *
1168   * <dt class="REGEX"><kbd>\&lt;</kbd>
1169   * <dd>Matches the beginning of a word.
1170   * (See <a HREF="#W_OPTION">a "w" option</a>)
1171   *
1172   * <dt class="REGEX"><kbd>\&gt;</kbd>
1173   * <dd>Matches the end of a word.
1174   * (See <a HREF="#W_OPTION">a "w" option</a>)
1175   * </dl>
1176   * </li>
1177   * <li>Lookahead and lookbehind
1178   * <dl>
1179   * <dt class="REGEX"><kbd>(?=</kbd><var>X</var><kbd>)</kbd>
1180   * <dd>Lookahead.
1181   *
1182   * <dt class="REGEX"><kbd>(?!</kbd><var>X</var><kbd>)</kbd>
1183   * <dd>Negative lookahead.
1184   *
1185   * <dt class="REGEX"><kbd>(?&lt;=</kbd><var>X</var><kbd>)</kbd>
1186   * <dd>Lookbehind.
1187   * <dd>(Note for text capturing......)
1188   *
1189   * <dt class="REGEX"><kbd>(?&lt;!</kbd><var>X</var><kbd>)</kbd>
1190   * <dd>Negative lookbehind.
1191   * </dl>
1192   * </li>
1193   *
1194   * <li>Misc.
1195   * <dl>
1196   * <dt class="REGEX"><kbd>(?(</Kbd><var>condition</var><Kbd>)</kbd><var>yes-pattern</var><kbd>|</kbd><var>no-pattern</var><kbd>)</kbd>,
1197   * <dt class="REGEX"><kbd>(?(</kbd><var>condition</var><kbd>)</kbd><var>yes-pattern</var><kbd>)</kbd>
1198   * <dd>......
1199   * <dt class="REGEX"><kbd>(?#</kbd><var>comment</var><kbd>)</kbd>
1200   * <dd>Comment. A comment string consists of characters except '<kbd>)</kbd>'.
1201   * You can not write comments in character classes and before quantifiers.
1202   * </dl>
1203   * </li>
1204   * </ul>
1205   *
1206   *
1207   * <hr width="50%">
1208   * <h3>BNF for the regular expression</h3>
1209   * <pre>
1210   * regex ::= ('(?' options ')')? term ('|' term)*
1211   * term ::= factor+
1212   * factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )?
1213   * | '(?#' [^)]* ')'
1214   * minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}'
1215   * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
1216   * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X'
1217   * | '(?>' regex ')' | '(?' options ':' regex ')'
1218   * | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')'
1219   * options ::= [imsw]* ('-' [imsw]+)?
1220   * anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\&lt;' | '\>'
1221   * looks ::= '(?=' regex ')' | '(?!' regex ')'
1222   * | '(?&lt;=' regex ')' | '(?&lt;!' regex ')'
1223   * char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1
1224   * category-block ::= '\' [pP] category-symbol-1
1225   * | ('\p{' | '\P{') (category-symbol | block-name
1226   * | other-properties) '}'
1227   * category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S'
1228   * category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | Lo'
1229   * | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No'
1230   * | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs'
1231   * | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po'
1232   * | 'Sm' | 'Sc' | 'Sk' | 'So'
1233   * block-name ::= (See above)
1234   * other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED'
1235   * character-1 ::= (any character except meta-characters)
1236   *
1237   * char-class ::= '[' ranges ']'
1238   * | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')'
1239   * ranges ::= '^'? (range <a HREF="#COMMA_OPTION">','?</a>)+
1240   * range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block
1241   * | range-char | range-char '-' range-char
1242   * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2
1243   * code-point ::= '\x' hex-char hex-char
1244   * | '\x{' hex-char+ '}'
1245   * <!-- | '\ u' hex-char hex-char hex-char hex-char
1246   * --> | '\v' hex-char hex-char hex-char hex-char hex-char hex-char
1247   * hex-char ::= [0-9a-fA-F]
1248   * character-2 ::= (any character except \[]-,)
1249   * </pre>
1250   *
1251   * <hr width="50%">
1252   * <h3>to do</h3>
1253   * <ul>
1254   * <li><a HREF="http://www.unicode.org/unicode/reports/tr18/">Unicode Regular Expression Guidelines</a>
1255   * <ul>
1256   * <li>2.4 Canonical Equivalents
1257   * <li>Level 3
1258   * </ul>
1259   * <li>Parsing performance
1260   * </ul>
1261   *
1262   * <hr width="50%">
1263   *
1264   * @author TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;
1265   * @version $Id: RegEx.java,v 1.8 2005/06/12 13:29:22 emerks Exp $
1266   */

1267  public static class RegularExpression implements java.io.Serializable JavaDoc {
1268      static final boolean DEBUG = false;
1269
1270      /**
1271       * Compiles a token tree into an operation flow.
1272       */

1273      private synchronized void compile(Token tok) {
1274          if (this.operations != null)
1275              return;
1276          this.numberOfClosures = 0;
1277          this.operations = this.compile(tok, null, false);
1278      }
1279
1280      /**
1281       * Converts a token to an operation.
1282       */

1283      private Op compile(Token tok, Op next, boolean reverse) {
1284          Op ret;
1285          switch (tok.type) {
1286          case Token.DOT:
1287              ret = Op.createDot();
1288              ret.next = next;
1289              break;
1290
1291          case Token.CHAR:
1292              ret = Op.createChar(tok.getChar());
1293              ret.next = next;
1294              break;
1295
1296          case Token.ANCHOR:
1297              ret = Op.createAnchor(tok.getChar());
1298              ret.next = next;
1299              break;
1300
1301          case Token.RANGE:
1302          case Token.NRANGE:
1303              ret = Op.createRange(tok);
1304              ret.next = next;
1305              break;
1306
1307          case Token.CONCAT:
1308              ret = next;
1309              if (!reverse) {
1310                  for (int i = tok.size()-1; i >= 0; i --) {
1311                      ret = compile(tok.getChild(i), ret, false);
1312                  }
1313              } else {
1314                  for (int i = 0; i < tok.size(); i ++) {
1315                      ret = compile(tok.getChild(i), ret, true);
1316                  }
1317              }
1318              break;
1319
1320          case Token.UNION:
1321              Op.UnionOp uni = Op.createUnion(tok.size());
1322              for (int i = 0; i < tok.size(); i ++) {
1323                  uni.addElement(compile(tok.getChild(i), next, reverse));
1324              }
1325              ret = uni; // ret.next is null.
1326
break;
1327
1328          case Token.CLOSURE:
1329          case Token.NONGREEDYCLOSURE:
1330              Token child = tok.getChild(0);
1331              int min = tok.getMin();
1332              int max = tok.getMax();
1333              if (min >= 0 && min == max) { // {n}
1334
ret = next;
1335                  for (int i = 0; i < min; i ++) {
1336                      ret = compile(child, ret, reverse);
1337                  }
1338                  break;
1339              }
1340              if (min > 0 && max > 0)
1341                  max -= min;
1342              if (max > 0) {
1343                  // X{2,6} -> XX(X(X(XX?)?)?)?
1344
ret = next;
1345                  for (int i = 0; i < max; i ++) {
1346                      Op.ChildOp q = Op.createQuestion(tok.type == Token.NONGREEDYCLOSURE);
1347                      q.next = next;
1348                      q.setChild(compile(child, ret, reverse));
1349                      ret = q;
1350                  }
1351              } else {
1352                  Op.ChildOp op;
1353                  if (tok.type == Token.NONGREEDYCLOSURE) {
1354                      op = Op.createNonGreedyClosure();
1355                  } else { // Token.CLOSURE
1356
if (child.getMinLength() == 0)
1357                          op = Op.createClosure(this.numberOfClosures++);
1358                      else
1359                          op = Op.createClosure(-1);
1360                  }
1361                  op.next = next;
1362                  op.setChild(compile(child, op, reverse));
1363                  ret = op;
1364              }
1365              if (min > 0) {
1366                  for (int i = 0; i < min; i ++) {
1367                      ret = compile(child, ret, reverse);
1368                  }
1369              }
1370              break;
1371
1372          case Token.EMPTY:
1373              ret = next;
1374              break;
1375
1376          case Token.STRING:
1377              ret = Op.createString(tok.getString());
1378              ret.next = next;
1379              break;
1380
1381          case Token.BACKREFERENCE:
1382              ret = Op.createBackReference(tok.getReferenceNumber());
1383              ret.next = next;
1384              break;
1385
1386          case Token.PAREN:
1387              if (tok.getParenNumber() == 0) {
1388                  ret = compile(tok.getChild(0), next, reverse);
1389              } else if (reverse) {
1390                  next = Op.createCapture(tok.getParenNumber(), next);
1391                  next = compile(tok.getChild(0), next, reverse);
1392                  ret = Op.createCapture(-tok.getParenNumber(), next);
1393              } else {
1394                  next = Op.createCapture(-tok.getParenNumber(), next);
1395                  next = compile(tok.getChild(0), next, reverse);
1396                  ret = Op.createCapture(tok.getParenNumber(), next);
1397              }
1398              break;
1399
1400          case Token.LOOKAHEAD:
1401              ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok.getChild(0), null, false));
1402              break;
1403          case Token.NEGATIVELOOKAHEAD:
1404              ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok.getChild(0), null, false));
1405              break;
1406          case Token.LOOKBEHIND:
1407              ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok.getChild(0), null, true));
1408              break;
1409          case Token.NEGATIVELOOKBEHIND:
1410              ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(tok.getChild(0), null, true));
1411              break;
1412
1413          case Token.INDEPENDENT:
1414              ret = Op.createIndependent(next, compile(tok.getChild(0), null, reverse));
1415              break;
1416
1417          case Token.MODIFIERGROUP:
1418              ret = Op.createModifier(next, compile(tok.getChild(0), null, reverse),
1419                                      ((Token.ModifierToken)tok).getOptions(),
1420                                      ((Token.ModifierToken)tok).getOptionsMask());
1421              break;
1422
1423          case Token.CONDITION:
1424              Token.ConditionToken ctok = (Token.ConditionToken)tok;
1425              int ref = ctok.refNumber;
1426              Op condition = ctok.condition == null ? null : compile(ctok.condition, null, reverse);
1427              Op yes = compile(ctok.yes, next, reverse);
1428              Op no = ctok.no == null ? null : compile(ctok.no, next, reverse);
1429              ret = Op.createCondition(next, ref, condition, yes, no);
1430              break;
1431
1432          default:
1433              throw new RuntimeException JavaDoc("Unknown token type: "+tok.type);
1434          } // switch (tok.type)
1435
return ret;
1436      }
1437
1438
1439// Public
1440

1441      /**
1442       * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
1443       *
1444       * @return true if the target is matched to this regular expression.
1445       */

1446      public boolean matches(char[] target) {
1447          return this.matches(target, 0, target .length , (Match)null);
1448      }
1449
1450      /**
1451       * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
1452       * in specified range or not.
1453       *
1454       * @param start Start offset of the range.
1455       * @param end End offset +1 of the range.
1456       * @return true if the target is matched to this regular expression.
1457       */

1458      public boolean matches(char[] target, int start, int end) {
1459          return this.matches(target, start, end, (Match)null);
1460      }
1461
1462      /**
1463       * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
1464       *
1465       * @param match A Match instance for storing matching result.
1466       * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
1467       */

1468      public boolean matches(char[] target, Match match) {
1469          return this.matches(target, 0, target .length , match);
1470      }
1471
1472
1473      /**
1474       * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
1475       * in specified range or not.
1476       *
1477       * @param start Start offset of the range.
1478       * @param end End offset +1 of the range.
1479       * @param match A Match instance for storing matching result.
1480       * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
1481       */

1482      public boolean matches(char[] target, int start, int end, Match match) {
1483
1484          synchronized (this) {
1485              if (this.operations == null)
1486                  this.prepare();
1487              if (this.context == null)
1488                  this.context = new Context();
1489          }
1490          Context con = null;
1491          synchronized (this.context) {
1492              con = this.context.inuse ? new Context() : this.context;
1493              con.reset(target, start, end, this.numberOfClosures);
1494          }
1495          if (match != null) {
1496              match.setNumberOfGroups(this.nofparen);
1497              match.setSource(target);
1498          } else if (this.hasBackReferences) {
1499              match = new Match();
1500              match.setNumberOfGroups(this.nofparen);
1501              // Need not to call setSource() because
1502
// a caller can not access this match instance.
1503
}
1504          con.match = match;
1505
1506          if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
1507              int matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options);
1508              //System.err.println("DEBUG: matchEnd="+matchEnd);
1509
if (matchEnd == con.limit) {
1510                  if (con.match != null) {
1511                      con.match.setBeginning(0, con.start);
1512                      con.match.setEnd(0, matchEnd);
1513                  }
1514                  con.inuse = false;
1515                  return true;
1516              }
1517              return false;
1518          }
1519
1520          /*
1521           * The pattern has only fixed string.
1522           * The engine uses Boyer-Moore.
1523           */

1524          if (this.fixedStringOnly) {
1525              //System.err.println("DEBUG: fixed-only: "+this.fixedString);
1526
int o = this.fixedStringTable.matches(target, con.start, con.limit);
1527              if (o >= 0) {
1528                  if (con.match != null) {
1529                      con.match.setBeginning(0, o);
1530                      con.match.setEnd(0, o+this.fixedString.length());
1531                  }
1532                  con.inuse = false;
1533                  return true;
1534              }
1535              con.inuse = false;
1536              return false;
1537          }
1538
1539          /*
1540           * The pattern contains a fixed string.
1541           * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
1542           * If not, it return with false.
1543           */

1544          if (this.fixedString != null) {
1545              int o = this.fixedStringTable.matches(target, con.start, con.limit);
1546              if (o < 0) {
1547                  //System.err.println("Non-match in fixed-string search.");
1548
con.inuse = false;
1549                  return false;
1550              }
1551          }
1552
1553          int limit = con.limit-this.minlength;
1554          int matchStart;
1555          int matchEnd = -1;
1556
1557          /*
1558           * Checks whether the expression starts with ".*".
1559           */

1560          if (this.operations != null
1561              && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
1562              if (isSet(this.options, SINGLE_LINE)) {
1563                  matchStart = con.start;
1564                  matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options);
1565              } else {
1566                  boolean previousIsEOL = true;
1567                  for (matchStart = con.start; matchStart <= limit; matchStart ++) {
1568                      int ch = target [ matchStart ] ;
1569                      if (isEOLChar(ch)) {
1570                          previousIsEOL = true;
1571                      } else {
1572                          if (previousIsEOL) {
1573                              if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
1574                                                                         matchStart, 1, this.options)))
1575                                  break;
1576                          }
1577                          previousIsEOL = false;
1578                      }
1579                  }
1580              }
1581          }
1582
1583          /*
1584           * Optimization against the first character.
1585           */

1586          else if (this.firstChar != null) {
1587              //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
1588
RangeToken range = this.firstChar;
1589              if (RegularExpression.isSet(this.options, IGNORE_CASE)) {
1590                  range = this.firstChar.getCaseInsensitiveToken();
1591                  for (matchStart = con.start; matchStart <= limit; matchStart ++) {
1592                      int ch = target [ matchStart ] ;
1593                      if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
1594                          ch = REUtil.composeFromSurrogates(ch, target [ matchStart+1 ] );
1595                          if (!range.match(ch)) continue;
1596                      } else {
1597                          if (!range.match(ch)) {
1598                              char ch1 = Character.toUpperCase((char)ch);
1599                              if (!range.match(ch1))
1600                                  if (!range.match(Character.toLowerCase(ch1)))
1601                                      continue;
1602                          }
1603                      }
1604                      if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
1605                                                                 matchStart, 1, this.options)))
1606                          break;
1607                  }
1608              } else {
1609                  for (matchStart = con.start; matchStart <= limit; matchStart ++) {
1610                      int ch = target [ matchStart ] ;
1611                      if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
1612                          ch = REUtil.composeFromSurrogates(ch, target [ matchStart+1 ] );
1613                      if (!range.match(ch)) continue;
1614                      if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
1615                                                                 matchStart, 1, this.options)))
1616                          break;
1617                  }
1618              }
1619          }
1620
1621          /*
1622           * Straightforward matching.
1623           */

1624          else {
1625              for (matchStart = con.start; matchStart <= limit; matchStart ++) {
1626                  if (0 <= (matchEnd = this. matchCharArray (con, this.operations, matchStart, 1, this.options)))
1627                      break;
1628              }
1629          }
1630
1631          if (matchEnd >= 0) {
1632              if (con.match != null) {
1633                  con.match.setBeginning(0, matchStart);
1634                  con.match.setEnd(0, matchEnd);
1635              }
1636              con.inuse = false;
1637              return true;
1638          } else {
1639              con.inuse = false;
1640              return false;
1641          }
1642      }
1643
1644  /**
1645   * @return -1 when not match; offset of the end of matched string when match.
1646   */

1647      private int matchCharArray (Context con, Op op, int offset, int dx, int opts) {
1648
1649          char[] target = con.charTarget;
1650
1651
1652          while (true) {
1653              if (op == null)
1654                  return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
1655              if (offset > con.limit || offset < con.start)
1656                  return -1;
1657              switch (op.type) {
1658              case Op.CHAR:
1659                  if (isSet(opts, IGNORE_CASE)) {
1660                      int ch = op.getData();
1661                      if (dx > 0) {
1662                          if (offset >= con.limit || !matchIgnoreCase(ch, target [ offset ] ))
1663                              return -1;
1664                          offset ++;
1665                      } else {
1666                          int o1 = offset-1;
1667                          if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch, target [ o1 ] ))
1668                              return -1;
1669                          offset = o1;
1670                      }
1671                  } else {
1672                      int ch = op.getData();
1673                      if (dx > 0) {
1674                          if (offset >= con.limit || ch != target [ offset ] )
1675                              return -1;
1676                          offset ++;
1677                      } else {
1678                          int o1 = offset-1;
1679                          if (o1 >= con.limit || o1 < 0 || ch != target [ o1 ] )
1680                              return -1;
1681                          offset = o1;
1682                      }
1683                  }
1684                  op = op.next;
1685                  break;
1686
1687              case Op.DOT:
1688                  if (dx > 0) {
1689                      if (offset >= con.limit)
1690                          return -1;
1691                      int ch = target [ offset ] ;
1692                      if (isSet(opts, SINGLE_LINE)) {
1693                          if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
1694                              offset ++;
1695                      } else {
1696                          if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
1697                              ch = REUtil.composeFromSurrogates(ch, target [ ++offset ] );
1698                          if (isEOLChar(ch))
1699                              return -1;
1700                      }
1701                      offset ++;
1702                  } else {
1703                      int o1 = offset-1;
1704                      if (o1 >= con.limit || o1 < 0)
1705                          return -1;
1706                      int ch = target [ o1 ] ;
1707                      if (isSet(opts, SINGLE_LINE)) {
1708                          if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
1709                              o1 --;
1710                      } else {
1711                          if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
1712                              ch = REUtil.composeFromSurrogates( target [ --o1 ] , ch);
1713                          if (!isEOLChar(ch))
1714                              return -1;
1715                      }
1716                      offset = o1;
1717                  }
1718                  op = op.next;
1719                  break;
1720
1721              case Op.RANGE:
1722              case Op.NRANGE:
1723                  if (dx > 0) {
1724                      if (offset >= con.limit)
1725                          return -1;
1726                      int ch = target [ offset ] ;
1727                      if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
1728                          ch = REUtil.composeFromSurrogates(ch, target [ ++offset ] );
1729                      RangeToken tok = op.getToken();
1730                      if (isSet(opts, IGNORE_CASE)) {
1731                          tok = tok.getCaseInsensitiveToken();
1732                          if (!tok.match(ch)) {
1733                              if (ch >= 0x10000) return -1;
1734                              char uch;
1735                              if (!tok.match(uch = Character.toUpperCase((char)ch))
1736                                  && !tok.match(Character.toLowerCase(uch)))
1737                                  return -1;
1738                          }
1739                      } else {
1740                          if (!tok.match(ch)) return -1;
1741                      }
1742                      offset ++;
1743                  } else {
1744                      int o1 = offset-1;
1745                      if (o1 >= con.limit || o1 < 0)
1746                          return -1;
1747                      int ch = target [ o1 ] ;
1748                      if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
1749                          ch = REUtil.composeFromSurrogates( target [ --o1 ] , ch);
1750                      RangeToken tok = op.getToken();
1751                      if (isSet(opts, IGNORE_CASE)) {
1752                          tok = tok.getCaseInsensitiveToken();
1753                          if (!tok.match(ch)) {
1754                              if (ch >= 0x10000) return -1;
1755                              char uch;
1756                              if (!tok.match(uch = Character.toUpperCase((char)ch))
1757                                  && !tok.match(Character.toLowerCase(uch)))
1758                                  return -1;
1759                          }
1760                      } else {
1761                          if (!tok.match(ch)) return -1;
1762                      }
1763                      offset = o1;
1764                  }
1765                  op = op.next;
1766                  break;
1767
1768              case Op.ANCHOR:
1769                  boolean go = false;
1770                  switch (op.getData()) {
1771                  case '^':
1772                      if (isSet(opts, MULTIPLE_LINES)) {
1773                          if (!(offset == con.start
1774                                || offset > con.start && isEOLChar( target [ offset-1 ] )))
1775                              return -1;
1776                      } else {
1777                          if (offset != con.start)
1778                              return -1;
1779                      }
1780                      break;
1781
1782                  case '@': // Internal use only.
1783
// The @ always matches line beginnings.
1784
if (!(offset == con.start
1785                            || offset > con.start && isEOLChar( target [ offset-1 ] )))
1786                          return -1;
1787                      break;
1788
1789                  case '$':
1790                      if (isSet(opts, MULTIPLE_LINES)) {
1791                          if (!(offset == con.limit
1792                                || offset < con.limit && isEOLChar( target [ offset ] )))
1793                              return -1;
1794                      } else {
1795                          if (!(offset == con.limit
1796                                || offset+1 == con.limit && isEOLChar( target [ offset ] )
1797                                || offset+2 == con.limit && target [ offset ] == CARRIAGE_RETURN
1798                                && target [ offset+1 ] == LINE_FEED))
1799                              return -1;
1800                      }
1801                      break;
1802
1803                  case 'A':
1804                      if (offset != con.start) return -1;
1805                      break;
1806
1807                  case 'Z':
1808                      if (!(offset == con.limit
1809                            || offset+1 == con.limit && isEOLChar( target [ offset ] )
1810                            || offset+2 == con.limit && target [ offset ] == CARRIAGE_RETURN
1811                            && target [ offset+1 ] == LINE_FEED))
1812                          return -1;
1813                      break;
1814
1815                  case 'z':
1816                      if (offset != con.limit) return -1;
1817                      break;
1818
1819                  case 'b':
1820                      if (con.length == 0) return -1;
1821                      {
1822                          int after = getWordType(target, con.start, con.limit, offset, opts);
1823                          if (after == WT_IGNORE) return -1;
1824                          int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
1825                          if (after == before) return -1;
1826                      }
1827                      break;
1828
1829                  case 'B':
1830                      if (con.length == 0)
1831                          go = true;
1832                      else {
1833                          int after = getWordType(target, con.start, con.limit, offset, opts);
1834                          go = after == WT_IGNORE
1835                               || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
1836                      }
1837                      if (!go) return -1;
1838                      break;
1839
1840                  case '<':
1841                      if (con.length == 0 || offset == con.limit) return -1;
1842                      if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
1843                          || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
1844                          return -1;
1845                      break;
1846
1847                  case '>':
1848                      if (con.length == 0 || offset == con.start) return -1;
1849                      if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
1850                          || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
1851                          return -1;
1852                      break;
1853                  } // switch anchor type
1854
op = op.next;
1855                  break;
1856
1857              case Op.BACKREFERENCE:
1858                  {
1859                      int refno = op.getData();
1860                      if (refno <= 0 || refno >= this.nofparen)
1861                          throw new RuntimeException JavaDoc("Internal Error: Reference number must be more than zero: "+refno);
1862                      if (con.match.getBeginning(refno) < 0
1863                          || con.match.getEnd(refno) < 0)
1864                          return -1; // ********
1865
int o2 = con.match.getBeginning(refno);
1866                      int literallen = con.match.getEnd(refno)-o2;
1867                      if (!isSet(opts, IGNORE_CASE)) {
1868                          if (dx > 0) {
1869                              if (!regionMatches(target, offset, con.limit, o2, literallen))
1870                                  return -1;
1871                              offset += literallen;
1872                          } else {
1873                              if (!regionMatches(target, offset-literallen, con.limit, o2, literallen))
1874                                  return -1;
1875                              offset -= literallen;
1876                          }
1877                      } else {
1878                          if (dx > 0) {
1879                              if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen))
1880                                  return -1;
1881                              offset += literallen;
1882                          } else {
1883                              if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
1884                                                           o2, literallen))
1885                                  return -1;
1886                              offset -= literallen;
1887                          }
1888                      }
1889                  }
1890                  op = op.next;
1891                  break;
1892              case Op.STRING:
1893                  {
1894                      String JavaDoc literal = op.getString();
1895                      int literallen = literal.length();
1896                      if (!isSet(opts, IGNORE_CASE)) {
1897                          if (dx > 0) {
1898                              if (!regionMatches(target, offset, con.limit, literal, literallen))
1899                                  return -1;
1900                              offset += literallen;
1901                          } else {
1902                              if (!regionMatches(target, offset-literallen, con.limit, literal, literallen))
1903                                  return -1;
1904                              offset -= literallen;
1905                          }
1906                      } else {
1907                          if (dx > 0) {
1908                              if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen))
1909                                  return -1;
1910                              offset += literallen;
1911                          } else {
1912                              if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
1913                                                           literal, literallen))
1914                                  return -1;
1915                              offset -= literallen;
1916                          }
1917                      }
1918                  }
1919                  op = op.next;
1920                  break;
1921
1922              case Op.CLOSURE:
1923                  {
1924                      /*
1925                       * Saves current position to avoid
1926                       * zero-width repeats.
1927                       */

1928                      int id = op.getData();
1929                      if (id >= 0) {
1930                          int previousOffset = con.offsets[id];
1931                          if (previousOffset < 0 || previousOffset != offset) {
1932                              con.offsets[id] = offset;
1933                          } else {
1934                              con.offsets[id] = -1;
1935                              op = op.next;
1936                              break;
1937                          }
1938                      }
1939
1940                      int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
1941                      if (id >= 0) con.offsets[id] = -1;
1942                      if (ret >= 0) return ret;
1943                      op = op.next;
1944                  }
1945                  break;
1946
1947              case Op.QUESTION:
1948                  {
1949                      int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
1950                      if (ret >= 0) return ret;
1951                      op = op.next;
1952                  }
1953                  break;
1954
1955              case Op.NONGREEDYCLOSURE:
1956              case Op.NONGREEDYQUESTION:
1957                  {
1958                      int ret = this. matchCharArray (con, op.next, offset, dx, opts);
1959                      if (ret >= 0) return ret;
1960                      op = op.getChild();
1961                  }
1962                  break;
1963
1964              case Op.UNION:
1965                  for (int i = 0; i < op.size(); i ++) {
1966                      int ret = this. matchCharArray (con, op.elementAt(i), offset, dx, opts);
1967                      if (DEBUG) {
1968                          System.err.println("UNION: "+i+", ret="+ret);
1969                      }
1970                      if (ret >= 0) return ret;
1971                  }
1972                  return -1;
1973
1974              case Op.CAPTURE:
1975                  int refno = op.getData();
1976                  if (con.match != null && refno > 0) {
1977                      int save = con.match.getBeginning(refno);
1978                      con.match.setBeginning(refno, offset);
1979                      int ret = this. matchCharArray (con, op.next, offset, dx, opts);
1980                      if (ret < 0) con.match.setBeginning(refno, save);
1981                      return ret;
1982                  } else if (con.match != null && refno < 0) {
1983                      int index = -refno;
1984                      int save = con.match.getEnd(index);
1985                      con.match.setEnd(index, offset);
1986                      int ret = this. matchCharArray (con, op.next, offset, dx, opts);
1987                      if (ret < 0) con.match.setEnd(index, save);
1988                      return ret;
1989                  }
1990                  op = op.next;
1991                  break;
1992
1993              case Op.LOOKAHEAD:
1994                  if (0 > this. matchCharArray (con, op.getChild(), offset, 1, opts)) return -1;
1995                  op = op.next;
1996                  break;
1997              case Op.NEGATIVELOOKAHEAD:
1998                  if (0 <= this. matchCharArray (con, op.getChild(), offset, 1, opts)) return -1;
1999                  op = op.next;
2000                  break;
2001              case Op.LOOKBEHIND:
2002                  if (0 > this. matchCharArray (con, op.getChild(), offset, -1, opts)) return -1;
2003                  op = op.next;
2004                  break;
2005              case Op.NEGATIVELOOKBEHIND:
2006                  if (0 <= this. matchCharArray (con, op.getChild(), offset, -1, opts)) return -1;
2007                  op = op.next;
2008                  break;
2009
2010              case Op.INDEPENDENT:
2011                  {
2012                      int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
2013                      if (ret < 0) return ret;
2014                      offset = ret;
2015                      op = op.next;
2016                  }
2017                  break;
2018
2019              case Op.MODIFIER:
2020                  {
2021                      int localopts = opts;
2022                      localopts |= op.getData();
2023                      localopts &= ~op.getData2();
2024                      //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16));
2025
int ret = this. matchCharArray (con, op.getChild(), offset, dx, localopts);
2026                      if (ret < 0) return ret;
2027                      offset = ret;
2028                      op = op.next;
2029                  }
2030                  break;
2031
2032              case Op.CONDITION:
2033                  {
2034                      Op.ConditionOp cop = (Op.ConditionOp)op;
2035                      boolean matchp = false;
2036                      if (cop.refNumber > 0) {
2037                          if (cop.refNumber >= this.nofparen)
2038                              throw new RuntimeException JavaDoc("Internal Error: Reference number must be more than zero: "+cop.refNumber);
2039                          matchp = con.match.getBeginning(cop.refNumber) >= 0
2040                                   && con.match.getEnd(cop.refNumber) >= 0;
2041                      } else {
2042                          matchp = 0 <= this. matchCharArray (con, cop.condition, offset, dx, opts);
2043                      }
2044
2045                      if (matchp) {
2046                          op = cop.yes;
2047                      } else if (cop.no != null) {
2048                          op = cop.no;
2049                      } else {
2050                          op = cop.next;
2051                      }
2052                  }
2053                  break;
2054
2055              default:
2056                  throw new RuntimeException JavaDoc("Unknown operation type: "+op.type);
2057              } // switch (op.type)
2058
} // while
2059
}
2060
2061      private static final int getPreviousWordType(char[] target, int begin, int end,
2062                                                   int offset, int opts) {
2063          int ret = getWordType(target, begin, end, --offset, opts);
2064          while (ret == WT_IGNORE)
2065              ret = getWordType(target, begin, end, --offset, opts);
2066          return ret;
2067      }
2068
2069      private static final int getWordType(char[] target, int begin, int end,
2070                                           int offset, int opts) {
2071          if (offset < begin || offset >= end) return WT_OTHER;
2072          return getWordType0( target [ offset ] , opts);
2073      }
2074
2075
2076
2077      private static final boolean regionMatches(char[] target, int offset, int limit,
2078                                                 String JavaDoc part, int partlen) {
2079          if (offset < 0) return false;
2080          if (limit-offset < partlen)
2081              return false;
2082          int i = 0;
2083          while (partlen-- > 0) {
2084              if ( target [ offset++ ] != part.charAt(i++))
2085                  return false;
2086          }
2087          return true;
2088      }
2089
2090      private static final boolean regionMatches(char[] target, int offset, int limit,
2091                                                 int offset2, int partlen) {
2092          if (offset < 0) return false;
2093          if (limit-offset < partlen)
2094              return false;
2095          int i = offset2;
2096          while (partlen-- > 0) {
2097              if ( target [ offset++ ] != target [ i++ ] )
2098                  return false;
2099          }
2100          return true;
2101      }
2102
2103  /**
2104   * @see java.lang.String#regionMatches
2105   */

2106      private static final boolean regionMatchesIgnoreCase(char[] target, int offset, int limit,
2107                                                           String JavaDoc part, int partlen) {
2108          if (offset < 0) return false;
2109          if (limit-offset < partlen)
2110              return false;
2111          int i = 0;
2112          while (partlen-- > 0) {
2113              char ch1 = target [ offset++ ] ;
2114              char ch2 = part.charAt(i++);
2115              if (ch1 == ch2)
2116                  continue;
2117              char uch1 = Character.toUpperCase(ch1);
2118              char uch2 = Character.toUpperCase(ch2);
2119              if (uch1 == uch2)
2120                  continue;
2121              if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
2122                  return false;
2123          }
2124          return true;
2125      }
2126
2127      private static final boolean regionMatchesIgnoreCase(char[] target, int offset, int limit,
2128                                                           int offset2, int partlen) {
2129          if (offset < 0) return false;
2130          if (limit-offset < partlen)
2131              return false;
2132          int i = offset2;
2133          while (partlen-- > 0) {
2134              char ch1 = target [ offset++ ] ;
2135              char ch2 = target [ i++ ] ;
2136              if (ch1 == ch2)
2137                  continue;
2138              char uch1 = Character.toUpperCase(ch1);
2139              char uch2 = Character.toUpperCase(ch2);
2140              if (uch1 == uch2)
2141                  continue;
2142              if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
2143                  return false;
2144          }
2145          return true;
2146      }
2147
2148
2149
2150
2151      /**
2152       * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
2153       *
2154       * @return true if the target is matched to this regular expression.
2155       */

2156      public boolean matches(String JavaDoc target) {
2157          return this.matches(target, 0, target .length() , (Match)null);
2158      }
2159
2160      /**
2161       * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
2162       * in specified range or not.
2163       *
2164       * @param start Start offset of the range.
2165       * @param end End offset +1 of the range.
2166       * @return true if the target is matched to this regular expression.
2167       */

2168      public boolean matches(String JavaDoc target, int start, int end) {
2169          return this.matches(target, start, end, (Match)null);
2170      }
2171
2172      /**
2173       * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
2174       *
2175       * @param match A Match instance for storing matching result.
2176       * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
2177       */

2178      public boolean matches(String JavaDoc target, Match match) {
2179          return this.matches(target, 0, target .length() , match);
2180      }
2181
2182      /**
2183       * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
2184       * in specified range or not.
2185       *
2186       * @param start Start offset of the range.
2187       * @param end End offset +1 of the range.
2188       * @param match A Match instance for storing matching result.
2189       * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
2190       */

2191      public boolean matches(String JavaDoc target, int start, int end, Match match) {
2192
2193          synchronized (this) {
2194              if (this.operations == null)
2195                  this.prepare();
2196              if (this.context == null)
2197                  this.context = new Context();
2198          }
2199          Context con = null;
2200          synchronized (this.context) {
2201              con = this.context.inuse ? new Context() : this.context;
2202              con.reset(target, start, end, this.numberOfClosures);
2203          }
2204          if (match != null) {
2205              match.setNumberOfGroups(this.nofparen);
2206              match.setSource(target);
2207          } else if (this.hasBackReferences) {
2208              match = new Match();
2209              match.setNumberOfGroups(this.nofparen);
2210              // Need not to call setSource() because
2211
// a caller can not access this match instance.
2212
}
2213          con.match = match;
2214
2215          if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
2216              if (DEBUG) {
2217                  System.err.println("target string="+target);
2218              }
2219              int matchEnd = this. matchString (con, this.operations, con.start, 1, this.options);
2220              if (DEBUG) {
2221                  System.err.println("matchEnd="+matchEnd);
2222                  System.err.println("con.limit="+con.limit);
2223              }
2224              if (matchEnd == con.limit) {
2225                  if (con.match != null) {
2226                      con.match.setBeginning(0, con.start);
2227                      con.match.setEnd(0, matchEnd);
2228                  }
2229                  con.inuse = false;
2230                  return true;
2231              }
2232              return false;
2233          }
2234
2235          /*
2236           * The pattern has only fixed string.
2237           * The engine uses Boyer-Moore.
2238           */

2239          if (this.fixedStringOnly) {
2240              //System.err.println("DEBUG: fixed-only: "+this.fixedString);
2241
int o = this.fixedStringTable.matches(target, con.start, con.limit);
2242              if (o >= 0) {
2243                  if (con.match != null) {
2244                      con.match.setBeginning(0, o);
2245                      con.match.setEnd(0, o+this.fixedString.length());
2246                  }
2247                  con.inuse = false;
2248                  return true;
2249              }
2250              con.inuse = false;
2251              return false;
2252          }
2253
2254          /*
2255           * The pattern contains a fixed string.
2256           * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
2257           * If not, it return with false.
2258           */

2259          if (this.fixedString != null) {
2260              int o = this.fixedStringTable.matches(target, con.start, con.limit);
2261              if (o < 0) {
2262                  //System.err.println("Non-match in fixed-string search.");
2263
con.inuse = false;
2264                  return false;
2265              }
2266          }
2267
2268          int limit = con.limit-this.minlength;
2269          int matchStart;
2270          int matchEnd = -1;
2271
2272          /*
2273           * Checks whether the expression starts with ".*".
2274           */

2275          if (this.operations != null
2276              && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
2277              if (isSet(this.options, SINGLE_LINE)) {
2278                  matchStart = con.start;
2279                  matchEnd = this. matchString (con, this.operations, con.start, 1, this.options);
2280              } else {
2281                  boolean previousIsEOL = true;
2282                  for (matchStart = con.start; matchStart <= limit; matchStart ++) {
2283                      int ch = target .charAt( matchStart ) ;
2284                      if (isEOLChar(ch)) {
2285                          previousIsEOL = true;
2286                      } else {
2287                          if (previousIsEOL) {
2288                              if (0 <= (matchEnd = this. matchString (con, this.operations,
2289                                                                      matchStart, 1, this.options)))
2290                                  break;
2291                          }
2292                          previousIsEOL = false;
2293                      }
2294                  }
2295              }
2296          }
2297
2298          /*
2299           * Optimization against the first character.
2300           */

2301          else if (this.firstChar != null) {
2302              //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
2303
RangeToken range = this.firstChar;
2304              if (RegularExpression.isSet(this.options, IGNORE_CASE)) {
2305                  range = this.firstChar.getCaseInsensitiveToken();
2306                  for (matchStart = con.start; matchStart <= limit; matchStart ++) {
2307                      int ch = target .charAt( matchStart ) ;
2308                      if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
2309                          ch = REUtil.composeFromSurrogates(ch, target .charAt( matchStart+1 ) );
2310                          if (!range.match(ch)) continue;
2311                      } else {
2312                          if (!range.match(ch)) {
2313                              char ch1 = Character.toUpperCase((char)ch);
2314                              if (!range.match(ch1))
2315                                  if (!range.match(Character.toLowerCase(ch1)))
2316                                      continue;
2317                          }
2318                      }
2319                      if (0 <= (matchEnd = this. matchString (con, this.operations,
2320                                                              matchStart, 1, this.options)))
2321                          break;
2322                  }
2323              } else {
2324                  for (matchStart = con.start; matchStart <= limit; matchStart ++) {
2325                      int ch = target .charAt( matchStart ) ;
2326                      if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
2327                          ch = REUtil.composeFromSurrogates(ch, target .charAt( matchStart+1 ) );
2328                      if (!range.match(ch)) continue;
2329                      if (0 <= (matchEnd = this. matchString (con, this.operations,
2330                                                              matchStart, 1, this.options)))
2331                          break;
2332                  }
2333              }
2334          }
2335
2336          /*
2337           * Straightforward matching.
2338           */

2339          else {
2340              for (matchStart = con.start; matchStart <= limit; matchStart ++) {
2341                  if (0 <= (matchEnd = this. matchString (con, this.operations, matchStart, 1, this.options)))
2342                      break;
2343              }
2344          }
2345
2346          if (matchEnd >= 0) {
2347              if (con.match != null) {
2348                  con.match.setBeginning(0, matchStart);
2349                  con.match.setEnd(0, matchEnd);
2350              }
2351              con.inuse = false;
2352              return true;
2353          } else {
2354              con.inuse = false;
2355              return false;
2356          }
2357      }
2358
2359      /**
2360       * @return -1 when not match; offset of the end of matched string when match.
2361       */

2362      private int matchString (Context con, Op op, int offset, int dx, int opts) {
2363
2364
2365
2366
2367          String JavaDoc target = con.strTarget;
2368
2369
2370
2371
2372          while (true) {
2373              if (op == null)
2374                  return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
2375              if (offset > con.limit || offset < con.start)
2376                  return -1;
2377              switch (op.type) {
2378              case Op.CHAR:
2379                  if (isSet(opts, IGNORE_CASE)) {
2380                      int ch = op.getData();
2381                      if (dx > 0) {
2382                          if (offset >= con.limit || !matchIgnoreCase(ch, target .charAt( offset ) ))
2383                              return -1;
2384                          offset ++;
2385                      } else {
2386                          int o1 = offset-1;
2387                          if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch, target .charAt( o1 ) ))
2388                              return -1;
2389                          offset = o1;
2390                      }
2391                  } else {
2392                      int ch = op.getData();
2393                      if (dx > 0) {
2394                          if (offset >= con.limit || ch != target .charAt( offset ) )
2395                              return -1;
2396                          offset ++;
2397                      } else {
2398                          int o1 = offset-1;
2399                          if (o1 >= con.limit || o1 < 0 || ch != target .charAt( o1 ) )
2400                              return -1;
2401                          offset = o1;
2402                      }
2403                  }
2404                  op = op.next;
2405                  break;
2406
2407              case Op.DOT:
2408                  if (dx > 0) {
2409                      if (offset >= con.limit)
2410                          return -1;
2411                      int ch = target .charAt( offset ) ;
2412                      if (isSet(opts, SINGLE_LINE)) {
2413                          if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
2414                              offset ++;
2415                      } else {
2416                          if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
2417                              ch = REUtil.composeFromSurrogates(ch, target .charAt( ++offset ) );
2418                          if (isEOLChar(ch))
2419                              return -1;
2420                      }
2421                      offset ++;
2422                  } else {
2423                      int o1 = offset-1;
2424                      if (o1 >= con.limit || o1 < 0)
2425                          return -1;
2426                      int ch = target .charAt( o1 ) ;
2427                      if (isSet(opts, SINGLE_LINE)) {
2428                          if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
2429                              o1 --;
2430                      } else {
2431                          if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
2432                              ch = REUtil.composeFromSurrogates( target .charAt( --o1 ) , ch);
2433                          if (!isEOLChar(ch))
2434                              return -1;
2435                      }
2436                      offset = o1;
2437                  }
2438                  op = op.next;
2439                  break;
2440
2441              case Op.RANGE:
2442              case Op.NRANGE:
2443                  if (dx > 0) {
2444                      if (offset >= con.limit)
2445                          return -1;
2446                      int ch = target .charAt( offset ) ;
2447                      if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
2448                          ch = REUtil.composeFromSurrogates(ch, target .charAt( ++offset ) );
2449                      RangeToken tok = op.getToken();
2450                      if (isSet(opts, IGNORE_CASE)) {
2451                          tok = tok.getCaseInsensitiveToken();
2452                          if (!tok.match(ch)) {
2453                              if (ch >= 0x10000) return -1;
2454                              char uch;
2455                              if (!tok.match(uch = Character.toUpperCase((char)ch))
2456                                  && !tok.match(Character.toLowerCase(uch)))
2457                                  return -1;
2458                          }
2459                      } else {
2460                          if (!tok.match(ch)) return -1;
2461                      }
2462                      offset ++;
2463                  } else {
2464                      int o1 = offset-1;
2465                      if (o1 >= con.limit || o1 < 0)
2466                          return -1;
2467                      int ch = target .charAt( o1 ) ;
2468                      if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
2469                          ch = REUtil.composeFromSurrogates( target .charAt( --o1 ) , ch);
2470                      RangeToken tok = op.getToken();
2471                      if (isSet(opts, IGNORE_CASE)) {
2472                          tok = tok.getCaseInsensitiveToken();
2473                          if (!tok.match(ch)) {
2474                              if (ch >= 0x10000) return -1;
2475                              char uch;
2476                              if (!tok.match(uch = Character.toUpperCase((char)ch))
2477                                  && !tok.match(Character.toLowerCase(uch)))
2478                                  return -1;
2479                          }
2480                      } else {
2481                          if (!tok.match(ch)) return -1;
2482                      }
2483                      offset = o1;
2484                  }
2485                  op = op.next;
2486                  break;
2487
2488              case Op.ANCHOR:
2489                  boolean go = false;
2490                  switch (op.getData()) {
2491                  case '^':
2492                      if (isSet(opts, MULTIPLE_LINES)) {
2493                          if (!(offset == con.start
2494                                || offset > con.start && isEOLChar( target .charAt( offset-1 ) )))
2495                              return -1;
2496                      } else {
2497                          if (offset != con.start)
2498                              return -1;
2499                      }
2500                      break;
2501
2502                  case '@': // Internal use only.
2503
// The @ always matches line beginnings.
2504
if (!(offset == con.start
2505                            || offset > con.start && isEOLChar( target .charAt( offset-1 ) )))
2506                          return -1;
2507                      break;
2508
2509                  case '$':
2510                      if (isSet(opts, MULTIPLE_LINES)) {
2511                          if (!(offset == con.limit
2512                                || offset < con.limit && isEOLChar( target .charAt( offset ) )))
2513                              return -1;
2514                      } else {
2515                          if (!(offset == con.limit
2516                                || offset+1 == con.limit && isEOLChar( target .charAt( offset ) )
2517                                || offset+2 == con.limit && target .charAt( offset ) == CARRIAGE_RETURN
2518                                && target .charAt( offset+1 ) == LINE_FEED))
2519                              return -1;
2520                      }
2521                      break;
2522
2523                  case 'A':
2524                      if (offset != con.start) return -1;
2525                      break;
2526
2527                  case 'Z':
2528                      if (!(offset == con.limit
2529                            || offset+1 == con.limit && isEOLChar( target .charAt( offset ) )
2530                            || offset+2 == con.limit && target .charAt( offset ) == CARRIAGE_RETURN
2531                            && target .charAt( offset+1 ) == LINE_FEED))
2532                          return -1;
2533                      break;
2534
2535                  case 'z':
2536                      if (offset != con.limit) return -1;
2537                      break;
2538
2539                  case 'b':
2540                      if (con.length == 0) return -1;
2541                      {
2542                          int after = getWordType(target, con.start, con.limit, offset, opts);
2543                          if (after == WT_IGNORE) return -1;
2544                          int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
2545                          if (after == before) return -1;
2546                      }
2547                      break;
2548
2549                  case 'B':
2550                      if (con.length == 0)
2551                          go = true;
2552                      else {
2553                          int after = getWordType(target, con.start, con.limit, offset, opts);
2554                          go = after == WT_IGNORE
2555                               || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
2556                      }
2557                      if (!go) return -1;
2558                      break;
2559
2560                  case '<':
2561                      if (con.length == 0 || offset == con.limit) return -1;
2562                      if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
2563                          || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
2564                          return -1;
2565                      break;
2566
2567                  case '>':
2568                      if (con.length == 0 || offset == con.start) return -1;
2569                      if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
2570                          || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
2571                          return -1;
2572                      break;
2573                  } // switch anchor type
2574
op = op.next;
2575                  break;
2576
2577              case Op.BACKREFERENCE:
2578                  {
2579                      int refno = op.getData();
2580                      if (refno <= 0 || refno >= this.nofparen)
2581                          throw new RuntimeException JavaDoc("Internal Error: Reference number must be more than zero: "+refno);
2582                      if (con.match.getBeginning(refno) < 0
2583                          || con.match.getEnd(refno) < 0)
2584                          return -1; // ********
2585
int o2 = con.match.getBeginning(refno);
2586                      int literallen = con.match.getEnd(refno)-o2;
2587                      if (!isSet(opts, IGNORE_CASE)) {
2588                          if (dx > 0) {
2589                              if (!regionMatches(target, offset, con.limit, o2, literallen))
2590                                  return -1;
2591                              offset += literallen;
2592                          } else {
2593                              if (!regionMatches(target, offset-literallen, con.limit, o2, literallen))
2594                                  return -1;
2595                              offset -= literallen;
2596                          }
2597                      } else {
2598                          if (dx > 0) {
2599                              if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen))
2600                                  return -1;
2601                              offset += literallen;
2602                          } else {
2603                              if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
2604                                                           o2, literallen))
2605                                  return -1;
2606                              offset -= literallen;
2607                          }
2608                      }
2609                  }
2610                  op = op.next;
2611                  break;
2612              case Op.STRING:
2613                  {
2614                      String JavaDoc literal = op.getString();
2615                      int literallen = literal.length();
2616                      if (!isSet(opts, IGNORE_CASE)) {
2617                          if (dx > 0) {
2618                              if (!regionMatches(target, offset, con.limit, literal, literallen))
2619                                  return -1;
2620                              offset += literallen;
2621                          } else {
2622                              if (!regionMatches(target, offset-literallen, con.limit, literal, literallen))
2623                                  return -1;
2624                              offset -= literallen;
2625                          }
2626                      } else {
2627                          if (dx > 0) {
2628                              if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen))
2629                                  return -1;
2630                              offset += literallen;
2631                          } else {
2632                              if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
2633                                                           literal, literallen))
2634                                  return -1;
2635                              offset -= literallen;
2636                          }
2637                      }
2638                  }
2639                  op = op.next;
2640                  break;
2641
2642              case Op.CLOSURE:
2643                  {
2644                      /*
2645                       * Saves current position to avoid
2646                       * zero-width repeats.
2647                       */

2648                      int id = op.getData();
2649                      if (id >= 0) {
2650                          int previousOffset = con.offsets[id];
2651                          if (previousOffset < 0 || previousOffset != offset) {
2652                              con.offsets[id] = offset;
2653                          } else {
2654                              con.offsets[id] = -1;
2655                              op = op.next;
2656                              break;
2657                          }
2658                      }
2659                      int ret = this. matchString (con, op.getChild(), offset, dx, opts);
2660                      if (id >= 0) con.offsets[id] = -1;
2661                      if (ret >= 0) return ret;
2662                      op = op.next;
2663                  }
2664                  break;
2665
2666              case Op.QUESTION:
2667                  {
2668                      int ret = this. matchString (con, op.getChild(), offset, dx, opts);
2669                      if (ret >= 0) return ret;
2670                      op = op.next;
2671                  }
2672                  break;
2673
2674              case Op.NONGREEDYCLOSURE:
2675              case Op.NONGREEDYQUESTION:
2676                  {
2677                      int ret = this. matchString (con, op.next, offset, dx, opts);
2678                      if (ret >= 0) return ret;
2679                      op = op.getChild();
2680                  }
2681                  break;
2682
2683              case Op.UNION:
2684                  for (int i = 0; i < op.size(); i ++) {
2685                      int ret = this. matchString (con, op.elementAt(i), offset, dx, opts);
2686                      if (DEBUG) {
2687                          System.err.println("UNION: "+i+", ret="+ret);
2688                      }
2689                      if (ret >= 0) return ret;
2690                  }
2691                  return -1;
2692
2693              case Op.CAPTURE:
2694                  int refno = op.getData();
2695                  if (con.match != null && refno > 0) {
2696                      int save = con.match.getBeginning(refno);
2697                      con.match.setBeginning(refno, offset);
2698                      int ret = this. matchString (con, op.next, offset, dx, opts);
2699                      if (ret < 0) con.match.setBeginning(refno, save);
2700                      return ret;
2701                  } else if (con.match != null && refno < 0) {
2702                      int index = -refno;
2703                      int save = con.match.getEnd(index);
2704                      con.match.setEnd(index, offset);
2705                      int ret = this. matchString (con, op.next, offset, dx, opts);
2706                      if (ret < 0) con.match.setEnd(index, save);
2707                      return ret;
2708                  }
2709                  op = op.next;
2710                  break;
2711
2712              case Op.LOOKAHEAD:
2713                  if (0 > this. matchString (con, op.getChild(), offset, 1, opts)) return -1;
2714                  op = op.next;
2715                  break;
2716              case Op.NEGATIVELOOKAHEAD:
2717                  if (0 <= this. matchString (con, op.getChild(), offset, 1, opts)) return -1;
2718                  op = op.next;
2719                  break;
2720              case Op.LOOKBEHIND:
2721                  if (0 > this. matchString (con, op.getChild(), offset, -1, opts)) return -1;
2722                  op = op.next;
2723                  break;
2724              case Op.NEGATIVELOOKBEHIND:
2725                  if (0 <= this. matchString (con, op.getChild(), offset, -1, opts)) return -1;
2726                  op = op.next;
2727                  break;
2728
2729              case Op.INDEPENDENT:
2730                  {
2731                      int ret = this. matchString (con, op.getChild(), offset, dx, opts);
2732                      if (ret < 0) return ret;
2733                      offset = ret;
2734                      op = op.next;
2735                  }
2736                  break;
2737
2738              case Op.MODIFIER:
2739                  {
2740                      int localopts = opts;
2741                      localopts |= op.getData();
2742                      localopts &= ~op.getData2();
2743                      //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16));
2744
int ret = this. matchString (con, op.getChild(), offset, dx, localopts);
2745                      if (ret < 0) return ret;
2746                      offset = ret;
2747                      op = op.next;
2748                  }
2749                  break;
2750
2751              case Op.CONDITION:
2752                  {
2753                      Op.ConditionOp cop = (Op.ConditionOp)op;
2754                      boolean matchp = false;
2755                      if (cop.refNumber > 0) {
2756                          if (cop.refNumber >= this.nofparen)
2757                              throw new RuntimeException JavaDoc("Internal Error: Reference number must be more than zero: "+cop.refNumber);
2758                          matchp = con.match.getBeginning(cop.refNumber) >= 0
2759                                   && con.match.getEnd(cop.refNumber) >= 0;
2760                      } else {
2761                          matchp = 0 <= this. matchString (con, cop.condition, offset, dx, opts);
2762                      }
2763
2764                      if (matchp) {
2765                          op = cop.yes;
2766                      } else if (cop.no != null) {
2767                          op = cop.no;
2768                      } else {
2769                          op = cop.next;
2770                      }
2771                  }
2772                  break;
2773
2774              default:
2775                  throw new RuntimeException JavaDoc("Unknown operation type: "+op.type);
2776              } // switch (op.type)
2777
} // while
2778
}
2779
2780      private static final int getPreviousWordType(String JavaDoc target, int begin, int end,
2781                                                   int offset, int opts) {
2782          int ret = getWordType(target, begin, end, --offset, opts);
2783          while (ret == WT_IGNORE)
2784              ret = getWordType(target, begin, end, --offset, opts);
2785          return ret;
2786      }
2787
2788      private static final int getWordType(String JavaDoc target, int begin, int end,
2789                                           int offset, int opts) {
2790          if (offset < begin || offset >= end) return WT_OTHER;
2791          return getWordType0( target .charAt( offset ) , opts);
2792      }
2793
2794
2795      private static final boolean regionMatches(String JavaDoc text, int offset, int limit,
2796                                                 String JavaDoc part, int partlen) {
2797          if (limit-offset < partlen) return false;
2798          return text.regionMatches(offset, part, 0, partlen);
2799      }
2800
2801      private static final boolean regionMatches(String JavaDoc text, int offset, int limit,
2802                                                 int offset2, int partlen) {
2803          if (limit-offset < partlen) return false;
2804          return text.regionMatches(offset, text, offset2, partlen);
2805      }
2806
2807      private static final boolean regionMatchesIgnoreCase(String JavaDoc text, int offset, int limit,
2808                                                           String JavaDoc part, int partlen) {
2809          return text.regionMatches(true, offset, part, 0, partlen);
2810      }
2811
2812      private static final boolean regionMatchesIgnoreCase(String JavaDoc text, int offset, int limit,
2813                                                           int offset2, int partlen) {
2814          if (limit-offset < partlen) return false;
2815          return text.regionMatches(true, offset, text, offset2, partlen);
2816      }
2817
2818
2819
2820
2821
2822
2823
2824      /**
2825       * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
2826       *
2827       * @return true if the target is matched to this regular expression.
2828       */

2829      public boolean matches(CharacterIterator JavaDoc target) {
2830          return this.matches(target, (Match)null);
2831      }
2832
2833
2834      /**
2835       * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
2836       *
2837       * @param match A Match instance for storing matching result.
2838       * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
2839       */

2840      public boolean matches(CharacterIterator JavaDoc target, Match match) {
2841          int start = target.getBeginIndex();
2842          int end = target.getEndIndex();
2843
2844
2845
2846          synchronized (this) {
2847              if (this.operations == null)
2848                  this.prepare();
2849              if (this.context == null)
2850                  this.context = new Context();
2851          }
2852          Context con = null;
2853          synchronized (this.context) {
2854              con = this.context.inuse ? new Context() : this.context;
2855              con.reset(target, start, end, this.numberOfClosures);
2856          }
2857          if (match != null) {
2858              match.setNumberOfGroups(this.nofparen);
2859              match.setSource(target);
2860          } else if (this.hasBackReferences) {
2861              match = new Match();
2862              match.setNumberOfGroups(this.nofparen);
2863              // Need not to call setSource() because
2864
// a caller can not access this match instance.
2865
}
2866          con.match = match;
2867
2868          if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
2869              int matchEnd = this. matchCharacterIterator (con, this.operations, con.start, 1, this.options);
2870              //System.err.println("DEBUG: matchEnd="+matchEnd);
2871
if (matchEnd == con.limit) {
2872                  if (con.match != null) {
2873                      con.match.setBeginning(0, con.start);
2874                      con.match.setEnd(0, matchEnd);
2875                  }
2876                  con.inuse = false;
2877                  return true;
2878              }
2879              return false;
2880          }
2881
2882          /*
2883           * The pattern has only fixed string.
2884           * The engine uses Boyer-Moore.
2885           */

2886          if (this.fixedStringOnly) {
2887              //System.err.println("DEBUG: fixed-only: "+this.fixedString);
2888
int o = this.fixedStringTable.matches(target, con.start, con.limit);
2889              if (o >= 0) {
2890                  if (con.match != null) {
2891                      con.match.setBeginning(0, o);
2892                      con.match.setEnd(0, o+this.fixedString.length());
2893                  }
2894                  con.inuse = false;
2895                  return true;
2896              }
2897              con.inuse = false;
2898              return false;
2899          }
2900
2901          /*
2902           * The pattern contains a fixed string.
2903           * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
2904           * If not, it return with false.
2905           */

2906          if (this.fixedString != null) {
2907              int o = this.fixedStringTable.matches(target, con.start, con.limit);
2908              if (o < 0) {
2909                  //System.err.println("Non-match in fixed-string search.");
2910
con.inuse = false;
2911                  return false;
2912              }
2913          }
2914
2915          int limit = con.limit-this.minlength;
2916          int matchStart;
2917          int matchEnd = -1;
2918
2919          /*
2920           * Checks whether the expression starts with ".*".
2921           */

2922          if (this.operations != null
2923              && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
2924              if (isSet(this.options, SINGLE_LINE)) {
2925                  matchStart = con.start;
2926                  matchEnd = this. matchCharacterIterator (con, this.operations, con.start, 1, this.options);
2927              } else {
2928                  boolean previousIsEOL = true;
2929                  for (matchStart = con.start; matchStart <= limit; matchStart ++) {
2930                      int ch = target .setIndex( matchStart ) ;
2931                      if (isEOLChar(ch)) {
2932                          previousIsEOL = true;
2933                      } else {
2934                          if (previousIsEOL) {
2935                              if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations,
2936                                                                                 matchStart, 1, this.options)))
2937                                  break;
2938                          }
2939                          previousIsEOL = false;
2940                      }
2941                  }
2942              }
2943          }
2944
2945          /*
2946           * Optimization against the first character.
2947           */

2948          else if (this.firstChar != null) {
2949              //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
2950
RangeToken range = this.firstChar;
2951              if (RegularExpression.isSet(this.options, IGNORE_CASE)) {
2952                  range = this.firstChar.getCaseInsensitiveToken();
2953                  for (matchStart = con.start; matchStart <= limit; matchStart ++) {
2954                      int ch = target .setIndex( matchStart ) ;
2955                      if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
2956                          ch = REUtil.composeFromSurrogates(ch, target .setIndex( matchStart+1 ) );
2957                          if (!range.match(ch)) continue;
2958                      } else {
2959                          if (!range.match(ch)) {
2960                              char ch1 = Character.toUpperCase((char)ch);
2961                              if (!range.match(ch1))
2962                                  if (!range.match(Character.toLowerCase(ch1)))
2963                                      continue;
2964                          }
2965                      }
2966                      if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations,
2967                                                                         matchStart, 1, this.options)))
2968                          break;
2969                  }
2970              } else {
2971                  for (matchStart = con.start; matchStart <= limit; matchStart ++) {
2972                      int ch = target .setIndex( matchStart ) ;
2973                      if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
2974                          ch = REUtil.composeFromSurrogates(ch, target .setIndex( matchStart+1 ) );
2975                      if (!range.match(ch)) continue;
2976                      if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations,
2977                                                                         matchStart, 1, this.options)))
2978                          break;
2979                  }
2980              }
2981          }
2982
2983          /*
2984           * Straightforward matching.
2985           */

2986          else {
2987              for (matchStart = con.start; matchStart <= limit; matchStart ++) {
2988                  if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations, matchStart, 1, this.options)))
2989                      break;
2990              }
2991          }
2992
2993          if (matchEnd >= 0) {
2994              if (con.match != null) {
2995                  con.match.setBeginning(0, matchStart);
2996                  con.match.setEnd(0, matchEnd);
2997              }
2998              con.inuse = false;
2999              return true;
3000          } else {
3001              con.inuse = false;
3002              return false;
3003          }
3004      }
3005
3006      /**
3007       * @return -1 when not match; offset of the end of matched string when match.
3008       */

3009      private int matchCharacterIterator (Context con, Op op, int offset, int dx, int opts) {
3010
3011
3012          CharacterIterator JavaDoc target = con.ciTarget;
3013
3014
3015
3016
3017
3018
3019          while (true) {
3020              if (op == null)
3021                  return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
3022              if (offset > con.limit || offset < con.start)
3023                  return -1;
3024              switch (op.type) {
3025              case Op.CHAR:
3026                  if (isSet(opts, IGNORE_CASE)) {
3027                      int ch = op.getData();
3028                      if (dx > 0) {
3029                          if (offset >= con.limit || !matchIgnoreCase(ch, target .setIndex( offset ) ))
3030                              return -1;
3031                          offset ++;
3032                      } else {
3033                          int o1 = offset-1;
3034                          if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch, target .setIndex( o1 ) ))
3035                              return -1;
3036                          offset = o1;
3037                      }
3038                  } else {
3039                      int ch = op.getData();
3040                      if (dx > 0) {
3041                          if (offset >= con.limit || ch != target .setIndex( offset ) )
3042                              return -1;
3043                          offset ++;
3044                      } else {
3045                          int o1 = offset-1;
3046                          if (o1 >= con.limit || o1 < 0 || ch != target .setIndex( o1 ) )
3047                              return -1;
3048                          offset = o1;
3049                      }
3050                  }
3051                  op = op.next;
3052                  break;
3053
3054              case Op.DOT:
3055                  if (dx > 0) {
3056                      if (offset >= con.limit)
3057                          return -1;
3058                      int ch = target .setIndex( offset ) ;
3059                      if (isSet(opts, SINGLE_LINE)) {
3060                          if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
3061                              offset ++;
3062                      } else {
3063                          if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
3064                              ch = REUtil.composeFromSurrogates(ch, target .setIndex( ++offset ) );
3065                          if (isEOLChar(ch))
3066                              return -1;
3067                      }
3068                      offset ++;
3069                  } else {
3070                      int o1 = offset-1;
3071                      if (o1 >= con.limit || o1 < 0)
3072                          return -1;
3073                      int ch = target .setIndex( o1 ) ;
3074                      if (isSet(opts, SINGLE_LINE)) {
3075                          if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
3076                              o1 --;
3077                      } else {
3078                          if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
3079                              ch = REUtil.composeFromSurrogates( target .setIndex( --o1 ) , ch);
3080                          if (!isEOLChar(ch))
3081                              return -1;
3082                      }
3083                      offset = o1;
3084                  }
3085                  op = op.next;
3086                  break;
3087
3088              case Op.RANGE:
3089              case Op.NRANGE:
3090                  if (dx > 0) {
3091                      if (offset >= con.limit)
3092                          return -1;
3093                      int ch = target .setIndex( offset ) ;
3094                      if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
3095                          ch = REUtil.composeFromSurrogates(ch, target .setIndex( ++offset ) );
3096                      RangeToken tok = op.getToken();
3097                      if (isSet(opts, IGNORE_CASE)) {
3098                          tok = tok.getCaseInsensitiveToken();
3099                          if (!tok.match(ch)) {
3100                              if (ch >= 0x10000) return -1;
3101                              char uch;
3102                              if (!tok.match(uch = Character.toUpperCase((char)ch))
3103                                  && !tok.match(Character.toLowerCase(uch)))
3104                                  return -1;
3105                          }
3106                      } else {
3107                          if (!tok.match(ch)) return -1;
3108                      }
3109                      offset ++;
3110                  } else {
3111                      int o1 = offset-1;
3112                      if (o1 >= con.limit || o1 < 0)
3113                          return -1;
3114                      int ch = target .setIndex( o1 ) ;
3115                      if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
3116                          ch = REUtil.composeFromSurrogates( target .setIndex( --o1 ) , ch);
3117                      RangeToken tok = op.getToken();
3118                      if (isSet(opts, IGNORE_CASE)) {
3119                          tok = tok.getCaseInsensitiveToken();
3120                          if (!tok.match(ch)) {
3121                              if (ch >= 0x10000) return -1;
3122                              char uch;
3123                              if (!tok.match(uch = Character.toUpperCase((char)ch))
3124                                  && !tok.match(Character.toLowerCase(uch)))
3125                                  return -1;
3126                          }
3127                      } else {
3128                          if (!tok.match(ch)) return -1;
3129                      }
3130                      offset = o1;
3131                  }
3132                  op = op.next;
3133                  break;
3134
3135              case Op.ANCHOR:
3136                  boolean go = false;
3137                  switch (op.getData()) {
3138                  case '^':
3139                      if (isSet(opts, MULTIPLE_LINES)) {
3140                          if (!(offset == con.start
3141                                || offset > con.start && isEOLChar( target .setIndex( offset-1 ) )))
3142                              return -1;
3143                      } else {
3144                          if (offset != con.start)
3145                              return -1;
3146                      }
3147                      break;
3148
3149                  case '@': // Internal use only.
3150
// The @ always matches line beginnings.
3151
if (!(offset == con.start
3152                            || offset > con.start && isEOLChar( target .setIndex( offset-1 ) )))
3153                          return -1;
3154                      break;
3155
3156                  case '$':
3157                      if (isSet(opts, MULTIPLE_LINES)) {
3158                          if (!(offset == con.limit
3159                                || offset < con.limit && isEOLChar( target .setIndex( offset ) )))
3160                              return -1;
3161                      } else {
3162                          if (!(offset == con.limit
3163                                || offset+1 == con.limit && isEOLChar( target .setIndex( offset ) )
3164                                || offset+2 == con.limit && target .setIndex( offset ) == CARRIAGE_RETURN
3165                                && target .setIndex( offset+1 ) == LINE_FEED))
3166                              return -1;
3167                      }
3168                      break;
3169
3170                  case 'A':
3171                      if (offset != con.start) return -1;
3172                      break;
3173
3174                  case 'Z':
3175                      if (!(offset == con.limit
3176                            || offset+1 == con.limit && isEOLChar( target .setIndex( offset ) )
3177                            || offset+2 == con.limit && target .setIndex( offset ) == CARRIAGE_RETURN
3178                            && target .setIndex( offset+1 ) == LINE_FEED))
3179                          return -1;
3180                      break;
3181
3182                  case 'z':
3183                      if (offset != con.limit) return -1;
3184                      break;
3185
3186                  case 'b':
3187                      if (con.length == 0) return -1;
3188                      {
3189                          int after = getWordType(target, con.start, con.limit, offset, opts);
3190                          if (after == WT_IGNORE) return -1;
3191                          int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
3192                          if (after == before) return -1;
3193                      }
3194                      break;
3195
3196                  case 'B':
3197                      if (con.length == 0)
3198                          go = true;
3199                      else {
3200                          int after = getWordType(target, con.start, con.limit, offset, opts);
3201                          go = after == WT_IGNORE
3202                               || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
3203                      }
3204                      if (!go) return -1;
3205                      break;
3206
3207                  case '<':
3208                      if (con.length == 0 || offset == con.limit) return -1;
3209                      if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
3210                          || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
3211                          return -1;
3212                      break;
3213
3214                  case '>':
3215                      if (con.length == 0 || offset == con.start) return -1;
3216                      if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
3217                          || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
3218                          return -1;
3219                      break;
3220                  } // switch anchor type
3221
op = op.next;
3222                  break;
3223
3224              case Op.BACKREFERENCE:
3225                  {
3226                      int refno = op.getData();
3227                      if (refno <= 0 || refno >= this.nofparen)
3228                          throw new RuntimeException JavaDoc("Internal Error: Reference number must be more than zero: "+refno);
3229                      if (con.match.getBeginning(refno) < 0
3230                          || con.match.getEnd(refno) < 0)
3231                          return -1; // ********
3232
int o2 = con.match.getBeginning(refno);
3233                      int literallen = con.match.getEnd(refno)-o2;
3234                      if (!isSet(opts, IGNORE_CASE)) {
3235                          if (dx > 0) {
3236                              if (!regionMatches(target, offset, con.limit, o2, literallen))
3237                                  return -1;
3238                              offset += literallen;
3239                          } else {
3240                              if (!regionMatches(target, offset-literallen, con.limit, o2, literallen))
3241                                  return -1;
3242                              offset -= literallen;
3243                          }
3244                      } else {
3245                          if (dx > 0) {
3246                              if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen))
3247                                  return -1;
3248                              offset += literallen;
3249                          } else {
3250                              if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
3251                                                           o2, literallen))
3252                                  return -1;
3253                              offset -= literallen;
3254                          }
3255                      }
3256                  }
3257                  op = op.next;
3258                  break;
3259              case Op.STRING:
3260                  {
3261                      String JavaDoc literal = op.getString();
3262                      int literallen = literal.length();
3263                      if (!isSet(opts, IGNORE_CASE)) {
3264                          if (dx > 0) {
3265                              if (!regionMatches(target, offset, con.limit, literal, literallen))
3266                                  return -1;
3267                              offset += literallen;
3268                          } else {
3269                              if (!regionMatches(target, offset-literallen, con.limit, literal, literallen))
3270                                  return -1;
3271                              offset -= literallen;
3272                          }
3273                      } else {
3274                          if (dx > 0) {
3275                              if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen))
3276                                  return -1;
3277                              offset += literallen;
3278                          } else {
3279                              if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
3280                                                           literal, literallen))
3281                                  return -1;
3282                              offset -= literallen;
3283                          }
3284                      }
3285                  }
3286                  op = op.next;
3287                  break;
3288
3289              case Op.CLOSURE:
3290                  {
3291                      /*
3292                       * Saves current position to avoid
3293                       * zero-width repeats.
3294                       */

3295                      int id = op.getData();
3296                      if (id >= 0) {
3297                          int previousOffset = con.offsets[id];
3298                          if (previousOffset < 0 || previousOffset != offset) {
3299                              con.offsets[id] = offset;
3300                          } else {
3301                              con.offsets[id] = -1;
3302                              op = op.next;
3303                              break;
3304                          }
3305                      }
3306                      
3307                      int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts);
3308                      if (id >= 0) con.offsets[id] = -1;
3309                      if (ret >= 0) return ret;
3310                      op = op.next;
3311                  }
3312                  break;
3313
3314              case Op.QUESTION:
3315                  {
3316                      int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts);
3317                      if (ret >= 0) return ret;
3318                      op = op.next;
3319                  }
3320                  break;
3321
3322              case Op.NONGREEDYCLOSURE:
3323              case Op.NONGREEDYQUESTION:
3324                  {
3325                      int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts);
3326                      if (ret >= 0) return ret;
3327                      op = op.getChild();
3328                  }
3329                  break;
3330
3331              case Op.UNION:
3332                  for (int i = 0; i < op.size(); i ++) {
3333                      int ret = this. matchCharacterIterator (con, op.elementAt(i), offset, dx, opts);
3334                      if (DEBUG) {
3335                          System.err.println("UNION: "+i+", ret="+ret);
3336                      }
3337                      if (ret >= 0) return ret;
3338                  }
3339                  return -1;
3340
3341              case Op.CAPTURE:
3342                  int refno = op.getData();
3343                  if (con.match != null && refno > 0) {
3344                      int save = con.match.getBeginning(refno);
3345                      con.match.setBeginning(refno, offset);
3346                      int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts);
3347                      if (ret < 0) con.match.setBeginning(refno, save);
3348                      return ret;
3349                  } else if (con.match != null && refno < 0) {
3350                      int index = -refno;
3351                      int save = con.match.getEnd(index);
3352                      con.match.setEnd(index, offset);
3353                      int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts);
3354                      if (ret < 0) con.match.setEnd(index, save);
3355                      return ret;
3356                  }
3357                  op = op.next;
3358                  break;
3359
3360              case Op.LOOKAHEAD:
3361                  if (0 > this. matchCharacterIterator (con, op.getChild(), offset, 1, opts)) return -1;
3362                  op = op.next;
3363                  break;
3364              case Op.NEGATIVELOOKAHEAD:
3365                  if (0 <= this. matchCharacterIterator (con, op.getChild(), offset, 1, opts)) return -1;
3366                  op = op.next;
3367                  break;
3368              case Op.LOOKBEHIND:
3369                  if (0 > this. matchCharacterIterator (con, op.getChild(), offset, -1, opts)) return -1;
3370                  op = op.next;
3371                  break;
3372              case Op.NEGATIVELOOKBEHIND:
3373                  if (0 <= this. matchCharacterIterator (con, op.getChild(), offset, -1, opts)) return -1;
3374                  op = op.next;
3375                  break;
3376
3377              case Op.INDEPENDENT:
3378                  {
3379                      int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts);
3380                      if (ret < 0) return ret;
3381                      offset = ret;
3382                      op = op.next;
3383                  }
3384                  break;
3385
3386              case Op.MODIFIER:
3387                  {
3388                      int localopts = opts;
3389                      localopts |= op.getData();
3390                      localopts &= ~op.getData2();
3391                      //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16));
3392
int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, localopts);
3393                      if (ret < 0) return ret;
3394                      offset = ret;
3395                      op = op.next;
3396                  }
3397                  break;
3398
3399              case Op.CONDITION:
3400                  {
3401                      Op.ConditionOp cop = (Op.ConditionOp)op;
3402                      boolean matchp = false;
3403                      if (cop.refNumber > 0) {
3404                          if (cop.refNumber >= this.nofparen)
3405                              throw new RuntimeException JavaDoc("Internal Error: Reference number must be more than zero: "+cop.refNumber);
3406                          matchp = con.match.getBeginning(cop.refNumber) >= 0
3407                                   && con.match.getEnd(cop.refNumber) >= 0;
3408                      } else {
3409                          matchp = 0 <= this. matchCharacterIterator (con, cop.condition, offset, dx, opts);
3410                      }
3411
3412                      if (matchp) {
3413                          op = cop.yes;
3414                      } else if (cop.no != null) {
3415                          op = cop.no;
3416                      } else {
3417                          op = cop.next;
3418                      }
3419                  }
3420                  break;
3421
3422              default:
3423                  throw new RuntimeException JavaDoc("Unknown operation type: "+op.type);
3424              } // switch (op.type)
3425
} // while
3426
}
3427
3428      private static final int getPreviousWordType(CharacterIterator JavaDoc target, int begin, int end,
3429                                                   int offset, int opts) {
3430          int ret = getWordType(target, begin, end, --offset, opts);
3431          while (ret == WT_IGNORE)
3432              ret = getWordType(target, begin, end, --offset, opts);
3433          return ret;
3434      }
3435
3436      private static final int getWordType(CharacterIterator JavaDoc target, int begin, int end,
3437                                           int offset, int opts) {
3438          if (offset < begin || offset >= end) return WT_OTHER;
3439          return getWordType0( target .setIndex( offset ) , opts);
3440      }
3441
3442
3443
3444      private static final boolean regionMatches(CharacterIterator JavaDoc target, int offset, int limit,
3445                                                 String JavaDoc part, int partlen) {
3446          if (offset < 0) return false;
3447          if (limit-offset < partlen)
3448              return false;
3449          int i = 0;
3450          while (partlen-- > 0) {
3451              if ( target .setIndex( offset++ ) != part.charAt(i++))
3452                  return false;
3453          }
3454          return true;
3455      }
3456
3457      private static final boolean regionMatches(CharacterIterator JavaDoc target, int offset, int limit,
3458                                                 int offset2, int partlen) {
3459          if (offset < 0) return false;
3460          if (limit-offset < partlen)
3461              return false;
3462          int i = offset2;
3463          while (partlen-- > 0) {
3464              if ( target .setIndex( offset++ ) != target .setIndex( i++ ) )
3465                  return false;
3466          }
3467          return true;
3468      }
3469
3470      /**
3471       * @see java.lang.String#regionMatches
3472       */

3473      private static final boolean regionMatchesIgnoreCase(CharacterIterator JavaDoc target, int offset, int limit,
3474                                                           String JavaDoc part, int partlen) {
3475          if (offset < 0) return false;
3476          if (limit-offset < partlen)
3477              return false;
3478          int i = 0;
3479          while (partlen-- > 0) {
3480              char ch1 = target .setIndex( offset++ ) ;
3481              char ch2 = part.charAt(i++);
3482              if (ch1 == ch2)
3483                  continue;
3484              char uch1 = Character.toUpperCase(ch1);
3485              char uch2 = Character.toUpperCase(ch2);
3486              if (uch1 == uch2)
3487                  continue;
3488              if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
3489                  return false;
3490          }
3491          return true;
3492      }
3493
3494      private static final boolean regionMatchesIgnoreCase(CharacterIterator JavaDoc target, int offset, int limit,
3495                                                           int offset2, int partlen) {
3496          if (offset < 0) return false;
3497          if (limit-offset < partlen)
3498              return false;
3499          int i = offset2;
3500          while (partlen-- > 0) {
3501              char ch1 = target .setIndex( offset++ ) ;
3502              char ch2 = target .setIndex( i++ ) ;
3503              if (ch1 == ch2)
3504                  continue;
3505              char uch1 = Character.toUpperCase(ch1);
3506              char uch2 = Character.toUpperCase(ch2);
3507              if (uch1 == uch2)
3508                  continue;
3509              if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
3510                  return false;
3511          }
3512          return true;
3513      }
3514
3515
3516
3517
3518      // ================================================================
3519

3520      /**
3521       * A regular expression.
3522       * @serial
3523       */

3524      String JavaDoc regex;
3525      /**
3526       * @serial
3527       */

3528      int options;
3529
3530      /**
3531       * The number of parenthesis in the regular expression.
3532       * @serial
3533       */

3534      int nofparen;
3535      /**
3536       * Internal representation of the regular expression.
3537       * @serial
3538       */

3539      Token tokentree;
3540
3541      boolean hasBackReferences = false;
3542
3543      transient int minlength;
3544      transient Op operations = null;
3545      transient int numberOfClosures;
3546      transient Context context = null;
3547      transient RangeToken firstChar = null;
3548
3549      transient String JavaDoc fixedString = null;
3550      transient int fixedStringOptions;
3551      transient BMPattern fixedStringTable = null;
3552      transient boolean fixedStringOnly = false;
3553
3554
3555      static final class Context {
3556          CharacterIterator JavaDoc ciTarget;
3557          String JavaDoc strTarget;
3558          char[] charTarget;
3559          int start;
3560          int limit;
3561          int length;
3562          Match match;
3563          boolean inuse = false;
3564          int[] offsets;
3565
3566          Context() {
3567          }
3568
3569          private void resetCommon(int nofclosures) {
3570              this.length = this.limit-this.start;
3571              this.inuse = true;
3572              this.match = null;
3573              if (this.offsets == null || this.offsets.length != nofclosures)
3574                  this.offsets = new int[nofclosures];
3575              for (int i = 0; i < nofclosures; i ++) this.offsets[i] = -1;
3576          }
3577          void reset(CharacterIterator JavaDoc target, int start, int limit, int nofclosures) {
3578              this.ciTarget = target;
3579              this.start = start;
3580              this.limit = limit;
3581              this.resetCommon(nofclosures);
3582          }
3583          void reset(String JavaDoc target, int start, int limit, int nofclosures) {
3584              this.strTarget = target;
3585              this.start = start;
3586              this.limit = limit;
3587              this.resetCommon(nofclosures);
3588          }
3589          void reset(char[] target, int start, int limit, int nofclosures) {
3590              this.charTarget = target;
3591              this.start = start;
3592              this.limit = limit;
3593              this.resetCommon(nofclosures);
3594          }
3595      }
3596
3597      /**
3598       * Prepares for matching. This method is called just before starting matching.
3599       */

3600      void prepare() {
3601          if (Op.COUNT) Op.nofinstances = 0;
3602          this.compile(this.tokentree);
3603          /*
3604          if (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { // .*
3605              Op anchor = Op.createAnchor(isSet(this.options, SINGLE_LINE) ? 'A' : '@');
3606              anchor.next = this.operations;
3607              this.operations = anchor;
3608          }
3609          */

3610          if (Op.COUNT) System.err.println("DEBUG: The number of operations: "+Op.nofinstances);
3611
3612          this.minlength = this.tokentree.getMinLength();
3613
3614          this.firstChar = null;
3615          if (!isSet(this.options, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION)
3616              && !isSet(this.options, XMLSCHEMA_MODE)) {
3617              RangeToken firstChar = Token.createRange();
3618              int fresult = this.tokentree.analyzeFirstCharacter(firstChar, this.options);
3619              if (fresult == Token.FC_TERMINAL) {
3620                  firstChar.compactRanges();
3621                  this.firstChar = firstChar;
3622                  if (DEBUG)
3623                      System.err.println("DEBUG: Use the first character optimization: "+firstChar);
3624              }
3625          }
3626
3627          if (this.operations != null
3628              && (this.operations.type == Op.STRING || this.operations.type == Op.CHAR)
3629              && this.operations.next == null) {
3630              if (DEBUG)
3631                  System.err.print(" *** Only fixed string! *** ");
3632              this.fixedStringOnly = true;
3633              if (this.operations.type == Op.STRING)
3634                  this.fixedString = this.operations.getString();
3635              else if (this.operations.getData() >= 0x10000) { // Op.CHAR
3636
this.fixedString = REUtil.decomposeToSurrogates(this.operations.getData());
3637              } else {
3638                  char[] ac = new char[1];
3639                  ac[0] = (char)this.operations.getData();
3640                  this.fixedString = new String JavaDoc(ac);
3641              }
3642              this.fixedStringOptions = this.options;
3643              this.fixedStringTable = new BMPattern(this.fixedString, 256,
3644                                                    isSet(this.fixedStringOptions, IGNORE_CASE));
3645          } else if (!isSet(this.options, PROHIBIT_FIXED_STRING_OPTIMIZATION)
3646                     && !isSet(this.options, XMLSCHEMA_MODE)) {
3647              Token.FixedStringContainer container = new Token.FixedStringContainer();
3648              this.tokentree.findFixedString(container, this.options);
3649              this.fixedString = container.token == null ? null : container.token.getString();
3650              this.fixedStringOptions = container.options;
3651              if (this.fixedString != null && this.fixedString.length() < 2)
3652                  this.fixedString = null;
3653              // This pattern has a fixed string of which length is more than one.
3654
if (this.fixedString != null) {
3655                  this.fixedStringTable = new BMPattern(this.fixedString, 256,
3656                                                        isSet(this.fixedStringOptions, IGNORE_CASE));
3657                  if (DEBUG) {
3658                      System.err.println("DEBUG: The longest fixed string: "+this.fixedString.length()
3659                                         +"/" //+this.fixedString
3660
+"/"+REUtil.createOptionString(this.fixedStringOptions));
3661                      System.err.print("String: ");
3662                      REUtil.dumpString(this.fixedString);
3663                  }
3664              }
3665          }
3666      }
3667
3668      /**
3669       * An option.
3670       * If you specify this option, <span class="REGEX"><kbd>(</kbd><var>X</var><kbd>)</kbd></span>
3671       * captures matched text, and <span class="REGEX"><kbd>(:?</kbd><var>X</var><kbd>)</kbd></span>
3672       * does not capture.
3673       *
3674       * @see #RegularExpression(java.lang.String,int)
3675       * @see #setPattern(java.lang.String,int)
3676      static final int MARK_PARENS = 1<<0;
3677       */

3678
3679      /**
3680       * "i"
3681       */

3682      static final int IGNORE_CASE = 1<<1;
3683
3684      /**
3685       * "s"
3686       */

3687      static final int SINGLE_LINE = 1<<2;
3688
3689      /**
3690       * "m"
3691       */

3692      static final int MULTIPLE_LINES = 1<<3;
3693
3694      /**
3695       * "x"
3696       */

3697      static final int EXTENDED_COMMENT = 1<<4;
3698
3699      /**
3700       * This option redefines <span class="REGEX"><kbd>\d \D \w \W \s \S</kbd></span>.
3701       *
3702       * @see #RegularExpression(java.lang.String,int)
3703       * @see #setPattern(java.lang.String,int)
3704       * @see #UNICODE_WORD_BOUNDARY
3705       */

3706      static final int USE_UNICODE_CATEGORY = 1<<5; // "u"
3707

3708      /**
3709       * An option.
3710       * This enables to process locale-independent word boundary for <span class="REGEX"><kbd>\b \B \&lt; \></kbd></span>.
3711       * <p>By default, the engine considers a position between a word character
3712       * (<span class="REGEX"><Kbd>\w</kbd></span>) and a non word character
3713       * is a word boundary.
3714       * <p>By this option, the engine checks word boundaries with the method of
3715       * 'Unicode Regular Expression Guidelines' Revision 4.
3716       *
3717       * @see #RegularExpression(java.lang.String,int)
3718       * @see #setPattern(java.lang.String,int)
3719       */

3720      static final int UNICODE_WORD_BOUNDARY = 1<<6; // "w"
3721

3722      /**
3723       * "H"
3724       */

3725      static final int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 1<<7;
3726      /**
3727       * "F"
3728       */

3729      static final int PROHIBIT_FIXED_STRING_OPTIMIZATION = 1<<8;
3730      /**
3731       * "X". XML Schema mode.
3732       */

3733      static final int XMLSCHEMA_MODE = 1<<9;
3734      /**
3735       * ",".
3736       */

3737      static final int SPECIAL_COMMA = 1<<10;
3738
3739
3740      private static final boolean isSet(int options, int flag) {
3741          return (options & flag) == flag;
3742      }
3743
3744      /**
3745       * Creates a new RegularExpression instance.
3746       *
3747       * @param regex A regular expression
3748       * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
3749       */

3750      public RegularExpression(String JavaDoc regex) throws ParseException {
3751          this.setPattern(regex, null);
3752      }
3753
3754      /**
3755       * Creates a new RegularExpression instance with options.
3756       *
3757       * @param regex A regular expression
3758       * @param options A String consisted of "i" "m" "s" "u" "w" "," "X"
3759       * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
3760       */

3761      public RegularExpression(String JavaDoc regex, String JavaDoc options) throws ParseException {
3762          this.setPattern(regex, options);
3763      }
3764
3765      RegularExpression(String JavaDoc regex, Token tok, int parens, boolean hasBackReferences, int options) {
3766          this.regex = regex;
3767          this.tokentree = tok;
3768          this.nofparen = parens;
3769          this.options = options;
3770          this.hasBackReferences = hasBackReferences;
3771      }
3772
3773      /**
3774       *
3775       */

3776      public void setPattern(String JavaDoc newPattern) throws ParseException {
3777          this.setPattern(newPattern, this.options);
3778      }
3779
3780      private void setPattern(String JavaDoc newPattern, int options) throws ParseException {
3781          this.regex = newPattern;
3782          this.options = options;
3783          RegexParser rp = RegularExpression.isSet(this.options, RegularExpression.XMLSCHEMA_MODE)
3784                           ? new ParserForXMLSchema() : new RegexParser();
3785          this.tokentree = rp.parse(this.regex, this.options);
3786          this.nofparen = rp.parennumber;
3787          this.hasBackReferences = rp.hasBackReferences;
3788
3789          this.operations = null;
3790          this.context = null;
3791      }
3792      /**
3793       *
3794       */

3795      public void setPattern(String JavaDoc newPattern, String JavaDoc options) throws ParseException {
3796          this.setPattern(newPattern, REUtil.parseOptions(options));
3797      }
3798
3799      /**
3800       *
3801       */

3802      public String JavaDoc getPattern() {
3803          return this.regex;
3804      }
3805
3806      /**
3807       * Represents this instence in String.
3808       */

3809      public String JavaDoc toString() {
3810          return this.tokentree.toString(this.options);
3811      }
3812
3813      /**
3814       * Returns a option string.
3815       * The order of letters in it may be different from a string specified
3816       * in a constructor or <code>setPattern()</code>.
3817       *
3818       * @see #RegularExpression(String, String)
3819       * @see #setPattern(java.lang.String,java.lang.String)
3820       */

3821      public String JavaDoc getOptions() {
3822          return REUtil.createOptionString(this.options);
3823      }
3824
3825      /**
3826       * Return true if patterns are the same and the options are equivalent.
3827       */

3828      public boolean equals(Object JavaDoc obj) {
3829          if (obj == null) return false;
3830          if (!(obj instanceof RegularExpression))
3831              return false;
3832          RegularExpression r = (RegularExpression)obj;
3833          return this.regex.equals(r.regex) && this.options == r.options;
3834      }
3835
3836      boolean equals(String JavaDoc pattern, int options) {
3837          return this.regex.equals(pattern) && this.options == options;
3838      }
3839
3840      /**
3841       *
3842       */

3843      public int hashCode() {
3844          return (this.regex+"/"+this.getOptions()).hashCode();
3845      }
3846
3847      /**
3848       * Return the number of regular expression groups.
3849       * This method returns 1 when the regular expression has no capturing-parenthesis.
3850       *
3851       */

3852      public int getNumberOfGroups() {
3853          return this.nofparen;
3854      }
3855
3856      // ================================================================
3857

3858      private static final int WT_IGNORE = 0;
3859      private static final int WT_LETTER = 1;
3860      private static final int WT_OTHER = 2;
3861      private static final int getWordType0(char ch, int opts) {
3862          if (!isSet(opts, UNICODE_WORD_BOUNDARY)) {
3863              if (isSet(opts, USE_UNICODE_CATEGORY)) {
3864                  return (Token.getRange("IsWord", true).match(ch)) ? WT_LETTER : WT_OTHER;
3865              }
3866              return isWordChar(ch) ? WT_LETTER : WT_OTHER;
3867          }
3868
3869          switch (Character.getType(ch)) {
3870          case Character.UPPERCASE_LETTER: // L
3871
case Character.LOWERCASE_LETTER: // L
3872
case Character.TITLECASE_LETTER: // L
3873
case Character.MODIFIER_LETTER: // L
3874
case Character.OTHER_LETTER: // L
3875
case Character.LETTER_NUMBER: // N
3876
case Character.DECIMAL_DIGIT_NUMBER: // N
3877
case Character.OTHER_NUMBER: // N
3878
case Character.COMBINING_SPACING_MARK: // Mc
3879
return WT_LETTER;
3880
3881          case Character.FORMAT: // Cf
3882
case Character.NON_SPACING_MARK: // Mn
3883
case Character.ENCLOSING_MARK: // Mc
3884
return WT_IGNORE;
3885
3886          case Character.CONTROL: // Cc
3887
switch (ch) {
3888              case '\t':
3889              case '\n':
3890              case '\u000B':
3891              case '\f':
3892              case '\r':
3893                  return WT_OTHER;
3894              default:
3895                  return WT_IGNORE;
3896              }
3897
3898          default:
3899              return WT_OTHER;
3900          }
3901      }
3902
3903      // ================================================================
3904

3905      static final int LINE_FEED = 0x000A;
3906      static final int CARRIAGE_RETURN = 0x000D;
3907      static final int LINE_SEPARATOR = 0x2028;
3908      static final int PARAGRAPH_SEPARATOR = 0x2029;
3909
3910      private static final boolean isEOLChar(int ch) {
3911          return ch == LINE_FEED || ch == CARRIAGE_RETURN || ch == LINE_SEPARATOR
3912          || ch == PARAGRAPH_SEPARATOR;
3913      }
3914
3915      private static final boolean isWordChar(int ch) { // Legacy word characters
3916
if (ch == '_') return true;
3917          if (ch < '0') return false;
3918          if (ch > 'z') return false;
3919          if (ch <= '9') return true;
3920          if (ch < 'A') return false;
3921          if (ch <= 'Z') return true;
3922          if (ch < 'a') return false;
3923          return true;
3924      }
3925
3926      private static final boolean matchIgnoreCase(int chardata, int ch) {
3927          if (chardata == ch) return true;
3928          if (chardata > 0xffff || ch > 0xffff) return false;
3929          char uch1 = Character.toUpperCase((char)chardata);
3930          char uch2 = Character.toUpperCase((char)ch);
3931          if (uch1 == uch2) return true;
3932          return Character.toLowerCase(uch1) == Character.toLowerCase(uch2);
3933      }
3934  }
3935  
3936  public static class ParseException extends RuntimeException JavaDoc {
3937    int location;
3938
3939    /*
3940    public ParseException(String mes) {
3941        this(mes, -1);
3942    }
3943    */

3944    /**
3945     *
3946     */

3947    public ParseException(String JavaDoc mes, int location) {
3948        super(mes);
3949        this.location = location;
3950    }
3951
3952    /**
3953     *
3954     * @return -1 if location information is not available.
3955     */

3956    public int getLocation() {
3957        return this.location;
3958    }
3959}
3960
3961  static class Op {
3962    static final int DOT = 0;
3963    static final int CHAR = 1; // Single character
3964
static final int RANGE = 3; // [a-zA-Z]
3965
static final int NRANGE = 4; // [^a-zA-Z]
3966
static final int ANCHOR = 5; // ^ $ ...
3967
static final int STRING = 6; // literal String
3968
static final int CLOSURE = 7; // X*
3969
static final int NONGREEDYCLOSURE = 8; // X*?
3970
static final int QUESTION = 9; // X?
3971
static final int NONGREEDYQUESTION = 10; // X??
3972
static final int UNION = 11; // X|Y
3973
static final int CAPTURE = 15; // ( and )
3974
static final int BACKREFERENCE = 16; // \1 \2 ...
3975
static final int LOOKAHEAD = 20; // (?=...)
3976
static final int NEGATIVELOOKAHEAD = 21; // (?!...)
3977
static final int LOOKBEHIND = 22; // (?<=...)
3978
static final int NEGATIVELOOKBEHIND = 23; // (?<!...)
3979
static final int INDEPENDENT = 24; // (?>...)
3980
static final int MODIFIER = 25; // (?ims-ims:...)
3981
static final int CONDITION = 26; // (?(..)yes|no)
3982

3983    static int nofinstances = 0;
3984    static final boolean COUNT = false;
3985
3986    static Op createDot() {
3987        if (Op.COUNT) Op.nofinstances ++;
3988        return new Op(Op.DOT);
3989    }
3990    static CharOp createChar(int data) {
3991        if (Op.COUNT) Op.nofinstances ++;
3992        return new CharOp(Op.CHAR, data);
3993    }
3994    static CharOp createAnchor(int data) {
3995        if (Op.COUNT) Op.nofinstances ++;
3996        return new CharOp(Op.ANCHOR, data);
3997    }
3998    static CharOp createCapture(int number, Op next) {
3999        if (Op.COUNT) Op.nofinstances ++;
4000        CharOp op = new CharOp(Op.CAPTURE, number);
4001        op.next = next;
4002        return op;
4003    }
4004    static UnionOp createUnion(int size) {
4005        if (Op.COUNT) Op.nofinstances ++;
4006        //System.err.println("Creates UnionOp");
4007
return new UnionOp(Op.UNION, size);
4008    }
4009    static ChildOp createClosure(int id) {
4010        if (Op.COUNT) Op.nofinstances ++;
4011        return new ModifierOp(Op.CLOSURE, id, -1);
4012    }
4013    static ChildOp createNonGreedyClosure() {
4014        if (Op.COUNT) Op.nofinstances ++;
4015        return new ChildOp(Op.NONGREEDYCLOSURE);
4016    }
4017    static ChildOp createQuestion(boolean nongreedy) {
4018        if (Op.COUNT) Op.nofinstances ++;
4019        return new ChildOp(nongreedy ? Op.NONGREEDYQUESTION : Op.QUESTION);
4020    }
4021    static RangeOp createRange(Token tok) {
4022        if (Op.COUNT) Op.nofinstances ++;
4023        return new RangeOp(Op.RANGE, tok);
4024    }
4025    static ChildOp createLook(int type, Op next, Op branch) {
4026        if (Op.COUNT) Op.nofinstances ++;
4027        ChildOp op = new ChildOp(type);
4028        op.setChild(branch);
4029        op.next = next;
4030        return op;
4031    }
4032    static CharOp createBackReference(int refno) {
4033        if (Op.COUNT) Op.nofinstances ++;
4034        return new CharOp(Op.BACKREFERENCE, refno);
4035    }
4036    static StringOp createString(String JavaDoc literal) {
4037        if (Op.COUNT) Op.nofinstances ++;
4038        return new StringOp(Op.STRING, literal);
4039    }
4040    static ChildOp createIndependent(Op next, Op branch) {
4041        if (Op.COUNT) Op.nofinstances ++;
4042        ChildOp op = new ChildOp(Op.INDEPENDENT);
4043        op.setChild(branch);
4044        op.next = next;
4045        return op;
4046    }
4047    static ModifierOp createModifier(Op next, Op branch, int add, int mask) {
4048        if (Op.COUNT) Op.nofinstances ++;
4049        ModifierOp op = new ModifierOp(Op.MODIFIER, add, mask);
4050        op.setChild(branch);
4051        op.next = next;
4052        return op;
4053    }
4054    static ConditionOp createCondition(Op next, int ref, Op conditionflow, Op yesflow, Op noflow) {
4055        if (Op.COUNT) Op.nofinstances ++;
4056        ConditionOp op = new ConditionOp(Op.CONDITION, ref, conditionflow, yesflow, noflow);
4057        op.next = next;
4058        return op;
4059    }
4060
4061    int type;
4062    Op next = null;
4063
4064    protected Op(int type) {
4065        this.type = type;
4066    }
4067
4068    int size() { // for UNION
4069
return 0;
4070    }
4071    Op elementAt(int index) { // for UNIoN
4072
throw new RuntimeException JavaDoc("Internal Error: type="+this.type);
4073    }
4074    Op getChild() { // for CLOSURE, QUESTION
4075
throw new RuntimeException JavaDoc("Internal Error: type="+this.type);
4076    }
4077                                                // ModifierOp
4078
int getData() { // CharOp for CHAR, BACKREFERENCE, CAPTURE, ANCHOR,
4079
throw new RuntimeException JavaDoc("Internal Error: type="+this.type);
4080    }
4081    int getData2() { // ModifierOp
4082
throw new RuntimeException JavaDoc("Internal Error: type="+this.type);
4083    }
4084    RangeToken getToken() { // RANGE, NRANGE
4085
throw new RuntimeException JavaDoc("Internal Error: type="+this.type);
4086    }
4087    String JavaDoc getString() { // STRING
4088
throw new RuntimeException JavaDoc("Internal Error: type="+this.type);
4089    }
4090
4091    // ================================================================
4092
static class CharOp extends Op {
4093        int charData;
4094        CharOp(int type, int data) {
4095            super(type);
4096            this.charData = data;
4097        }
4098        int getData() {
4099            return this.charData;
4100        }
4101    }
4102
4103    // ================================================================
4104
static class UnionOp extends Op {
4105        Vector JavaDoc branches;
4106        UnionOp(int type, int size) {
4107            super(type);
4108            this.branches = new Vector JavaDoc(size);
4109        }
4110        void addElement(Op op) {
4111            this.branches.addElement(op);
4112        }
4113        int size() {
4114            return this.branches.size();
4115        }
4116        Op elementAt(int index) {
4117            return (Op)this.branches.elementAt(index);
4118        }
4119    }
4120
4121    // ================================================================
4122
static class ChildOp extends Op {
4123        Op child;
4124        ChildOp(int type) {
4125            super(type);
4126        }
4127        void setChild(Op child) {
4128            this.child = child;
4129        }
4130        Op getChild() {
4131            return this.child;
4132        }
4133    }
4134    // ================================================================
4135
static class ModifierOp extends ChildOp {
4136        int v1;
4137        int v2;
4138        ModifierOp(int type, int v1, int v2) {
4139            super(type);
4140            this.v1 = v1;
4141            this.v2 = v2;
4142        }
4143        int getData() {
4144            return this.v1;
4145        }
4146        int getData2() {
4147            return this.v2;
4148        }
4149    }
4150    // ================================================================
4151
static class RangeOp extends Op {
4152        Token tok;
4153        RangeOp(int type, Token tok) {
4154            super(type);
4155            this.tok = tok;
4156        }
4157        RangeToken getToken() {
4158            return (RangeToken)this.tok;
4159        }
4160    }
4161    // ================================================================
4162
static class StringOp extends Op {
4163        String JavaDoc string;
4164        StringOp(int type, String JavaDoc literal) {
4165            super(type);
4166            this.string = literal;
4167        }
4168        String JavaDoc getString() {
4169            return this.string;
4170        }
4171    }
4172    // ================================================================
4173
static class ConditionOp extends Op {
4174        int refNumber;
4175        Op condition;
4176        Op yes;
4177        Op no;
4178        ConditionOp(int type, int refno, Op conditionflow, Op yesflow, Op noflow) {
4179            super(type);
4180            this.refNumber = refno;
4181            this.condition = conditionflow;
4182            this.yes = yesflow;
4183            this.no = noflow;
4184        }
4185    }
4186}
4187
4188  final static class RangeToken extends Token implements java.io.Serializable JavaDoc {
4189
4190    int[] ranges;
4191    boolean sorted;
4192    boolean compacted;
4193    RangeToken icaseCache = null;
4194    int[] map = null;
4195    int nonMapIndex;
4196
4197    RangeToken(int type) {
4198        super(type);
4199        this.setSorted(false);
4200    }
4201
4202                                                // for RANGE or NRANGE
4203
protected void addRange(int start, int end) {
4204        this.icaseCache = null;
4205        //System.err.println("Token#addRange(): "+start+" "+end);
4206
int r1, r2;
4207        if (start <= end) {
4208            r1 = start;
4209            r2 = end;
4210        } else {
4211            r1 = end;
4212            r2 = start;
4213        }
4214
4215        int pos = 0;
4216        if (this.ranges == null) {
4217            this.ranges = new int[2];
4218            this.ranges[0] = r1;
4219            this.ranges[1] = r2;
4220            this.setSorted(true);
4221        } else {
4222            pos = this.ranges.length;
4223            if (this.ranges[pos-1]+1 == r1) {
4224                this.ranges[pos-1] = r2;
4225                return;
4226            }
4227            int[] temp = new int[pos+2];
4228            System.arraycopy(this.ranges, 0, temp, 0, pos);
4229            this.ranges = temp;
4230            if (this.ranges[pos-1] >= r1)
4231                this.setSorted(false);
4232            this.ranges[pos++] = r1;
4233            this.ranges[pos] = r2;
4234            if (!this.sorted)
4235                this.sortRanges();
4236        }
4237    }
4238
4239    private final boolean isSorted() {
4240        return this.sorted;
4241    }
4242    private final void setSorted(boolean sort) {
4243        this.sorted = sort;
4244        if (!sort) this.compacted = false;
4245    }
4246    private final boolean isCompacted() {
4247        return this.compacted;
4248    }
4249    private final void setCompacted() {
4250        this.compacted = true;
4251    }
4252
4253    protected void sortRanges() {
4254        if (this.isSorted())
4255            return;
4256        if (this.ranges == null)
4257            return;
4258        //System.err.println("Do sorting: "+this.ranges.length);
4259

4260                                                // Bubble sort
4261
// Why? -- In many cases,
4262
// this.ranges has few elements.
4263
for (int i = this.ranges.length-4; i >= 0; i -= 2) {
4264            for (int j = 0; j <= i; j += 2) {
4265                if (this.ranges[j] > this.ranges[j+2]
4266                    || this.ranges[j] == this.ranges[j+2] && this.ranges[j+1] > this.ranges[j+3]) {
4267                    int tmp;
4268                    tmp = this.ranges[j+2];
4269                    this.ranges[j+2] = this.ranges[j];
4270                    this.ranges[j] = tmp;
4271                    tmp = this.ranges[j+3];
4272                    this.ranges[j+3] = this.ranges[j+1];
4273                    this.ranges[j+1] = tmp;
4274                }
4275            }
4276        }
4277        this.setSorted(true);
4278    }
4279
4280    /**
4281     * this.ranges is sorted.
4282     */

4283    protected void compactRanges() {
4284        boolean DEBUG = false;
4285        if (this.ranges == null || this.ranges.length <= 2)
4286            return;
4287        if (this.isCompacted())
4288            return;
4289        int base = 0; // Index of writing point
4290
int target = 0; // Index of processing point
4291

4292        while (target < this.ranges.length) {
4293            if (base != target) {
4294                this.ranges[base] = this.ranges[target++];
4295                this.ranges[base+1] = this.ranges[target++];
4296            } else
4297                target += 2;
4298            int baseend = this.ranges[base+1];
4299            while (target < this.ranges.length) {
4300                if (baseend+1 < this.ranges[target])
4301                    break;
4302                if (baseend+1 == this.ranges[target]) {
4303                    if (DEBUG)
4304                        System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
4305                                           +", "+this.ranges[base+1]
4306                                           +"], ["+this.ranges[target]
4307                                           +", "+this.ranges[target+1]
4308                                           +"] -> ["+this.ranges[base]
4309                                           +", "+this.ranges[target+1]
4310                                           +"]");
4311                    this.ranges[base+1] = this.ranges[target+1];
4312                    baseend = this.ranges[base+1];
4313                    target += 2;
4314                } else if (baseend >= this.ranges[target+1]) {
4315                    if (DEBUG)
4316                        System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
4317                                           +", "+this.ranges[base+1]
4318                                           +"], ["+this.ranges[target]
4319                                           +", "+this.ranges[target+1]
4320                                           +"] -> ["+this.ranges[base]
4321                                           +", "+this.ranges[base+1]
4322                                           +"]");
4323                    target += 2;
4324                } else if (baseend < this.ranges[target+1]) {
4325                    if (DEBUG)
4326                        System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
4327                                           +", "+this.ranges[base+1]
4328                                           +"], ["+this.ranges[target]
4329                                           +", "+this.ranges[target+1]
4330                                           +"] -> ["+this.ranges[base]
4331                                           +", "+this.ranges[target+1]
4332                                           +"]");
4333                    this.ranges[base+1] = this.ranges[target+1];
4334                    baseend = this.ranges[base+1];
4335                    target += 2;
4336                } else {
4337                    throw new RuntimeException JavaDoc("Token#compactRanges(): Internel Error: ["
4338                                               +this.ranges[base]
4339                                               +","+this.ranges[base+1]
4340                                               +"] ["+this.ranges[target]
4341                                               +","+this.ranges[target+1]+"]");
4342                }
4343            } // while
4344
base += 2;
4345        }
4346
4347        if (base != this.ranges.length) {
4348            int[] result = new int[base];
4349            System.arraycopy(this.ranges, 0, result, 0, base);
4350            this.ranges = result;
4351        }
4352        this.setCompacted();
4353    }
4354
4355    protected void mergeRanges(Token token) {
4356        RangeToken tok = (RangeToken)token;
4357        this.sortRanges();
4358        tok.sortRanges();
4359        if (tok.ranges == null)
4360            return;
4361        this.icaseCache = null;
4362        this.setSorted(true);
4363        if (this.ranges == null) {
4364            this.ranges = new int[tok.ranges.length];
4365            System.arraycopy(tok.ranges, 0, this.ranges, 0, tok.ranges.length);
4366            return;
4367        }
4368        int[] result = new int[this.ranges.length+tok.ranges.length];
4369        for (int i = 0, j = 0, k = 0; i < this.ranges.length || j < tok.ranges.length;) {
4370            if (i >= this.ranges.length) {
4371                result[k++] = tok.ranges[j++];
4372                result[k++] = tok.ranges[j++];
4373            } else if (j >= tok.ranges.length) {
4374                result[k++] = this.ranges[i++];
4375                result[k++] = this.ranges[i++];
4376            } else if (tok.ranges[j] < this.ranges[i]
4377                       || tok.ranges[j] == this.ranges[i] && tok.ranges[j+1] < this.ranges[i+1]) {
4378                result[k++] = tok.ranges[j++];
4379                result[k++] = tok.ranges[j++];
4380            } else {
4381                result[k++] = this.ranges[i++];
4382                result[k++] = this.ranges[i++];
4383            }
4384        }
4385        this.ranges = result;
4386    }
4387
4388    protected void subtractRanges(Token token) {
4389        if (token.type == NRANGE) {
4390            this.intersectRanges(token);
4391            return;
4392        }
4393        RangeToken tok = (RangeToken)token;
4394        if (tok.ranges == null || this.ranges == null)
4395            return;
4396        this.icaseCache = null;
4397        this.sortRanges();
4398        this.compactRanges();
4399        tok.sortRanges();
4400        tok.compactRanges();
4401
4402        //System.err.println("Token#substractRanges(): Entry: "+this.ranges.length+", "+tok.ranges.length);
4403

4404        int[] result = new int[this.ranges.length+tok.ranges.length];
4405        int wp = 0, src = 0, sub = 0;
4406        while (src < this.ranges.length && sub < tok.ranges.length) {
4407            int srcbegin = this.ranges[src];
4408            int srcend = this.ranges[src+1];
4409            int subbegin = tok.ranges[sub];
4410            int subend = tok.ranges[sub+1];
4411            if (srcend < subbegin) { // Not overlapped
4412
// src: o-----o
4413
// sub: o-----o
4414
// res: o-----o
4415
// Reuse sub
4416
result[wp++] = this.ranges[src++];
4417                result[wp++] = this.ranges[src++];
4418            } else if (srcend >= subbegin
4419                       && srcbegin <= subend) { // Overlapped
4420
// src: o--------o
4421
// sub: o----o
4422
// sub: o----o
4423
// sub: o----o
4424
// sub: o------------o
4425
if (subbegin <= srcbegin && srcend <= subend) {
4426                                                // src: o--------o
4427
// sub: o------------o
4428
// res: empty
4429
// Reuse sub
4430
src += 2;
4431                } else if (subbegin <= srcbegin) {
4432                                                // src: o--------o
4433
// sub: o----o
4434
// res: o-----o
4435
// Reuse src(=res)
4436
this.ranges[src] = subend+1;
4437                    sub += 2;
4438                } else if (srcend <= subend) {
4439                                                // src: o--------o
4440
// sub: o----o
4441
// res: o-----o
4442
// Reuse sub
4443
result[wp++] = srcbegin;
4444                    result[wp++] = subbegin-1;
4445                    src += 2;
4446                } else {
4447                                                // src: o--------o
4448
// sub: o----o
4449
// res: o-o o-o
4450
// Reuse src(=right res)
4451
result[wp++] = srcbegin;
4452                    result[wp++] = subbegin-1;
4453                    this.ranges[src] = subend+1;
4454                    sub += 2;
4455                }
4456            } else if (subend < srcbegin) {
4457                                                // Not overlapped
4458
// src: o-----o
4459
// sub: o----o
4460
sub += 2;
4461            } else {
4462                throw new RuntimeException JavaDoc("Token#subtractRanges(): Internal Error: ["+this.ranges[src]
4463                                           +","+this.ranges[src+1]
4464                                           +"] - ["+tok.ranges[sub]
4465                                           +","+tok.ranges[sub+1]
4466                                           +"]");
4467            }
4468        }
4469        while (src < this.ranges.length) {
4470            result[wp++] = this.ranges[src++];
4471            result[wp++] = this.ranges[src++];
4472        }
4473        this.ranges = new int[wp];
4474        System.arraycopy(result, 0, this.ranges, 0, wp);
4475                                                // this.ranges is sorted and compacted.
4476
}
4477
4478    /**
4479     * @param tok Ignore whether it is NRANGE or not.
4480     */

4481    protected void intersectRanges(Token token) {
4482        RangeToken tok = (RangeToken)token;
4483        if (tok.ranges == null || this.ranges == null)
4484            return;
4485        this.icaseCache = null;
4486        this.sortRanges();
4487        this.compactRanges();
4488        tok.sortRanges();
4489        tok.compactRanges();
4490
4491        int[] result = new int[this.ranges.length+tok.ranges.length];
4492        int wp = 0, src1 = 0, src2 = 0;
4493        while (src1 < this.ranges.length && src2 < tok.ranges.length) {
4494            int src1begin = this.ranges[src1];
4495            int src1end = this.ranges[src1+1];
4496            int src2begin = tok.ranges[src2];
4497            int src2end = tok.ranges[src2+1];
4498            if (src1end < src2begin) { // Not overlapped
4499
// src1: o-----o
4500
// src2: o-----o
4501
// res: empty
4502
// Reuse src2
4503
src1 += 2;
4504            } else if (src1end >= src2begin
4505                       && src1begin <= src2end) { // Overlapped
4506
// src1: o--------o
4507
// src2: o----o
4508
// src2: o----o
4509
// src2: o----o
4510
// src2: o------------o
4511
if (src2begin <= src2begin && src1end <= src2end) {
4512                                                // src1: o--------o
4513
// src2: o------------o
4514
// res: o--------o
4515
// Reuse src2
4516
result[wp++] = src1begin;
4517                    result[wp++] = src1end;
4518                    src1 += 2;
4519                } else if (src2begin <= src1begin) {
4520                                                // src1: o--------o
4521
// src2: o----o
4522
// res: o--o
4523
// Reuse the rest of src1
4524
result[wp++] = src1begin;
4525                    result[wp++] = src2end;
4526                    this.ranges[src1] = src2end+1;
4527                    src2 += 2;
4528                } else if (src1end <= src2end) {
4529                                                // src1: o--------o
4530
// src2: o----o
4531
// res: o--o
4532
// Reuse src2
4533
result[wp++] = src2begin;
4534                    result[wp++] = src1end;
4535                    src1 += 2;
4536                } else {
4537                                                // src1: o--------o
4538
// src2: o----o
4539
// res: o----o
4540
// Reuse the rest of src1
4541
result[wp++] = src2begin;
4542                    result[wp++] = src2end;
4543                    this.ranges[src1] = src2end+1;
4544                }
4545            } else if (src2end < src1begin) {
4546                                                // Not overlapped
4547
// src1: o-----o
4548
// src2: o----o
4549
src2 += 2;
4550            } else {
4551                throw new RuntimeException JavaDoc("Token#intersectRanges(): Internal Error: ["
4552                                           +this.ranges[src1]
4553                                           +","+this.ranges[src1+1]
4554                                           +"] & ["+tok.ranges[src2]
4555                                           +","+tok.ranges[src2+1]
4556                                           +"]");
4557            }
4558        }
4559        while (src1 < this.ranges.length) {
4560            result[wp++] = this.ranges[src1++];
4561            result[wp++] = this.ranges[src1++];
4562        }
4563        this.ranges = new int[wp];
4564        System.arraycopy(result, 0, this.ranges, 0, wp);
4565                                                // this.ranges is sorted and compacted.
4566
}
4567
4568    /**
4569     * for RANGE: Creates complement.
4570     * for NRANGE: Creates the same meaning RANGE.
4571     */

4572    static Token complementRanges(Token token) {
4573        if (token.type != RANGE && token.type != NRANGE)
4574            throw new IllegalArgumentException JavaDoc("Token#complementRanges(): must be RANGE: "+token.type);
4575        RangeToken tok = (RangeToken)token;
4576        tok.sortRanges();
4577        tok.compactRanges();
4578        int len = tok.ranges.length+2;
4579        if (tok.ranges[0] == 0)
4580            len -= 2;
4581        int last = tok.ranges[tok.ranges.length-1];
4582        if (last == UTF16_MAX)
4583            len -= 2;
4584        RangeToken ret = Token.createRange();
4585        ret.ranges = new int[len];
4586        int wp = 0;
4587        if (tok.ranges[0] > 0) {
4588            ret.ranges[wp++] = 0;
4589            ret.ranges[wp++] = tok.ranges[0]-1;
4590        }
4591        for (int i = 1; i < tok.ranges.length-2; i += 2) {
4592            ret.ranges[wp++] = tok.ranges[i]+1;
4593            ret.ranges[wp++] = tok.ranges[i+1]-1;
4594        }
4595        if (last != UTF16_MAX) {
4596            ret.ranges[wp++] = last+1;
4597            ret.ranges[wp] = UTF16_MAX;
4598        }
4599        ret.setCompacted();
4600        return ret;
4601    }
4602
4603    synchronized RangeToken getCaseInsensitiveToken() {
4604        if (this.icaseCache != null)
4605            return this.icaseCache;
4606            
4607        RangeToken uppers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange();
4608        for (int i = 0; i < this.ranges.length; i += 2) {
4609            for (int ch = this.ranges[i]; ch <= this.ranges[i+1]; ch ++) {
4610                if (ch > 0xffff)
4611                    uppers.addRange(ch, ch);
4612                else {
4613                    char uch = Character.toUpperCase((char)ch);
4614                    uppers.addRange(uch, uch);
4615                }
4616            }
4617        }
4618        RangeToken lowers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange();
4619        for (int i = 0; i < uppers.ranges.length; i += 2) {
4620            for (int ch = uppers.ranges[i]; ch <= uppers.ranges[i+1]; ch ++) {
4621                if (ch > 0xffff)
4622                    lowers.addRange(ch, ch);
4623                else {
4624                    char uch = Character.toUpperCase((char)ch);
4625                    lowers.addRange(uch, uch);
4626                }
4627            }
4628        }
4629        lowers.mergeRanges(uppers);
4630        lowers.mergeRanges(this);
4631        lowers.compactRanges();
4632
4633        this.icaseCache = lowers;
4634        return lowers;
4635    }
4636
4637    void dumpRanges() {
4638        System.err.print("RANGE: ");
4639        if (this.ranges == null)
4640            System.err.println(" NULL");
4641        for (int i = 0; i < this.ranges.length; i += 2) {
4642            System.err.print("["+this.ranges[i]+","+this.ranges[i+1]+"] ");
4643        }
4644        System.err.println("");
4645    }
4646
4647    boolean match(int ch) {
4648        if (this.map == null) this.createMap();
4649        boolean ret;
4650        if (this.type == RANGE) {
4651            if (ch < MAPSIZE)
4652                return (this.map[ch/32] & (1<<(ch&0x1f))) != 0;
4653            ret = false;
4654            for (int i = this.nonMapIndex; i < this.ranges.length; i += 2) {
4655                if (this.ranges[i] <= ch && ch <= this.ranges[i+1])
4656                    return true;
4657            }
4658        } else {
4659            if (ch < MAPSIZE)
4660                return (this.map[ch/32] & (1<<(ch&0x1f))) == 0;
4661            ret = true;
4662            for (int i = this.nonMapIndex; i < this.ranges.length; i += 2) {
4663                if (this.ranges[i] <= ch && ch <= this.ranges[i+1])
4664                    return false;
4665            }
4666        }
4667        return ret;
4668    }
4669
4670    private static final int MAPSIZE = 256;
4671    private void createMap() {
4672        int asize = MAPSIZE/32; // 32 is the number of bits in `int'.
4673
this.map = new int[asize];
4674        this.nonMapIndex = this.ranges.length;
4675        for (int i = 0; i < asize; i ++) this.map[i] = 0;
4676        for (int i = 0; i < this.ranges.length; i += 2) {
4677            int s = this.ranges[i];
4678            int e = this.ranges[i+1];
4679            if (s < MAPSIZE) {
4680                for (int j = s; j <= e && j < MAPSIZE; j ++)
4681                    this.map[j/32] |= 1<<(j&0x1f); // s&0x1f : 0-31
4682
} else {
4683                this.nonMapIndex = i;
4684                break;
4685            }
4686            if (e >= MAPSIZE) {
4687                this.nonMapIndex = i;
4688                break;
4689            }
4690        }
4691        //for (int i = 0; i < asize; i ++) System.err.println("Map: "+Integer.toString(this.map[i], 16));
4692
}
4693
4694    public String JavaDoc toString(int options) {
4695        String JavaDoc ret;
4696        if (this.type == RANGE) {
4697            if (this == Token.token_dot)
4698                ret = ".";
4699            else if (this == Token.token_0to9)
4700                ret = "\\d";
4701            else if (this == Token.token_wordchars)
4702                ret = "\\w";
4703            else if (this == Token.token_spaces)
4704                ret = "\\s";
4705            else {
4706                StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
4707                sb.append("[");
4708                for (int i = 0; i < this.ranges.length; i += 2) {
4709                    if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0) sb.append(",");
4710                    if (this.ranges[i] == this.ranges[i+1]) {
4711                        sb.append(escapeCharInCharClass(this.ranges[i]));
4712                    } else {
4713                        sb.append(escapeCharInCharClass(this.ranges[i]));
4714                        sb.append('-');
4715                        sb.append(escapeCharInCharClass(this.ranges[i+1]));
4716                    }
4717                }
4718                sb.append("]");
4719                ret = sb.toString();
4720            }
4721        } else {
4722            if (this == Token.token_not_0to9)
4723                ret = "\\D";
4724            else if (this == Token.token_not_wordchars)
4725                ret = "\\W";
4726            else if (this == Token.token_not_spaces)
4727                ret = "\\S";
4728            else {
4729                StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
4730                sb.append("[^");
4731                for (int i = 0; i < this.ranges.length; i += 2) {
4732                    if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0) sb.append(",");
4733                    if (this.ranges[i] == this.ranges[i+1]) {
4734                        sb.append(escapeCharInCharClass(this.ranges[i]));
4735                    } else {
4736                        sb.append(escapeCharInCharClass(this.ranges[i]));
4737                        sb.append('-');
4738                        sb.append(escapeCharInCharClass(this.ranges[i+1]));
4739                    }
4740                }
4741                sb.append("]");
4742                ret = sb.toString();
4743            }
4744        }
4745        return ret;
4746    }
4747
4748    private static String JavaDoc escapeCharInCharClass(int ch) {
4749        String JavaDoc ret;
4750        switch (ch) {
4751          case '[': case ']': case '-': case '^':
4752          case ',': case '\\':
4753            ret = "\\"+(char)ch;
4754            break;
4755          case '\f': ret = "\\f"; break;
4756          case '\n': ret = "\\n"; break;
4757          case '\r': ret = "\\r"; break;
4758          case '\t': ret = "\\t"; break;
4759          case 0x1b: ret = "\\e"; break;
4760          //case 0x0b: ret = "\\v"; break;
4761
default:
4762            if (ch < 0x20) {
4763                String JavaDoc pre = "0"+Integer.toHexString(ch);
4764                ret = "\\x"+pre.substring(pre.length()-2, pre.length());
4765            } else if (ch >= 0x10000) {
4766                String JavaDoc pre = "0"+Integer.toHexString(ch);
4767                ret = "\\v"+pre.substring(pre.length()-6, pre.length());
4768            } else
4769                ret = ""+(char)ch;
4770        }
4771        return ret;
4772    }
4773
4774}
4775
4776  static class RegexParser {
4777      static final int T_CHAR = 0;
4778      static final int T_EOF = 1;
4779      static final int T_OR = 2; // '|'
4780
static final int T_STAR = 3; // '*'
4781
static final int T_PLUS = 4; // '+'
4782
static final int T_QUESTION = 5; // '?'
4783
static final int T_LPAREN = 6; // '('
4784
static final int T_RPAREN = 7; // ')'
4785
static final int T_DOT = 8; // '.'
4786
static final int T_LBRACKET = 9; // '['
4787
static final int T_BACKSOLIDUS = 10; // '\'
4788
static final int T_CARET = 11; // '^'
4789
static final int T_DOLLAR = 12; // '$'
4790
static final int T_LPAREN2 = 13; // '(?:'
4791
static final int T_LOOKAHEAD = 14; // '(?='
4792
static final int T_NEGATIVELOOKAHEAD = 15; // '(?!'
4793
static final int T_LOOKBEHIND = 16; // '(?<='
4794
static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!'
4795
static final int T_INDEPENDENT = 18; // '(?>'
4796
static final int T_SET_OPERATIONS = 19; // '(?['
4797
static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class
4798
static final int T_COMMENT = 21; // '(?#'
4799
static final int T_MODIFIERS = 22; // '(?' [\-,a-z,A-Z]
4800
static final int T_CONDITION = 23; // '(?('
4801
static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class
4802

4803      static class ReferencePosition {
4804          int refNumber;
4805          int position;
4806          ReferencePosition(int n, int pos) {
4807              this.refNumber = n;
4808              this.position = pos;
4809          }
4810      }
4811
4812      int offset;
4813      String JavaDoc regex;
4814      int regexlen;
4815      int options;
4816      ResourceBundle JavaDoc resources;
4817      int chardata;
4818      int nexttoken;
4819      static protected final int S_NORMAL = 0;
4820      static protected final int S_INBRACKETS = 1;
4821      static protected final int S_INXBRACKETS = 2;
4822      int context = S_NORMAL;
4823      int parennumber = 1;
4824      boolean hasBackReferences;
4825      Vector JavaDoc references = null;
4826
4827      public RegexParser() {
4828          //this.setLocale(Locale.getDefault());
4829
}
4830      public RegexParser(Locale JavaDoc locale) {
4831          //this.setLocale(locale);
4832
}
4833
4834      public void setLocale(Locale JavaDoc locale) {
4835      }
4836
4837      final ParseException ex(String JavaDoc key, int loc) {
4838          return new ParseException(EcorePlugin.INSTANCE.getString(key), loc);
4839      }
4840
4841      private final boolean isSet(int flag) {
4842          return (this.options & flag) == flag;
4843      }
4844
4845      synchronized Token parse(String JavaDoc regex, int options) throws ParseException {
4846          this.options = options;
4847          this.offset = 0;
4848          this.setContext(S_NORMAL);
4849          this.parennumber = 1;
4850          this.hasBackReferences = false;
4851          this.regex = regex;
4852          if (this.isSet(RegularExpression.EXTENDED_COMMENT))
4853              this.regex = REUtil.stripExtendedComment(this.regex);
4854          this.regexlen = this.regex.length();
4855
4856
4857          this.next();
4858          Token ret = this.parseRegex();
4859          if (this.offset != this.regexlen)
4860              throw ex("parser.parse.1", this.offset);
4861          if (this.references != null) {
4862              for (int i = 0; i < this.references.size(); i ++) {
4863                  ReferencePosition position = (ReferencePosition)this.references.elementAt(i);
4864                  if (this.parennumber <= position.refNumber)
4865                      throw ex("parser.parse.2", position.position);
4866              }
4867              this.references.removeAllElements();
4868          }
4869          return ret;
4870      }
4871
4872      /*
4873      public RegularExpression createRegex(String regex, int options) throws ParseException {
4874          Token tok = this.parse(regex, options);
4875          return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options);
4876      }
4877      */

4878
4879      protected final void setContext(int con) {
4880          this.context = con;
4881      }
4882
4883      final int read() {
4884          return this.nexttoken;
4885      }
4886
4887      final void next() {
4888          if (this.offset >= this.regexlen) {
4889              this.chardata = -1;
4890              this.nexttoken = T_EOF;
4891              return;
4892          }
4893
4894          int ret;
4895          int ch = this.regex.charAt(this.offset++);
4896          this.chardata = ch;
4897
4898          if (this.context == S_INBRACKETS) {
4899              // In a character class, this.chardata has one character, that is to say,
4900
// a pair of surrogates is composed and stored to this.chardata.
4901
switch (ch) {
4902                case '\\':
4903                  ret = T_BACKSOLIDUS;
4904                  if (this.offset >= this.regexlen)
4905                      throw ex("parser.next.1", this.offset-1);
4906                  this.chardata = this.regex.charAt(this.offset++);
4907                  break;
4908
4909                case '-':
4910                  if (this.isSet(RegularExpression.XMLSCHEMA_MODE)
4911                      && this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') {
4912                      this.offset++;
4913                      ret = T_XMLSCHEMA_CC_SUBTRACTION;
4914                  } else
4915                      ret = T_CHAR;
4916                  break;
4917
4918                case '[':
4919                  if (!this.isSet(RegularExpression.XMLSCHEMA_MODE)
4920                      && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') {
4921                      this.offset++;
4922                      ret = T_POSIX_CHARCLASS_START;
4923                      break;
4924                  } // Through down
4925
default:
4926                  if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) {
4927                      int low = this.regex.charAt(this.offset);
4928                      if (REUtil.isLowSurrogate(low)) {
4929                          this.chardata = REUtil.composeFromSurrogates(ch, low);
4930                          this.offset ++;
4931                      }
4932                  }
4933                  ret = T_CHAR;
4934              }
4935              this.nexttoken = ret;
4936              return;
4937          }
4938
4939          switch (ch) {
4940            case '|': ret = T_OR; break;
4941            case '*': ret = T_STAR; break;
4942            case '+': ret = T_PLUS; break;
4943            case '?': ret = T_QUESTION; break;
4944            case ')': ret = T_RPAREN; break;
4945            case '.': ret = T_DOT; break;
4946            case '[': ret = T_LBRACKET; break;
4947            case '^': ret = T_CARET; break;
4948            case '$': ret = T_DOLLAR; break;
4949            case '(':
4950              ret = T_LPAREN;
4951              if (this.offset >= this.regexlen)
4952                  break;
4953              if (this.regex.charAt(this.offset) != '?')
4954                  break;
4955              if (++this.offset >= this.regexlen)
4956                  throw ex("parser.next.2", this.offset-1);
4957              ch = this.regex.charAt(this.offset++);
4958              switch (ch) {
4959                case ':': ret = T_LPAREN2; break;
4960                case '=': ret = T_LOOKAHEAD; break;
4961                case '!': ret = T_NEGATIVELOOKAHEAD; break;
4962                case '[': ret = T_SET_OPERATIONS; break;
4963                case '>': ret = T_INDEPENDENT; break;
4964                case '<':
4965                  if (this.offset >= this.regexlen)
4966                      throw ex("parser.next.2", this.offset-3);
4967                  ch = this.regex.charAt(this.offset++);
4968                  if (ch == '=') {
4969                      ret = T_LOOKBEHIND;
4970                  } else if (ch == '!') {
4971                      ret = T_NEGATIVELOOKBEHIND;
4972                  } else
4973                      throw ex("parser.next.3", this.offset-3);
4974                  break;
4975                case '#':
4976                  while (this.offset < this.regexlen) {
4977                      ch = this.regex.charAt(this.offset++);
4978                      if (ch == ')') break;
4979                  }
4980                  if (ch != ')')
4981                      throw ex("parser.next.4", this.offset-1);
4982                  ret = T_COMMENT;
4983                  break;
4984                default:
4985                  if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') {// Options
4986
this.offset --;
4987                      ret = T_MODIFIERS;
4988                      break;
4989                  } else if (ch == '(') { // conditional
4990
ret = T_CONDITION; // this.offsets points the next of '('.
4991
break;
4992                  }
4993                  throw ex("parser.next.2", this.offset-2);
4994              }
4995              break;
4996              
4997            case '\\':
4998              ret = T_BACKSOLIDUS;
4999              if (this.offset >= this.regexlen)
5000                  throw ex("parser.next.1", this.offset-1);
5001              this.chardata = this.regex.charAt(this.offset++);
5002              break;
5003
5004            default:
5005              ret = T_CHAR;
5006          }
5007          this.nexttoken = ret;
5008      }
5009
5010      /**
5011       * regex ::= term (`|` term)*
5012       * term ::= factor+
5013       * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
5014       * | atom (('*' | '+' | '?' | minmax ) '?'? )?)
5015       * | '(?=' regex ')' | '(?!' regex ')' | '(?&lt;=' regex ')' | '(?&lt;!' regex ')'
5016       * atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
5017       * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
5018       */

5019      Token parseRegex() throws ParseException {
5020          Token tok = this.parseTerm();
5021          Token parent = null;
5022          while (this.read() == T_OR) {
5023              this.next(); // '|'
5024
if (parent == null) {
5025                  parent = Token.createUnion();
5026                  parent.addChild(tok);
5027                  tok = parent;
5028              }
5029              tok.addChild(this.parseTerm());
5030          }
5031          return tok;
5032      }
5033
5034      /**
5035       * term ::= factor+
5036       */

5037      Token parseTerm() throws ParseException {
5038          int ch = this.read();
5039          if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) {
5040              return Token.createEmpty();
5041          } else {
5042              Token tok = this.parseFactor();
5043              Token concat = null;
5044              while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) {
5045                  if (concat == null) {
5046                      concat = Token.createConcat();
5047                      concat.addChild(tok);
5048                      tok = concat;
5049                  }
5050                  concat.addChild(this.parseFactor());
5051                  //tok = Token.createConcat(tok, this.parseFactor());
5052
}
5053              return tok;
5054          }
5055      }
5056
5057      // ----------------------------------------------------------------
5058

5059      Token processCaret() throws ParseException {
5060          this.next();
5061          return Token.token_linebeginning;
5062      }
5063      Token processDollar() throws ParseException {
5064          this.next();
5065          return Token.token_lineend;
5066      }
5067      Token processLookahead() throws ParseException {
5068          this.next();
5069          Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex());
5070          if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
5071          this.next(); // ')'
5072
return tok;
5073      }
5074      Token processNegativelookahead() throws ParseException {
5075          this.next();
5076          Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex());
5077          if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
5078          this.next(); // ')'
5079
return tok;
5080      }
5081      Token processLookbehind() throws ParseException {
5082          this.next();
5083          Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex());
5084          if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
5085          this.next(); // ')'
5086
return tok;
5087      }
5088      Token processNegativelookbehind() throws ParseException {
5089          this.next();
5090          Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex());
5091          if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
5092          this.next(); // ')'
5093
return tok;
5094      }
5095      Token processBacksolidus_A() throws ParseException {
5096          this.next();
5097          return Token.token_stringbeginning;
5098      }
5099      Token processBacksolidus_Z() throws ParseException {
5100          this.next();
5101          return Token.token_stringend2;
5102      }
5103      Token processBacksolidus_z() throws ParseException {
5104          this.next();
5105          return Token.token_stringend;
5106      }
5107      Token processBacksolidus_b() throws ParseException {
5108          this.next();
5109          return Token.token_wordedge;
5110      }
5111      Token processBacksolidus_B() throws ParseException {
5112          this.next();
5113          return Token.token_not_wordedge;
5114      }
5115      Token processBacksolidus_lt() throws ParseException {
5116          this.next();
5117          return Token.token_wordbeginning;
5118      }
5119      Token processBacksolidus_gt() throws ParseException {
5120          this.next();
5121          return Token.token_wordend;
5122      }
5123      Token processStar(Token tok) throws ParseException {
5124          this.next();
5125          if (this.read() == T_QUESTION) {
5126              this.next();
5127              return Token.createNGClosure(tok);
5128          } else
5129              return Token.createClosure(tok);
5130      }
5131      Token processPlus(Token tok) throws ParseException {
5132          // X+ -> XX*
5133
this.next();
5134          if (this.read() == T_QUESTION) {
5135              this.next();
5136              return Token.createConcat(tok, Token.createNGClosure(tok));
5137          } else
5138              return Token.createConcat(tok, Token.createClosure(tok));
5139      }
5140      Token processQuestion(Token tok) throws ParseException {
5141          // X? -> X|
5142
this.next();
5143          Token par = Token.createUnion();
5144          if (this.read() == T_QUESTION) {
5145              this.next();
5146              par.addChild(Token.createEmpty());
5147              par.addChild(tok);
5148          } else {
5149              par.addChild(tok);
5150              par.addChild(Token.createEmpty());
5151          }
5152          return par;
5153      }
5154      boolean checkQuestion(int off) {
5155          return off < this.regexlen && this.regex.charAt(off) == '?';
5156      }
5157      Token processParen() throws ParseException {
5158          this.next();
5159          int p = this.parennumber++;
5160          Token tok = Token.createParen(this.parseRegex(), p);
5161          if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
5162          this.next(); // Skips ')'
5163
return tok;
5164      }
5165      Token processParen2() throws ParseException {
5166          this.next();
5167          Token tok = Token.createParen(this.parseRegex(), 0);
5168          if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
5169          this.next(); // Skips ')'
5170
return tok;
5171      }
5172      Token processCondition() throws ParseException {
5173                                                  // this.offset points the next of '('
5174
if (this.offset+1 >= this.regexlen) throw ex("parser.factor.4", this.offset);
5175                                                  // Parses a condition.
5176
int refno = -1;
5177          Token condition = null;
5178          int ch = this.regex.charAt(this.offset);
5179          if ('1' <= ch && ch <= '9') {
5180              refno = ch-'0';
5181              this.hasBackReferences = true;
5182              if (this.references == null) this.references = new Vector JavaDoc();
5183              this.references.addElement(new ReferencePosition(refno, this.offset));
5184              this.offset ++;
5185              if (this.regex.charAt(this.offset) != ')') throw ex("parser.factor.1", this.offset);
5186              this.offset ++;
5187          } else {
5188              if (ch == '?') this.offset --; // Points '('.
5189
this.next();
5190              condition = this.parseFactor();
5191              switch (condition.type) {
5192                case Token.LOOKAHEAD:
5193                case Token.NEGATIVELOOKAHEAD:
5194                case Token.LOOKBEHIND:
5195                case Token.NEGATIVELOOKBEHIND:
5196                  break;
5197                case Token.ANCHOR:
5198                  if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
5199                  break;
5200                default:
5201                  throw ex("parser.factor.5", this.offset);
5202              }
5203          }
5204                                                  // Parses yes/no-patterns.
5205
this.next();
5206          Token yesPattern = this.parseRegex();
5207          Token noPattern = null;
5208          if (yesPattern.type == Token.UNION) {
5209              if (yesPattern.size() != 2) throw ex("parser.factor.6", this.offset);
5210              noPattern = yesPattern.getChild(1);
5211              yesPattern = yesPattern.getChild(0);
5212          }
5213          if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
5214          this.next();
5215          return Token.createCondition(refno, condition, yesPattern, noPattern);
5216      }
5217      Token processModifiers() throws ParseException {
5218                                                  // this.offset points the next of '?'.
5219
// modifiers ::= [imsw]* ('-' [imsw]*)? ':'
5220
int add = 0, mask = 0, ch = -1;
5221          while (this.offset < this.regexlen) {
5222              ch = this.regex.charAt(this.offset);
5223              int v = REUtil.getOptionValue(ch);
5224              if (v == 0) break; // '-' or ':'?
5225
add |= v;
5226              this.offset ++;
5227          }
5228          if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1);
5229          if (ch == '-') {
5230              this.offset ++;
5231              while (this.offset < this.regexlen) {
5232                  ch = this.regex.charAt(this.offset);
5233                  int v = REUtil.getOptionValue(ch);
5234                  if (v == 0) break; // ':'?
5235
mask |= v;
5236                  this.offset ++;
5237              }
5238              if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1);
5239          }
5240          Token tok;
5241          if (ch == ':') {
5242              this.offset ++;
5243              this.next();
5244              tok = Token.createModifierGroup(this.parseRegex(), add, mask);
5245              if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
5246              this.next();
5247          } else if (ch == ')') { // such as (?-i)
5248
this.offset ++;
5249              this.next();
5250              tok = Token.createModifierGroup(this.parseRegex(), add, mask);
5251          } else
5252              throw ex("parser.factor.3", this.offset);
5253
5254          return tok;
5255      }
5256      Token processIndependent() throws ParseException {
5257          this.next();
5258          Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex());
5259          if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
5260          this.next(); // Skips ')'
5261
return tok;
5262      }
5263      Token processBacksolidus_c() throws ParseException {
5264          int ch2; // Must be in 0x0040-0x005f
5265
if (this.offset >= this.regexlen
5266              || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040)
5267              throw ex("parser.atom.1", this.offset-1);
5268          this.next();
5269          return Token.createChar(ch2-0x40);
5270      }
5271      Token processBacksolidus_C() throws ParseException {
5272          throw ex("parser.process.1", this.offset);
5273      }
5274      Token processBacksolidus_i() throws ParseException {
5275          Token tok = Token.createChar('i');
5276          this.next();
5277          return tok;
5278      }
5279      Token processBacksolidus_I() throws ParseException {
5280          throw ex("parser.process.1", this.offset);
5281      }
5282      Token processBacksolidus_g() throws ParseException {
5283          this.next();
5284          return Token.getGraphemePattern();
5285      }
5286      Token processBacksolidus_X() throws ParseException {
5287          this.next();
5288          return Token.getCombiningCharacterSequence();
5289      }
5290      Token processBackreference() throws ParseException {
5291          int refnum = this.chardata-'0';
5292          Token tok = Token.createBackReference(refnum);
5293          this.hasBackReferences = true;
5294          if (this.references == null) this.references = new Vector JavaDoc();
5295          this.references.addElement(new ReferencePosition(refnum, this.offset-2));
5296          this.next();
5297          return tok;
5298      }
5299
5300      // ----------------------------------------------------------------
5301

5302      /**
5303       * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
5304       * | atom (('*' | '+' | '?' | minmax ) '?'? )?)
5305       * | '(?=' regex ')' | '(?!' regex ')' | '(?&lt;=' regex ')' | '(?&lt;!' regex ')'
5306       * | '(?#' [^)]* ')'
5307       * minmax ::= '{' min (',' max?)? '}'
5308       * min ::= [0-9]+
5309       * max ::= [0-9]+
5310       */

5311      Token parseFactor() throws ParseException {
5312          int ch = this.read();
5313          Token tok;
5314          switch (ch) {
5315            case T_CARET: return this.processCaret();
5316            case T_DOLLAR: return this.processDollar();
5317            case T_LOOKAHEAD: return this.processLookahead();
5318            case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead();
5319            case T_LOOKBEHIND: return this.processLookbehind();
5320            case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind();
5321
5322            case T_COMMENT:
5323              this.next();
5324              return Token.createEmpty();
5325
5326            case T_BACKSOLIDUS:
5327              switch (this.chardata) {
5328                case 'A': return this.processBacksolidus_A();
5329                case 'Z': return this.processBacksolidus_Z();
5330                case 'z': return this.processBacksolidus_z();
5331                case 'b': return this.processBacksolidus_b();
5332                case 'B': return this.processBacksolidus_B();
5333                case '<': return this.processBacksolidus_lt();
5334                case '>': return this.processBacksolidus_gt();
5335              }
5336                                                  // through down
5337
}
5338          tok = this.parseAtom();
5339          ch = this.read();
5340          switch (ch) {
5341            case T_STAR: return this.processStar(tok);
5342            case T_PLUS: return this.processPlus(tok);
5343            case T_QUESTION: return this.processQuestion(tok);
5344            case T_CHAR:
5345              if (this.chardata == '{' && this.offset < this.regexlen) {
5346
5347                  int off = this.offset; // this.offset -> next of '{'
5348
int min = 0, max = -1;
5349
5350                  if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
5351
5352                      min = ch -'0';
5353                      while (off < this.regexlen
5354                             && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
5355                          min = min*10 +ch-'0';
5356                          if (min < 0)
5357                              throw ex("parser.quantifier.5", this.offset);
5358                      }
5359                  }
5360                  else {
5361                      throw ex("parser.quantifier.1", this.offset);
5362                  }
5363
5364                  max = min;
5365                  if (ch == ',') {
5366
5367                     if (off >= this.regexlen) {
5368                         throw ex("parser.quantifier.3", this.offset);
5369                     }
5370                     else if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
5371
5372                          max = ch -'0'; // {min,max}
5373
while (off < this.regexlen
5374                                 && (ch = this.regex.charAt(off++)) >= '0'
5375                                 && ch <= '9') {
5376                              max = max*10 +ch-'0';
5377                              if (max < 0)
5378                                  throw ex("parser.quantifier.5", this.offset);
5379                          }
5380
5381                          if (min > max)
5382                              throw ex("parser.quantifier.4", this.offset);
5383                     }
5384                     else { // assume {min,}
5385
max = -1;
5386                      }
5387                  }
5388
5389                 if (ch != '}')
5390                     throw ex("parser.quantifier.2", this.offset);
5391
5392                 if (this.checkQuestion(off)) { // off -> next of '}'
5393
tok = Token.createNGClosure(tok);
5394                      this.offset = off+1;
5395                  } else {
5396                      tok = Token.createClosure(tok);
5397                      this.offset = off;
5398                  }
5399
5400                  tok.setMin(min);
5401                  tok.setMax(max);
5402                  //System.err.println("CLOSURE: "+min+", "+max);
5403
this.next();
5404              }
5405          }
5406          return tok;
5407      }
5408
5409      /**
5410       * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
5411       * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
5412       * | '(?>' regex ')'
5413       * char ::= '\\' | '\' [efnrt] | bmp-code | character-1
5414       */

5415      Token parseAtom() throws ParseException {
5416          int ch = this.read();
5417          Token tok = null;
5418          switch (ch) {
5419            case T_LPAREN: return this.processParen();
5420            case T_LPAREN2: return this.processParen2(); // '(?:'
5421
case T_CONDITION: return this.processCondition(); // '(?('
5422
case T_MODIFIERS: return this.processModifiers(); // (?modifiers ... )
5423
case T_INDEPENDENT: return this.processIndependent();
5424            case T_DOT:
5425              this.next(); // Skips '.'
5426
tok = Token.token_dot;
5427              break;
5428
5429              /**
5430               * char-class ::= '[' ( '^'? range ','?)+ ']'
5431               * range ::= '\d' | '\w' | '\s' | category-block | range-char
5432               * | range-char '-' range-char
5433               * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
5434               * bmp-char ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
5435               */

5436            case T_LBRACKET: return this.parseCharacterClass(true);
5437            case T_SET_OPERATIONS: return this.parseSetOperations();
5438
5439            case T_BACKSOLIDUS:
5440              switch (this.chardata) {
5441                case 'd': case 'D':
5442                case 'w': case 'W':
5443                case 's': case 'S':
5444                  tok = this.getTokenForShorthand(this.chardata);
5445                  this.next();
5446                  return tok;
5447
5448                case 'e': case 'f': case 'n': case 'r':
5449                case 't': case 'u': case 'v': case 'x':
5450                  {
5451                      int ch2 = this.decodeEscaped();
5452                      if (ch2 < 0x10000) {
5453                          tok = Token.createChar(ch2);
5454                      } else {
5455                          tok = Token.createString(REUtil.decomposeToSurrogates(ch2));
5456                      }
5457                  }
5458                  break;
5459
5460                case 'c': return this.processBacksolidus_c();
5461                case 'C': return this.processBacksolidus_C();
5462                case 'i': return this.processBacksolidus_i();
5463                case 'I': return this.processBacksolidus_I();
5464                case 'g': return this.processBacksolidus_g();
5465                case 'X': return this.processBacksolidus_X();
5466                case '1': case '2': case '3': case '4':
5467                case '5': case '6': case '7': case '8': case '9':
5468                  return this.processBackreference();
5469
5470                case 'P':
5471                case 'p':
5472                  int pstart = this.offset;
5473                  tok = processBacksolidus_pP(this.chardata);
5474                  if (tok == null) throw this.ex("parser.atom.5", pstart);
5475                  break;
5476
5477                default:
5478                  tok = Token.createChar(this.chardata);
5479              }
5480              this.next();
5481              break;
5482
5483            case T_CHAR:
5484              if (this.chardata == ']' || this.chardata == '{' || this.chardata == '}')
5485                  throw this.ex("parser.atom.4", this.offset-1);
5486              tok = Token.createChar(this.chardata);
5487              int high = this.chardata;
5488              this.next();
5489              if (REUtil.isHighSurrogate(high)
5490                  && this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) {
5491                  char[] sur = new char[2];
5492                  sur[0] = (char)high;
5493                  sur[1] = (char)this.chardata;
5494                  tok = Token.createParen(Token.createString(new String JavaDoc(sur)), 0);
5495                  this.next();
5496              }
5497              break;
5498
5499            default:
5500              throw this.ex("parser.atom.4", this.offset-1);
5501          }
5502          return tok;
5503      }
5504
5505      protected RangeToken processBacksolidus_pP(int c) throws ParseException {
5506
5507          this.next();
5508          if (this.read() != T_CHAR || this.chardata != '{')
5509              throw this.ex("parser.atom.2", this.offset-1);
5510
5511          // handle category escape
5512
boolean positive = c == 'p';
5513          int namestart = this.offset;
5514          int nameend = this.regex.indexOf('}', namestart);
5515
5516          if (nameend < 0)
5517              throw this.ex("parser.atom.3", this.offset);
5518
5519          String JavaDoc pname = this.regex.substring(namestart, nameend);
5520          this.offset = nameend+1;
5521
5522          return Token.getRange(pname, positive, this.isSet(RegularExpression.XMLSCHEMA_MODE));
5523      }
5524
5525      int processCIinCharacterClass(RangeToken tok, int c) {
5526          return this.decodeEscaped();
5527      }
5528
5529      /**
5530       * char-class ::= '[' ( '^'? range ','?)+ ']'
5531       * range ::= '\d' | '\w' | '\s' | category-block | range-char
5532       * | range-char '-' range-char
5533       * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
5534       * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
5535       */

5536      protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
5537          this.setContext(S_INBRACKETS);
5538          this.next(); // '['
5539
boolean nrange = false;
5540          RangeToken base = null;
5541          RangeToken tok;
5542          if (this.read() == T_CHAR && this.chardata == '^') {
5543              nrange = true;
5544              this.next(); // '^'
5545
if (useNrange) {
5546                  tok = Token.createNRange();
5547              } else {
5548                  base = Token.createRange();
5549                  base.addRange(0, Token.UTF16_MAX);
5550                  tok = Token.createRange();
5551              }
5552          } else {
5553              tok = Token.createRange();
5554          }
5555          int type;
5556          boolean firstloop = true;
5557          while ((type = this.read()) != T_EOF) {
5558              if (type == T_CHAR && this.chardata == ']' && !firstloop)
5559                  break;
5560              firstloop = false;
5561              int c = this.chardata;
5562              boolean end = false;
5563              if (type == T_BACKSOLIDUS) {
5564                  switch (c) {
5565                    case 'd': case 'D':
5566                    case 'w': case 'W':
5567                    case 's': case 'S':
5568                      tok.mergeRanges(this.getTokenForShorthand(c));
5569                      end = true;
5570                      break;
5571
5572                    case 'i': case 'I':
5573                    case 'c': case 'C':
5574                      c = this.processCIinCharacterClass(tok, c);
5575                      if (c < 0) end = true;
5576                      break;
5577                      
5578                    case 'p':
5579                    case 'P':
5580                      int pstart = this.offset;
5581                      RangeToken tok2 = this.processBacksolidus_pP(c);
5582                      if (tok2 == null) throw this.ex("parser.atom.5", pstart);
5583                      tok.mergeRanges(tok2);
5584                      end = true;
5585                      break;
5586
5587                    default:
5588                      c = this.decodeEscaped();
5589                  } // \ + c
5590
} // backsolidus
5591
// POSIX Character class such as [:alnum:]
5592
else if (type == T_POSIX_CHARCLASS_START) {
5593                  int nameend = this.regex.indexOf(':', this.offset);
5594                  if (nameend < 0) throw this.ex("parser.cc.1", this.offset);
5595                  boolean positive = true;
5596                  if (this.regex.charAt(this.offset) == '^') {
5597                      this.offset ++;
5598                      positive = false;
5599                  }
5600                  String JavaDoc name = this.regex.substring(this.offset, nameend);
5601                  RangeToken range = Token.getRange(name, positive,
5602                                                    this.isSet(RegularExpression.XMLSCHEMA_MODE));
5603                  if (range == null) throw this.ex("parser.cc.3", this.offset);
5604                  tok.mergeRanges(range);
5605                  end = true;
5606                  if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']')
5607                      throw this.ex("parser.cc.1", nameend);
5608                  this.offset = nameend+2;
5609              }
5610              this.next();
5611              if (!end) { // if not shorthands...
5612
if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'.
5613
tok.addRange(c, c);
5614                  } else {
5615                      this.next(); // Skips '-'
5616
if ((type = this.read()) == T_EOF) throw this.ex("parser.cc.2", this.offset);
5617                      if (type == T_CHAR && this.chardata == ']') {
5618                          tok.addRange(c, c);
5619                          tok.addRange('-', '-');
5620                      } else {
5621                          int rangeend = this.chardata;
5622                          if (type == T_BACKSOLIDUS)
5623                              rangeend = this.decodeEscaped();
5624                          this.next();
5625                          tok.addRange(c, rangeend);
5626                      }
5627                  }
5628              }
5629              if (this.isSet(RegularExpression.SPECIAL_COMMA)
5630                  && this.read() == T_CHAR && this.chardata == ',')
5631                  this.next();
5632          }
5633          if (this.read() == T_EOF)
5634              throw this.ex("parser.cc.2", this.offset);
5635          if (!useNrange && nrange) {
5636              base.subtractRanges(tok);
5637              tok = base;
5638          }
5639          tok.sortRanges();
5640          tok.compactRanges();
5641          //tok.dumpRanges();
5642
/*
5643          if (this.isSet(RegularExpression.IGNORE_CASE))
5644              tok = RangeToken.createCaseInsensitiveToken(tok);
5645          */

5646          this.setContext(S_NORMAL);
5647          this.next(); // Skips ']'
5648

5649          return tok;
5650      }
5651
5652      /**
5653       * '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')'
5654       */

5655      protected RangeToken parseSetOperations() throws ParseException {
5656          RangeToken tok = this.parseCharacterClass(false);
5657          int type;
5658          while ((type = this.read()) != T_RPAREN) {
5659              int ch = this.chardata;
5660              if (type == T_CHAR && (ch == '-' || ch == '&')
5661                  || type == T_PLUS) {
5662                  this.next();
5663                  if (this.read() != T_LBRACKET) throw ex("parser.ope.1", this.offset-1);
5664                  RangeToken t2 = this.parseCharacterClass(false);
5665                  if (type == T_PLUS)
5666                      tok.mergeRanges(t2);
5667                  else if (ch == '-')
5668                      tok.subtractRanges(t2);
5669                  else if (ch == '&')
5670                      tok.intersectRanges(t2);
5671                  else
5672                      throw new RuntimeException JavaDoc("ASSERT");
5673              } else {
5674                  throw ex("parser.ope.2", this.offset-1);
5675              }
5676          }
5677          this.next();
5678          return tok;
5679      }
5680
5681      Token getTokenForShorthand(int ch) {
5682          Token tok;
5683          switch (ch) {
5684            case 'd':
5685              tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
5686                  ? Token.getRange("Nd", true) : Token.token_0to9;
5687              break;
5688            case 'D':
5689              tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
5690                  ? Token.getRange("Nd", false) : Token.token_not_0to9;
5691              break;
5692            case 'w':
5693              tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
5694                  ? Token.getRange("IsWord", true) : Token.token_wordchars;
5695              break;
5696            case 'W':
5697              tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
5698                  ? Token.getRange("IsWord", false) : Token.token_not_wordchars;
5699              break;
5700            case 's':
5701              tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
5702                  ? Token.getRange("IsSpace", true) : Token.token_spaces;
5703              break;
5704            case 'S':
5705              tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
5706                  ? Token.getRange("IsSpace", false) : Token.token_not_spaces;
5707              break;
5708
5709            default:
5710              throw new RuntimeException JavaDoc("Internal Error: shorthands: \\u"+Integer.toString(ch, 16));
5711          }
5712          return tok;
5713      }
5714
5715      /**
5716       */

5717      int decodeEscaped() throws ParseException {
5718          if (this.read() != T_BACKSOLIDUS) throw ex("parser.next.1", this.offset-1);
5719          int c = this.chardata;
5720          switch (c) {
5721            case 'e': c = 0x1b; break; // ESCAPE U+001B
5722
case 'f': c = '\f'; break; // FORM FEED U+000C
5723
case 'n': c = '\n'; break; // LINE FEED U+000A
5724
case 'r': c = '\r'; break; // CRRIAGE RETURN U+000D
5725
case 't': c = '\t'; break; // HORIZONTAL TABULATION U+0009
5726
//case 'v': c = 0x0b; break; // VERTICAL TABULATION U+000B
5727
case 'x':
5728              this.next();
5729              if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1);
5730              if (this.chardata == '{') {
5731                  int v1 = 0;
5732                  int uv = 0;
5733                  do {
5734                      this.next();
5735                      if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1);
5736                      if ((v1 = hexChar(this.chardata)) < 0)
5737                          break;
5738                      if (uv > uv*16) throw ex("parser.descape.2", this.offset-1);
5739                      uv = uv*16+v1;
5740                  } while (true);
5741                  if (this.chardata != '}') throw ex("parser.descape.3", this.offset-1);
5742                  if (uv > Token.UTF16_MAX) throw ex("parser.descape.4", this.offset-1);
5743                  c = uv;
5744              } else {
5745                  int v1 = 0;
5746                  if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
5747                      throw ex("parser.descape.1", this.offset-1);
5748                  int uv = v1;
5749                  this.next();
5750                  if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
5751                      throw ex("parser.descape.1", this.offset-1);
5752                  uv = uv*16+v1;
5753                  c = uv;
5754              }
5755              break;
5756
5757            case 'u':
5758              int v1 = 0;
5759              this.next();
5760              if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
5761                  throw ex("parser.descape.1", this.offset-1);
5762              int uv = v1;
5763              this.next();
5764              if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
5765                  throw ex("parser.descape.1", this.offset-1);
5766              uv = uv*16+v1;
5767              this.next();
5768              if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
5769                  throw ex("parser.descape.1", this.offset-1);
5770              uv = uv*16+v1;
5771              this.next();
5772              if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
5773                  throw ex("parser.descape.1", this.offset-1);
5774              uv = uv*16+v1;
5775              c = uv;
5776              break;
5777
5778            case 'v':
5779              this.next();
5780              if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
5781                  throw ex("parser.descape.1", this.offset-1);
5782              uv = v1;
5783              this.next();
5784              if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
5785                  throw ex("parser.descape.1", this.offset-1);
5786              uv = uv*16+v1;
5787              this.next();
5788              if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
5789                  throw ex("parser.descape.1", this.offset-1);
5790              uv = uv*16+v1;
5791              this.next();
5792              if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
5793                  throw ex("parser.descape.1", this.offset-1);
5794              uv = uv*16+v1;
5795              this.next();
5796              if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
5797                  throw ex("parser.descape.1", this.offset-1);
5798              uv = uv*16+v1;
5799              this.next();
5800              if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
5801                  throw ex("parser.descape.1", this.offset-1);
5802              uv = uv*16+v1;
5803              if (uv > Token.UTF16_MAX) throw ex("parser.descappe.4", this.offset-1);
5804              c = uv;
5805              break;
5806            case 'A':
5807            case 'Z':
5808            case 'z':
5809              throw ex("parser.descape.5", this.offset-2);
5810            default:
5811          }
5812          return c;
5813      }
5814
5815      static private final int hexChar(int ch) {
5816          if (ch < '0') return -1;
5817          if (ch > 'f') return -1;
5818          if (ch <= '9') return ch-'0';
5819          if (ch < 'A') return -1;
5820          if (ch <= 'F') return ch-'A'+10;
5821          if (ch < 'a') return -1;
5822          return ch-'a'+10;
5823      }
5824  }
5825
5826
5827  static class Token implements java.io.Serializable JavaDoc {
5828      static final boolean COUNTTOKENS = true;
5829      static int tokens = 0;
5830
5831      static final int CHAR = 0; // Literal char
5832
static final int DOT = 11; // .
5833
static final int CONCAT = 1; // XY
5834
static final int UNION = 2; // X|Y|Z
5835
static final int CLOSURE = 3; // X*
5836
static final int RANGE = 4; // [a-zA-Z] etc.
5837
static final int NRANGE = 5; // [^a-zA-Z] etc.
5838
static final int PAREN = 6; // (X) or (?:X)
5839
static final int EMPTY = 7; //
5840
static final int ANCHOR = 8; // ^ $ \b \B \< \> \A \Z \z
5841
static final int NONGREEDYCLOSURE = 9; // *? +?
5842
static final int STRING = 10; // strings
5843
static final int BACKREFERENCE = 12; // back references
5844
static final int LOOKAHEAD = 20; // (?=...)
5845
static final int NEGATIVELOOKAHEAD = 21; // (?!...)
5846
static final int LOOKBEHIND = 22; // (?<=...)
5847
static final int NEGATIVELOOKBEHIND = 23; // (?<!...)
5848
static final int INDEPENDENT = 24; // (?>...)
5849
static final int MODIFIERGROUP = 25; // (?ims-ims:...)
5850
static final int CONDITION = 26; // (?(...)yes|no)
5851

5852      static final int UTF16_MAX = 0x10ffff;
5853
5854      int type;
5855
5856      static Token token_dot;
5857      static Token token_0to9;
5858      static Token token_wordchars;
5859      static Token token_not_0to9;
5860      static Token token_not_wordchars;
5861      static Token token_spaces;
5862      static Token token_not_spaces;
5863      static Token token_empty;
5864      static Token token_linebeginning;
5865      static Token token_linebeginning2;
5866      static Token token_lineend;
5867      static Token token_stringbeginning;
5868      static Token token_stringend;
5869      static Token token_stringend2;
5870      static Token token_wordedge;
5871      static Token token_not_wordedge;
5872      static Token token_wordbeginning;
5873      static Token token_wordend;
5874      static {
5875          Token.token_empty = new Token(Token.EMPTY);
5876
5877          Token.token_linebeginning = Token.createAnchor('^');
5878          Token.token_linebeginning2 = Token.createAnchor('@');
5879          Token.token_lineend = Token.createAnchor('$');
5880          Token.token_stringbeginning = Token.createAnchor('A');
5881          Token.token_stringend = Token.createAnchor('z');
5882          Token.token_stringend2 = Token.createAnchor('Z');
5883          Token.token_wordedge = Token.createAnchor('b');
5884          Token.token_not_wordedge = Token.createAnchor('B');
5885          Token.token_wordbeginning = Token.createAnchor('<');
5886          Token.token_wordend = Token.createAnchor('>');
5887
5888          Token.token_dot = new Token(Token.DOT);
5889
5890          Token.token_0to9 = Token.createRange();
5891          Token.token_0to9.addRange('0', '9');
5892          Token.token_wordchars = Token.createRange();
5893          Token.token_wordchars.addRange('0', '9');
5894          Token.token_wordchars.addRange('A', 'Z');
5895          Token.token_wordchars.addRange('_', '_');
5896          Token.token_wordchars.addRange('a', 'z');
5897          Token.token_spaces = Token.createRange();
5898          Token.token_spaces.addRange('\t', '\t');
5899          Token.token_spaces.addRange('\n', '\n');
5900          Token.token_spaces.addRange('\f', '\f');
5901          Token.token_spaces.addRange('\r', '\r');
5902          Token.token_spaces.addRange(' ', ' ');
5903
5904          Token.token_not_0to9 = Token.complementRanges(Token.token_0to9);
5905          Token.token_not_wordchars = Token.complementRanges(Token.token_wordchars);
5906          Token.token_not_spaces = Token.complementRanges(Token.token_spaces);
5907      }
5908
5909      static Token.ParenToken createLook(int type, Token child) {
5910          if (COUNTTOKENS) Token.tokens ++;
5911          return new Token.ParenToken(type, child, 0);
5912      }
5913      static Token.ParenToken createParen(Token child, int pnumber) {
5914          if (COUNTTOKENS) Token.tokens ++;
5915          return new Token.ParenToken(Token.PAREN, child, pnumber);
5916      }
5917      static Token.ClosureToken createClosure(Token tok) {
5918          if (COUNTTOKENS) Token.tokens ++;
5919          return new Token.ClosureToken(Token.CLOSURE, tok);
5920      }
5921      static Token.ClosureToken createNGClosure(Token tok) {
5922          if (COUNTTOKENS) Token.tokens ++;
5923          return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok);
5924      }
5925      static Token.ConcatToken createConcat(Token tok1, Token tok2) {
5926          if (COUNTTOKENS) Token.tokens ++;
5927          return new Token.ConcatToken(tok1, tok2);
5928      }
5929      static Token.UnionToken createConcat() {
5930          if (COUNTTOKENS) Token.tokens ++;
5931          return new Token.UnionToken(Token.CONCAT); // *** It is not a bug.
5932
}
5933      static Token.UnionToken createUnion() {
5934          if (COUNTTOKENS) Token.tokens ++;
5935          return new Token.UnionToken(Token.UNION);
5936      }
5937      static Token createEmpty() {
5938          return Token.token_empty;
5939      }
5940      static RangeToken createRange() {
5941          if (COUNTTOKENS) Token.tokens ++;
5942          return new RangeToken(Token.RANGE);
5943      }
5944      static RangeToken createNRange() {
5945          if (COUNTTOKENS) Token.tokens ++;
5946          return new RangeToken(Token.NRANGE);
5947      }
5948      static Token.CharToken createChar(int ch) {
5949          if (COUNTTOKENS) Token.tokens ++;
5950          return new Token.CharToken(Token.CHAR, ch);
5951      }
5952      static private Token.CharToken createAnchor(int ch) {
5953          if (COUNTTOKENS) Token.tokens ++;
5954          return new Token.CharToken(Token.ANCHOR, ch);
5955      }
5956      static Token.StringToken createBackReference(int refno) {
5957          if (COUNTTOKENS) Token.tokens ++;
5958          return new Token.StringToken(Token.BACKREFERENCE, null, refno);
5959      }
5960      static Token.StringToken createString(String JavaDoc str) {
5961          if (COUNTTOKENS) Token.tokens ++;
5962          return new Token.StringToken(Token.STRING, str, 0);
5963      }
5964      static Token.ModifierToken createModifierGroup(Token child, int add, int mask) {
5965          if (COUNTTOKENS) Token.tokens ++;
5966          return new Token.ModifierToken(child, add, mask);
5967      }
5968      static Token.ConditionToken createCondition(int refno, Token condition,
5969                                                  Token yespat, Token nopat) {
5970          if (COUNTTOKENS) Token.tokens ++;
5971          return new Token.ConditionToken(refno, condition, yespat, nopat);
5972      }
5973
5974      protected Token(int type) {
5975          this.type = type;
5976      }
5977
5978      /**
5979       * A number of children.
5980       */

5981      int size() {
5982          return 0;
5983      }
5984      Token getChild(int index) {
5985          return null;
5986      }
5987      void addChild(Token tok) {
5988          throw new RuntimeException JavaDoc("Not supported.");
5989      }
5990
5991                                                  // for RANGE or NRANGE
5992
protected void addRange(int start, int end) {
5993          throw new RuntimeException JavaDoc("Not supported.");
5994      }
5995      protected void sortRanges() {
5996          throw new RuntimeException JavaDoc("Not supported.");
5997      }
5998      protected void compactRanges() {
5999          throw new RuntimeException JavaDoc("Not supported.");
6000      }
6001      protected void mergeRanges(Token tok) {
6002          throw new RuntimeException JavaDoc("Not supported.");
6003      }
6004      protected void subtractRanges(Token tok) {
6005          throw new RuntimeException JavaDoc("Not supported.");
6006      }
6007      protected void intersectRanges(Token tok) {
6008          throw new RuntimeException JavaDoc("Not supported.");
6009      }
6010      static Token complementRanges(Token tok) {
6011          return RangeToken.complementRanges(tok);
6012      }
6013
6014
6015      void setMin(int min) { // for CLOSURE
6016
}
6017      void setMax(int max) { // for CLOSURE
6018
}
6019      int getMin() { // for CLOSURE
6020
return -1;
6021      }
6022      int getMax() { // for CLOSURE
6023
return -1;
6024      }
6025      int getReferenceNumber() { // for STRING
6026
return 0;
6027      }
6028      String JavaDoc getString() { // for STRING
6029
return null;
6030      }
6031
6032      int getParenNumber() {
6033          return 0;
6034      }
6035      int getChar() {
6036          return -1;
6037      }
6038
6039      public String JavaDoc toString() {
6040          return this.toString(0);
6041      }
6042      public String JavaDoc toString(int options) {
6043          return this.type == Token.DOT ? "." : "";
6044      }
6045
6046      /**
6047       * How many characters are needed?
6048       */

6049      final int getMinLength() {
6050          switch (this.type) {
6051            case CONCAT:
6052              int sum = 0;
6053              for (int i = 0; i < this.size(); i ++)
6054                  sum += this.getChild(i).getMinLength();
6055              return sum;
6056
6057            case CONDITION:
6058            case UNION:
6059              if (this.size() == 0)
6060                  return 0;
6061              int ret = this.getChild(0).getMinLength();
6062              for (int i = 1; i < this.size(); i ++) {
6063                  int min = this.getChild(i).getMinLength();
6064                  if (min < ret) ret = min;
6065              }
6066              return ret;
6067
6068            case CLOSURE:
6069            case NONGREEDYCLOSURE:
6070              if (this.getMin() >= 0)
6071                  return this.getMin() * this.getChild(0).getMinLength();
6072              return 0;
6073
6074            case EMPTY:
6075            case ANCHOR:
6076              return 0;
6077
6078            case DOT:
6079            case CHAR:
6080            case RANGE:
6081            case NRANGE:
6082              return 1;
6083
6084            case INDEPENDENT:
6085            case PAREN:
6086            case MODIFIERGROUP:
6087              return this.getChild(0).getMinLength();
6088
6089            case BACKREFERENCE:
6090              return 0; // *******
6091

6092            case STRING:
6093              return this.getString().length();
6094
6095            case LOOKAHEAD:
6096            case NEGATIVELOOKAHEAD:
6097            case LOOKBEHIND:
6098            case NEGATIVELOOKBEHIND:
6099              return 0; // ***** Really?
6100

6101            default:
6102              throw new RuntimeException JavaDoc("Token#getMinLength(): Invalid Type: "+this.type);
6103          }
6104      }
6105
6106      final int getMaxLength() {
6107          switch (this.type) {
6108            case CONCAT:
6109              int sum = 0;
6110              for (int i = 0; i < this.size(); i ++) {
6111                  int d = this.getChild(i).getMaxLength();
6112                  if (d < 0) return -1;
6113                  sum += d;
6114              }
6115              return sum;
6116
6117            case CONDITION:
6118            case UNION:
6119              if (this.size() == 0)
6120                  return 0;
6121              int ret = this.getChild(0).getMaxLength();
6122              for (int i = 1; ret >= 0 && i < this.size(); i ++) {
6123                  int max = this.getChild(i).getMaxLength();
6124                  if (max < 0) { // infinity
6125
ret = -1;
6126                      break;
6127                  }
6128                  if (max > ret) ret = max;
6129              }
6130              return ret;
6131
6132            case CLOSURE:
6133            case NONGREEDYCLOSURE:
6134              if (this.getMax() >= 0)
6135                                                  // When this.child.getMaxLength() < 0,
6136
// this returns minus value
6137
return this.getMax() * this.getChild(0).getMaxLength();
6138              return -1;
6139
6140            case EMPTY:
6141            case ANCHOR:
6142              return 0;
6143
6144            case CHAR:
6145              return 1;
6146            case DOT:
6147            case RANGE:
6148            case NRANGE:
6149              return 2;
6150
6151            case INDEPENDENT:
6152            case PAREN:
6153            case MODIFIERGROUP:
6154              return this.getChild(0).getMaxLength();
6155
6156            case BACKREFERENCE:
6157              return -1; // ******
6158

6159            case STRING:
6160              return this.getString().length();
6161
6162            case LOOKAHEAD:
6163            case NEGATIVELOOKAHEAD:
6164            case LOOKBEHIND:
6165            case NEGATIVELOOKBEHIND:
6166              return 0; // ***** Really?
6167

6168            default:
6169              throw new RuntimeException JavaDoc("Token#getMaxLength(): Invalid Type: "+this.type);
6170          }
6171      }
6172
6173      static final int FC_CONTINUE = 0;
6174      static final int FC_TERMINAL = 1;
6175      static final int FC_ANY = 2;
6176      private static final boolean isSet(int options, int flag) {
6177          return (options & flag) == flag;
6178      }
6179      final int analyzeFirstCharacter(RangeToken result, int options) {
6180          switch (this.type) {
6181            case CONCAT:
6182              int ret = FC_CONTINUE;
6183              for (int i = 0; i < this.size(); i ++)
6184                  if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE)
6185                      break;
6186              return ret;
6187
6188            case UNION:
6189              if (this.size() == 0)
6190                  return FC_CONTINUE;
6191              /*
6192               * a|b|c -> FC_TERMINAL
6193               * a|.|c -> FC_ANY
6194               * a|b| -> FC_CONTINUE
6195               */

6196              int ret2 = FC_CONTINUE;
6197              boolean hasEmpty = false;
6198              for (int i = 0; i < this.size(); i ++) {
6199                  ret2 = this.getChild(i).analyzeFirstCharacter(result, options);
6200                  if (ret2 == FC_ANY)
6201                      break;
6202                  else if (ret2 == FC_CONTINUE)
6203                      hasEmpty = true;
6204              }
6205              return hasEmpty ? FC_CONTINUE : ret2;
6206
6207            case CONDITION:
6208              int ret3 = this.getChild(0).analyzeFirstCharacter(result, options);
6209              if (this.size() == 1) return FC_CONTINUE;
6210              if (ret3 == FC_ANY) return ret3;
6211              int ret4 = this.getChild(1).analyzeFirstCharacter(result, options);
6212              if (ret4 == FC_ANY) return ret4;
6213              return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL;
6214
6215            case CLOSURE:
6216            case NONGREEDYCLOSURE:
6217              this.getChild(0).analyzeFirstCharacter(result, options);
6218              return FC_CONTINUE;
6219
6220            case EMPTY:
6221            case ANCHOR:
6222              return FC_CONTINUE;
6223
6224            case CHAR:
6225              int ch = this.getChar();
6226              result.addRange(ch, ch);
6227              if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
6228                  ch = Character.toUpperCase((char)ch);
6229                  result.addRange(ch, ch);
6230                  ch = Character.toLowerCase((char)ch);
6231                  result.addRange(ch, ch);
6232              }
6233              return FC_TERMINAL;
6234
6235            case DOT: // ****
6236
if (isSet(options, RegularExpression.SINGLE_LINE)) {
6237                  return FC_CONTINUE; // **** We can not optimize.
6238
} else {
6239                  return FC_CONTINUE;
6240                  /*
6241                  result.addRange(0, RegularExpression.LINE_FEED-1);
6242                  result.addRange(RegularExpression.LINE_FEED+1, RegularExpression.CARRIAGE_RETURN-1);
6243                  result.addRange(RegularExpression.CARRIAGE_RETURN+1,
6244                                  RegularExpression.LINE_SEPARATOR-1);
6245                  result.addRange(RegularExpression.PARAGRAPH_SEPARATOR+1, UTF16_MAX);
6246                  return 1;
6247                  */

6248              }
6249
6250            case RANGE:
6251              if (isSet(options, RegularExpression.IGNORE_CASE)) {
6252                  result.mergeRanges(((RangeToken)this).getCaseInsensitiveToken());
6253              } else {
6254                  result.mergeRanges(this);
6255              }
6256              return FC_TERMINAL;
6257
6258            case NRANGE: // ****
6259
if (isSet(options, RegularExpression.IGNORE_CASE)) {
6260                  result.mergeRanges(Token.complementRanges(((RangeToken)this).getCaseInsensitiveToken()));
6261              } else {
6262                  result.mergeRanges(Token.complementRanges(this));
6263              }
6264              return FC_TERMINAL;
6265
6266            case INDEPENDENT:
6267            case PAREN:
6268              return this.getChild(0).analyzeFirstCharacter(result, options);
6269
6270            case MODIFIERGROUP:
6271              options |= ((ModifierToken)this).getOptions();
6272              options &= ~((ModifierToken)this).getOptionsMask();
6273              return this.getChild(0).analyzeFirstCharacter(result, options);
6274
6275            case BACKREFERENCE:
6276              result.addRange(0, UTF16_MAX); // **** We can not optimize.
6277
return FC_ANY;
6278
6279            case STRING:
6280              int cha = this.getString().charAt(0);
6281              int ch2;
6282              if (REUtil.isHighSurrogate(cha)
6283                  && this.getString().length() >= 2
6284                  && REUtil.isLowSurrogate((ch2 = this.getString().charAt(1))))
6285                  cha = REUtil.composeFromSurrogates(cha, ch2);
6286              result.addRange(cha, cha);
6287              if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
6288                  cha = Character.toUpperCase((char)cha);
6289                  result.addRange(cha, cha);
6290                  cha = Character.toLowerCase((char)cha);
6291                  result.addRange(cha, cha);
6292              }
6293              return FC_TERMINAL;
6294
6295            case LOOKAHEAD:
6296            case NEGATIVELOOKAHEAD:
6297            case LOOKBEHIND:
6298            case NEGATIVELOOKBEHIND:
6299              return FC_CONTINUE;
6300
6301            default:
6302              throw new RuntimeException JavaDoc("Token#analyzeHeadCharacter(): Invalid Type: "+this.type);
6303          }
6304      }
6305
6306      private final boolean isShorterThan(Token tok) {
6307          if (tok == null) return false;
6308          /*
6309          int mylength;
6310          if (this.type == STRING) mylength = this.getString().length();
6311          else if (this.type == CHAR) mylength = this.getChar() >= 0x10000 ? 2 : 1;
6312          else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
6313          int otherlength;
6314          if (tok.type == STRING) otherlength = tok.getString().length();
6315          else if (tok.type == CHAR) otherlength = tok.getChar() >= 0x10000 ? 2 : 1;
6316          else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
6317          */

6318          int mylength;
6319          if (this.type == STRING) mylength = this.getString().length();
6320          else throw new RuntimeException JavaDoc("Internal Error: Illegal type: "+this.type);
6321          int otherlength;
6322          if (tok.type == STRING) otherlength = tok.getString().length();
6323          else throw new RuntimeException JavaDoc("Internal Error: Illegal type: "+tok.type);
6324          return mylength < otherlength;
6325      }
6326
6327      static class FixedStringContainer {
6328          Token token = null;
6329          int options = 0;
6330          FixedStringContainer() {
6331          }
6332      }
6333
6334      final void findFixedString(FixedStringContainer container, int options) {
6335          switch (this.type) {
6336            case CONCAT:
6337              Token prevToken = null;
6338              int prevOptions = 0;
6339              for (int i = 0; i < this.size(); i ++) {
6340                  this.getChild(i).findFixedString(container, options);
6341                  if (prevToken == null || prevToken.isShorterThan(container.token)) {
6342                      prevToken = container.token;
6343                      prevOptions = container.options;
6344                  }
6345              }
6346              container.token = prevToken;
6347              container.options = prevOptions;
6348              return;
6349
6350            case UNION:
6351            case CLOSURE:
6352            case NONGREEDYCLOSURE:
6353            case EMPTY:
6354            case ANCHOR:
6355            case RANGE:
6356            case DOT:
6357            case NRANGE:
6358            case BACKREFERENCE:
6359            case LOOKAHEAD:
6360            case NEGATIVELOOKAHEAD:
6361            case LOOKBEHIND:
6362            case NEGATIVELOOKBEHIND:
6363            case CONDITION:
6364              container.token = null;
6365              return;
6366
6367            case CHAR: // Ignore CHAR tokens.
6368
container.token = null; // **
6369
return; // **
6370

6371            case STRING:
6372              container.token = this;
6373              container.options = options;
6374              return;
6375
6376            case INDEPENDENT:
6377            case PAREN:
6378              this.getChild(0).findFixedString(container, options);
6379              return;
6380
6381            case MODIFIERGROUP:
6382              options |= ((ModifierToken)this).getOptions();
6383              options &= ~((ModifierToken)this).getOptionsMask();
6384              this.getChild(0).findFixedString(container, options);
6385              return;
6386
6387            default:
6388              throw new RuntimeException JavaDoc("Token#findFixedString(): Invalid Type: "+this.type);
6389          }
6390      }
6391
6392      boolean match(int ch) {
6393          throw new RuntimeException JavaDoc("NFAArrow#match(): Internal error: "+this.type);
6394      }
6395
6396      // ------------------------------------------------------
6397
private final static Hashtable JavaDoc categories = new Hashtable JavaDoc();
6398      private final static Hashtable JavaDoc categories2 = new Hashtable JavaDoc();
6399      private static final String JavaDoc[] categoryNames = {
6400          "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd",
6401          "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs",
6402          "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28
6403
"Pi", "Pf", // 29, 30
6404
"L", "M", "N", "Z", "C", "P", "S", // 31-37
6405
};
6406
6407      // Schema Rec. {Datatypes} - Punctuation
6408
static final int CHAR_INIT_QUOTE = 29; // Pi - initial quote
6409
static final int CHAR_FINAL_QUOTE = 30; // Pf - final quote
6410
static final int CHAR_LETTER = 31;
6411      static final int CHAR_MARK = 32;
6412      static final int CHAR_NUMBER = 33;
6413      static final int CHAR_SEPARATOR = 34;
6414      static final int CHAR_OTHER = 35;
6415      static final int CHAR_PUNCTUATION = 36;
6416      static final int CHAR_SYMBOL = 37;
6417      
6418      //blockNames in UNICODE 3.1 that supported by XML Schema REC
6419
private static final String JavaDoc[] blockNames = {
6420          /*0000..007F;*/ "Basic Latin",
6421          /*0080..00FF;*/ "Latin-1 Supplement",
6422          /*0100..017F;*/ "Latin Extended-A",
6423          /*0180..024F;*/ "Latin Extended-B",
6424          /*0250..02AF;*/ "IPA Extensions",
6425          /*02B0..02FF;*/ "Spacing Modifier Letters",
6426          /*0300..036F;*/ "Combining Diacritical Marks",
6427          /*0370..03FF;*/ "Greek",
6428          /*0400..04FF;*/ "Cyrillic",
6429          /*0530..058F;*/ "Armenian",
6430          /*0590..05FF;*/ "Hebrew",
6431          /*0600..06FF;*/ "Arabic",
6432          /*0700..074F;*/ "Syriac",
6433          /*0780..07BF;*/ "Thaana",
6434          /*0900..097F;*/ "Devanagari",
6435          /*0980..09FF;*/ "Bengali",
6436          /*0A00..0A7F;*/ "Gurmukhi",
6437          /*0A80..0AFF;*/ "Gujarati",
6438          /*0B00..0B7F;*/ "Oriya",
6439          /*0B80..0BFF;*/ "Tamil",
6440          /*0C00..0C7F;*/ "Telugu",
6441          /*0C80..0CFF;*/ "Kannada",
6442          /*0D00..0D7F;*/ "Malayalam",
6443          /*0D80..0DFF;*/ "Sinhala",
6444          /*0E00..0E7F;*/ "Thai",
6445          /*0E80..0EFF;*/ "Lao",
6446          /*0F00..0FFF;*/ "Tibetan",
6447          /*1000..109F;*/ "Myanmar",
6448          /*10A0..10FF;*/ "Georgian",
6449          /*1100..11FF;*/ "Hangul Jamo",
6450          /*1200..137F;*/ "Ethiopic",
6451          /*13A0..13FF;*/ "Cherokee",
6452          /*1400..167F;*/ "Unified Canadian Aboriginal Syllabics",
6453          /*1680..169F;*/ "Ogham",
6454          /*16A0..16FF;*/ "Runic",
6455          /*1780..17FF;*/ "Khmer",
6456          /*1800..18AF;*/ "Mongolian",
6457          /*1E00..1EFF;*/ "Latin Extended Additional",
6458          /*1F00..1FFF;*/ "Greek Extended",
6459          /*2000..206F;*/ "General Punctuation",
6460          /*2070..209F;*/ "Superscripts and Subscripts",
6461          /*20A0..20CF;*/ "Currency Symbols",
6462          /*20D0..20FF;*/ "Combining Marks for Symbols",
6463          /*2100..214F;*/ "Letterlike Symbols",
6464          /*2150..218F;*/ "Number Forms",
6465          /*2190..21FF;*/ "Arrows",
6466          /*2200..22FF;*/ "Mathematical Operators",
6467          /*2300..23FF;*/ "Miscellaneous Technical",
6468          /*2400..243F;*/ "Control Pictures",
6469          /*2440..245F;*/ "Optical Character Recognition",
6470          /*2460..24FF;*/ "Enclosed Alphanumerics",
6471          /*2500..257F;*/ "Box Drawing",
6472          /*2580..259F;*/ "Block Elements",
6473          /*25A0..25FF;*/ "Geometric Shapes",
6474          /*2600..26FF;*/ "Miscellaneous Symbols",
6475          /*2700..27BF;*/ "Dingbats",
6476          /*2800..28FF;*/ "Braille Patterns",
6477          /*2E80..2EFF;*/ "CJK Radicals Supplement",
6478          /*2F00..2FDF;*/ "Kangxi Radicals",
6479          /*2FF0..2FFF;*/ "Ideographic Description Characters",
6480          /*3000..303F;*/ "CJK Symbols and Punctuation",
6481          /*3040..309F;*/ "Hiragana",
6482          /*30A0..30FF;*/ "Katakana",
6483          /*3100..312F;*/ "Bopomofo",
6484          /*3130..318F;*/ "Hangul Compatibility Jamo",
6485          /*3190..319F;*/ "Kanbun",
6486          /*31A0..31BF;*/ "Bopomofo Extended",
6487          /*3200..32FF;*/ "Enclosed CJK Letters and Months",
6488          /*3300..33FF;*/ "CJK Compatibility",
6489          /*3400..4DB5;*/ "CJK Unified Ideographs Extension A",
6490          /*4E00..9FFF;*/ "CJK Unified Ideographs",
6491          /*A000..A48F;*/ "Yi Syllables",
6492          /*A490..A4CF;*/ "Yi Radicals",
6493          /*AC00..D7A3;*/ "Hangul Syllables",
6494          /*E000..F8FF;*/ "Private Use",
6495          /*F900..FAFF;*/ "CJK Compatibility Ideographs",
6496          /*FB00..FB4F;*/ "Alphabetic Presentation Forms",
6497          /*FB50..FDFF;*/ "Arabic Presentation Forms-A",
6498          /*FE20..FE2F;*/ "Combining Half Marks",
6499          /*FE30..FE4F;*/ "CJK Compatibility Forms",
6500          /*FE50..FE6F;*/ "Small Form Variants",
6501          /*FE70..FEFE;*/ "Arabic Presentation Forms-B",
6502          /*FEFF..FEFF;*/ "Specials",
6503          /*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms",
6504           //missing Specials add manually
6505
/*10300..1032F;*/ "Old Italic", // 84
6506
/*10330..1034F;*/ "Gothic",
6507          /*10400..1044F;*/ "Deseret",
6508          /*1D000..1D0FF;*/ "Byzantine Musical Symbols",
6509          /*1D100..1D1FF;*/ "Musical Symbols",
6510          /*1D400..1D7FF;*/ "Mathematical Alphanumeric Symbols",
6511          /*20000..2A6D6;*/ "CJK Unified Ideographs Extension B",
6512          /*2F800..2FA1F;*/ "CJK Compatibility Ideographs Supplement",
6513          /*E0000..E007F;*/ "Tags",
6514          //missing 2 private use add manually
6515

6516      };
6517      //ADD THOSE MANUALLY
6518
//F0000..FFFFD; "Private Use",
6519
//100000..10FFFD; "Private Use"
6520
//FFF0..FFFD; "Specials",
6521
static final String JavaDoc blockRanges =
6522         "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F"
6523          +"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF"
6524          +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF"
6525          +"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF"
6526          +"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF"
6527          +"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF"
6528          +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF"
6529          +"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F"
6530          +"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF"
6531          +"\uAC00\uD7A3\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF"
6532          +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF";
6533      static final int[] nonBMPBlockRanges = {
6534          0x10300, 0x1032F, // 84
6535
0x10330, 0x1034F,
6536          0x10400, 0x1044F,
6537          0x1D000, 0x1D0FF,
6538          0x1D100, 0x1D1FF,
6539          0x1D400, 0x1D7FF,
6540          0x20000, 0x2A6D6,
6541          0x2F800, 0x2FA1F,
6542          0xE0000, 0xE007F
6543      };
6544      private static final int NONBMP_BLOCK_START = 84;
6545
6546      static protected RangeToken getRange(String JavaDoc name, boolean positive) {
6547          if (Token.categories.size() == 0) {
6548              synchronized (Token.categories) {
6549                  Token[] ranges = new Token[Token.categoryNames.length];
6550                  for (int i = 0; i < ranges.length; i ++) {
6551                      ranges[i] = Token.createRange();
6552                  }
6553                  int type;
6554                  for (int i = 0; i < 0x10000; i ++) {
6555                      type = Character.getType((char)i);
6556                      if (type == Character.START_PUNCTUATION ||
6557                          type == Character.END_PUNCTUATION) {
6558                          //build table of Pi values
6559
if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C ||
6560                              i == 0x201F || i == 0x2039) {
6561                              type = CHAR_INIT_QUOTE;
6562                          }
6563                          //build table of Pf values
6564
if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) {
6565                              type = CHAR_FINAL_QUOTE;
6566                          }
6567                      }
6568                      ranges[type].addRange(i, i);
6569                      switch (type) {
6570                        case Character.UPPERCASE_LETTER:
6571                        case Character.LOWERCASE_LETTER:
6572                        case Character.TITLECASE_LETTER:
6573                        case Character.MODIFIER_LETTER:
6574                        case Character.OTHER_LETTER:
6575                          type = CHAR_LETTER;
6576                          break;
6577                        case Character.NON_SPACING_MARK:
6578                        case Character.COMBINING_SPACING_MARK:
6579                        case Character.ENCLOSING_MARK:
6580                          type = CHAR_MARK;
6581                          break;
6582                        case Character.DECIMAL_DIGIT_NUMBER:
6583                        case Character.LETTER_NUMBER:
6584                        case Character.OTHER_NUMBER:
6585                          type = CHAR_NUMBER;
6586                          break;
6587                        case Character.SPACE_SEPARATOR:
6588                        case Character.LINE_SEPARATOR:
6589                        case Character.PARAGRAPH_SEPARATOR:
6590                          type = CHAR_SEPARATOR;
6591                          break;
6592                        case Character.CONTROL:
6593                        case Character.FORMAT:
6594                        case Character.SURROGATE:
6595                        case Character.PRIVATE_USE:
6596                        case Character.UNASSIGNED:
6597                          type = CHAR_OTHER;
6598                          break;
6599                        case Character.CONNECTOR_PUNCTUATION:
6600                        case Character.DASH_PUNCTUATION:
6601                        case Character.START_PUNCTUATION:
6602                        case Character.END_PUNCTUATION:
6603                        case CHAR_INIT_QUOTE:
6604                        case CHAR_FINAL_QUOTE:
6605                        case Character.OTHER_PUNCTUATION:
6606                          type = CHAR_PUNCTUATION;
6607                          break;
6608                        case Character.MATH_SYMBOL:
6609                        case Character.CURRENCY_SYMBOL:
6610                        case Character.MODIFIER_SYMBOL:
6611                        case Character.OTHER_SYMBOL:
6612                          type = CHAR_SYMBOL;
6613                          break;
6614                        default:
6615                          throw new RuntimeException JavaDoc("org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type);
6616                      }
6617                      ranges[type].addRange(i, i);
6618                  } // for all characters
6619
ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX);
6620
6621                  for (int i = 0; i < ranges.length; i ++) {
6622                      if (Token.categoryNames[i] != null) {
6623                          if (i == Character.UNASSIGNED) { // Unassigned
6624
ranges[i].addRange(0x10000, Token.UTF16_MAX);
6625                          }
6626                          Token.categories.put(Token.categoryNames[i], ranges[i]);
6627                          Token.categories2.put(Token.categoryNames[i],
6628                                                Token.complementRanges(ranges[i]));
6629                      }
6630                  }
6631                  //REVISIT: do we really need to support block names as in Unicode 3.1
6632
// or we can just create all the names in IsBLOCKNAME format (XML Schema REC)?
6633
//
6634
StringBuffer JavaDoc buffer = new StringBuffer JavaDoc(50);
6635                  for (int i = 0; i < Token.blockNames.length; i ++) {
6636                      Token r1 = Token.createRange();
6637                      int location;
6638                      if (i < NONBMP_BLOCK_START) {
6639                          location = i*2;
6640                          int rstart = Token.blockRanges.charAt(location);
6641                          int rend = Token.blockRanges.charAt(location+1);
6642                          //DEBUGING
6643
//System.out.println(n+" " +Integer.toHexString(rstart)
6644
// +"-"+ Integer.toHexString(rend));
6645
r1.addRange(rstart, rend);
6646                      } else {
6647                          location = (i - NONBMP_BLOCK_START) * 2;
6648                          r1.addRange(Token.nonBMPBlockRanges[location],
6649                                      Token.nonBMPBlockRanges[location + 1]);
6650                      }
6651                      String JavaDoc n = Token.blockNames[i];
6652                      if (n.equals("Specials"))
6653                          r1.addRange(0xfff0, 0xfffd);
6654                      if (n.equals("Private Use")) {
6655                          r1.addRange(0xF0000,0xFFFFD);
6656                          r1.addRange(0x100000,0x10FFFD);
6657                      }
6658                      Token.categories.put(n, r1);
6659                      Token.categories2.put(n, Token.complementRanges(r1));
6660                      buffer.setLength(0);
6661                      buffer.append("Is");
6662                      if (n.indexOf(' ') >= 0) {
6663                          for (int ci = 0; ci < n.length(); ci ++)
6664                              if (n.charAt(ci) != ' ') buffer.append(n.charAt(ci));
6665                      }
6666                      else {
6667                          buffer.append(n);
6668                      }
6669                      Token.setAlias(buffer.toString(), n, true);
6670                  }
6671
6672                  // TR#18 1.2
6673
Token.setAlias("ASSIGNED", "Cn", false);
6674                  Token.setAlias("UNASSIGNED", "Cn", true);
6675                  Token all = Token.createRange();
6676                  all.addRange(0, Token.UTF16_MAX);
6677                  Token.categories.put("ALL", all);
6678                  Token.categories2.put("ALL", Token.complementRanges(all));
6679                  Token.registerNonXS("ASSIGNED");
6680                  Token.registerNonXS("UNASSIGNED");
6681                  Token.registerNonXS("ALL");
6682
6683                  Token isalpha = Token.createRange();
6684                  isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu
6685
isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll
6686
isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo
6687
Token.categories.put("IsAlpha", isalpha);
6688                  Token.categories2.put("IsAlpha", Token.complementRanges(isalpha));
6689                  Token.registerNonXS("IsAlpha");
6690
6691                  Token isalnum = Token.createRange();
6692                  isalnum.mergeRanges(isalpha); // Lu Ll Lo
6693
isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd
6694
Token.categories.put("IsAlnum", isalnum);
6695                  Token.categories2.put("IsAlnum", Token.complementRanges(isalnum));
6696                  Token.registerNonXS("IsAlnum");
6697
6698                  Token isspace = Token.createRange();
6699                  isspace.mergeRanges(Token.token_spaces);
6700                  isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z
6701
Token.categories.put("IsSpace", isspace);
6702                  Token.categories2.put("IsSpace", Token.complementRanges(isspace));
6703                  Token.registerNonXS("IsSpace");
6704
6705                  Token isword = Token.createRange();
6706                  isword.mergeRanges(isalnum); // Lu Ll Lo Nd
6707
isword.addRange('_', '_');
6708                  Token.categories.put("IsWord", isword);
6709                  Token.categories2.put("IsWord", Token.complementRanges(isword));
6710                  Token.registerNonXS("IsWord");
6711
6712                  Token isascii = Token.createRange();
6713                  isascii.addRange(0, 127);
6714                  Token.categories.put("IsASCII", isascii);
6715                  Token.categories2.put("IsASCII", Token.complementRanges(isascii));
6716                  Token.registerNonXS("IsASCII");
6717
6718                  Token isnotgraph = Token.createRange();
6719                  isnotgraph.mergeRanges(ranges[CHAR_OTHER]);
6720                  isnotgraph.addRange(' ', ' ');
6721                  Token.categories.put("IsGraph", Token.complementRanges(isnotgraph));
6722                  Token.categories2.put("IsGraph", isnotgraph);
6723                  Token.registerNonXS("IsGraph");
6724
6725                  Token isxdigit = Token.createRange();
6726                  isxdigit.addRange('0', '9');
6727                  isxdigit.addRange('A', 'F');
6728                  isxdigit.addRange('a', 'f');
6729                  Token.categories.put("IsXDigit", Token.complementRanges(isxdigit));
6730                  Token.categories2.put("IsXDigit", isxdigit);
6731                  Token.registerNonXS("IsXDigit");
6732
6733                  Token.setAlias("IsDigit", "Nd", true);
6734                  Token.setAlias("IsUpper", "Lu", true);
6735                  Token.setAlias("IsLower", "Ll", true);
6736                  Token.setAlias("IsCntrl", "C", true);
6737                  Token.setAlias("IsPrint", "C", false);
6738                  Token.setAlias("IsPunct", "P", true);
6739                  Token.registerNonXS("IsDigit");
6740                  Token.registerNonXS("IsUpper");
6741                  Token.registerNonXS("IsLower");
6742                  Token.registerNonXS("IsCntrl");
6743                  Token.registerNonXS("IsPrint");
6744                  Token.registerNonXS("IsPunct");
6745
6746                  Token.setAlias("alpha", "IsAlpha", true);
6747                  Token.setAlias("alnum", "IsAlnum", true);
6748                  Token.setAlias("ascii", "IsASCII", true);
6749                  Token.setAlias("cntrl", "IsCntrl", true);
6750                  Token.setAlias("digit", "IsDigit", true);
6751                  Token.setAlias("graph", "IsGraph", true);
6752                  Token.setAlias("lower", "IsLower", true);
6753                  Token.setAlias("print", "IsPrint", true);
6754                  Token.setAlias("punct", "IsPunct", true);
6755                  Token.setAlias("space", "IsSpace", true);
6756                  Token.setAlias("upper", "IsUpper", true);
6757                  Token.setAlias("word", "IsWord", true); // Perl extension
6758
Token.setAlias("xdigit", "IsXDigit", true);
6759                  Token.registerNonXS("alpha");
6760                  Token.registerNonXS("alnum");
6761                  Token.registerNonXS("ascii");
6762                  Token.registerNonXS("cntrl");
6763                  Token.registerNonXS("digit");
6764                  Token.registerNonXS("graph");
6765                  Token.registerNonXS("lower");
6766                  Token.registerNonXS("print");
6767                  Token.registerNonXS("punct");
6768                  Token.registerNonXS("space");
6769                  Token.registerNonXS("upper");
6770                  Token.registerNonXS("word");
6771                  Token.registerNonXS("xdigit");
6772              } // synchronized
6773
} // if null
6774
RangeToken tok = positive ? (RangeToken)Token.categories.get(name)
6775              : (RangeToken)Token.categories2.get(name);
6776          //if (tok == null) System.out.println(name);
6777
return tok;
6778      }
6779      static protected RangeToken getRange(String JavaDoc name, boolean positive, boolean xs) {
6780          RangeToken range = Token.getRange(name, positive);
6781          if (xs && range != null && Token.isRegisterNonXS(name))
6782              range = null;
6783          return range;
6784      }
6785
6786      static Hashtable JavaDoc nonxs = null;
6787      /**
6788       * This method is called by only getRange().
6789       * So this method need not MT-safe.
6790       */

6791      static protected void registerNonXS(String JavaDoc name) {
6792          if (Token.nonxs == null)
6793              Token.nonxs = new Hashtable JavaDoc();
6794          Token.nonxs.put(name, name);
6795      }
6796      static protected boolean isRegisterNonXS(String JavaDoc name) {
6797          if (Token.nonxs == null)
6798              return false;
6799          //DEBUG
6800
//System.err.println("isRegisterNonXS: "+name);
6801
return Token.nonxs.containsKey(name);
6802      }
6803
6804      private static void setAlias(String JavaDoc newName, String JavaDoc name, boolean positive) {
6805          Token t1 = (Token)Token.categories.get(name);
6806          Token t2 = (Token)Token.categories2.get(name);
6807          if (positive) {
6808              Token.categories.put(newName, t1);
6809              Token.categories2.put(newName, t2);
6810          } else {
6811              Token.categories2.put(newName, t1);
6812              Token.categories.put(newName, t2);
6813          }
6814      }
6815
6816      // ------------------------------------------------------
6817

6818      static final String JavaDoc viramaString =
6819      "\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
6820
+"\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
6821
+"\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
6822
+"\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
6823
+"\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
6824
+"\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
6825
+"\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
6826
+"\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
6827
+"\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
6828
+"\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI VOWEL SIGN PHINTHU;;;;
6829
+"\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;;
6830

6831      static private Token token_grapheme = null;
6832      static synchronized Token getGraphemePattern() {
6833          if (Token.token_grapheme != null)
6834              return Token.token_grapheme;
6835
6836          Token base_char = Token.createRange(); // [{ASSIGNED}]-[{M},{C}]
6837
base_char.mergeRanges(Token.getRange("ASSIGNED", true));
6838          base_char.subtractRanges(Token.getRange("M", true));
6839          base_char.subtractRanges(Token.getRange("C", true));
6840
6841          Token virama = Token.createRange();
6842          for (int i = 0; i < Token.viramaString.length(); i ++) {
6843              virama.addRange(i, i);
6844          }
6845
6846          Token combiner_wo_virama = Token.createRange();
6847          combiner_wo_virama.mergeRanges(Token.getRange("M", true));
6848          combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final
6849
combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras
6850

6851          Token left = Token.createUnion(); // base_char?
6852
left.addChild(base_char);
6853          left.addChild(Token.token_empty);
6854
6855          Token foo = Token.createUnion();
6856          foo.addChild(Token.createConcat(virama, Token.getRange("L", true)));
6857          foo.addChild(combiner_wo_virama);
6858
6859          foo = Token.createClosure(foo);
6860
6861          foo = Token.createConcat(left, foo);
6862
6863          Token.token_grapheme = foo;
6864          return Token.token_grapheme;
6865      }
6866
6867      /**
6868       * Combing Character Sequence in Perl 5.6.
6869       */

6870      static private Token token_ccs = null;
6871      static synchronized Token getCombiningCharacterSequence() {
6872          if (Token.token_ccs != null)
6873              return Token.token_ccs;
6874
6875          Token foo = Token.createClosure(Token.getRange("M", true)); // \pM*
6876
foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM*
6877
Token.token_ccs = foo;
6878          return Token.token_ccs;
6879      }
6880
6881      // ------------------------------------------------------
6882

6883      // ------------------------------------------------------
6884
/**
6885       * This class represents a node in parse tree.
6886       */

6887      static class StringToken extends Token implements java.io.Serializable JavaDoc {
6888          String JavaDoc string;
6889          int refNumber;
6890
6891          StringToken(int type, String JavaDoc str, int n) {
6892              super(type);
6893              this.string = str;
6894              this.refNumber = n;
6895          }
6896
6897          int getReferenceNumber() { // for STRING
6898
return this.refNumber;
6899          }
6900          String JavaDoc getString() { // for STRING
6901
return this.string;
6902          }
6903          
6904          public String JavaDoc toString(int options) {
6905              if (this.type == BACKREFERENCE)
6906                  return "\\"+this.refNumber;
6907              else
6908                  return REUtil.quoteMeta(this.string);
6909          }
6910      }
6911
6912      /**
6913       * This class represents a node in parse tree.
6914       */

6915      static class ConcatToken extends Token implements java.io.Serializable JavaDoc {
6916          Token child;
6917          Token child2;
6918          
6919          ConcatToken(Token t1, Token t2) {
6920              super(Token.CONCAT);
6921              this.child = t1;
6922              this.child2 = t2;
6923          }
6924
6925          int size() {
6926              return 2;
6927          }
6928          Token getChild(int index) {
6929              return index == 0 ? this.child : this.child2;
6930          }
6931
6932          public String JavaDoc toString(int options) {
6933              String JavaDoc ret;
6934              if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) {
6935                  ret = this.child.toString(options)+"+";
6936              } else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) {
6937                  ret = this.child.toString(options)+"+?";
6938              } else
6939                  ret = this.child.toString(options)+this.child2.toString(options);
6940              return ret;
6941          }
6942      }
6943
6944      /**
6945       * This class represents a node in parse tree.
6946       */

6947      static class CharToken extends Token implements java.io.Serializable JavaDoc {
6948          int chardata;
6949
6950          CharToken(int type, int ch) {
6951              super(type);
6952              this.chardata = ch;
6953          }
6954
6955          int getChar() {
6956              return this.chardata;
6957          }
6958
6959          public String JavaDoc toString(int options) {
6960              String JavaDoc ret;
6961              switch (this.type) {
6962                case CHAR:
6963                  switch (this.chardata) {
6964                    case '|': case '*': case '+': case '?':
6965                    case '(': case ')': case '.': case '[':
6966                    case '{': case '\\':
6967                      ret = "\\"+(char)this.chardata;
6968                      break;
6969                    case '\f': ret = "\\f"; break;
6970                    case '\n': ret = "\\n"; break;
6971                    case '\r': ret = "\\r"; break;
6972                    case '\t': ret = "\\t"; break;
6973                    case 0x1b: ret = "\\e"; break;
6974                      //case 0x0b: ret = "\\v"; break;
6975
default:
6976                      if (this.chardata >= 0x10000) {
6977                          String JavaDoc pre = "0"+Integer.toHexString(this.chardata);
6978                          ret = "\\v"+pre.substring(pre.length()-6, pre.length());
6979                      } else
6980                          ret = ""+(char)this.chardata;
6981                  }
6982                  break;
6983
6984                case ANCHOR:
6985                  if (this == Token.token_linebeginning || this == Token.token_lineend)
6986                      ret = ""+(char)this.chardata;
6987                  else
6988                      ret = "\\"+(char)this.chardata;
6989                  break;
6990
6991                default:
6992                  ret = null;
6993              }
6994              return ret;
6995          }
6996
6997          boolean match(int ch) {
6998              if (this.type == CHAR) {
6999                  return ch == this.chardata;
7000              } else
7001                  throw new RuntimeException JavaDoc("NFAArrow#match(): Internal error: "+this.type);
7002          }
7003      }
7004
7005      /**
7006       * This class represents a node in parse tree.
7007       */

7008      static class ClosureToken extends Token implements java.io.Serializable JavaDoc {
7009          int min;
7010          int max;
7011          Token child;
7012
7013          ClosureToken(int type, Token tok) {
7014              super(type);
7015              this.child = tok;
7016              this.setMin(-1);
7017              this.setMax(-1);
7018          }
7019
7020          int size() {
7021              return 1;
7022          }
7023          Token getChild(int index) {
7024              return this.child;
7025          }
7026
7027          final void setMin(int min) {
7028              this.min = min;
7029          }
7030          final void setMax(int max) {
7031              this.max = max;
7032          }
7033          final int getMin() {
7034              return this.min;
7035          }
7036          final int getMax() {
7037              return this.max;
7038          }
7039
7040          public String JavaDoc toString(int options) {
7041              String JavaDoc ret;
7042              if (this.type == CLOSURE) {
7043                  if (this.getMin() < 0 && this.getMax() < 0) {
7044                      ret = this.child.toString(options)+"*";
7045                  } else if (this.getMin() == this.getMax()) {
7046                      ret = this.child.toString(options)+"{"+this.getMin()+"}";
7047                  } else if (this.getMin() >= 0 && this.getMax() >= 0) {
7048                      ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}";
7049                  } else if (this.getMin() >= 0 && this.getMax() < 0) {
7050                      ret = this.child.toString(options)+"{"+this.getMin()+",}";
7051                  } else
7052                      throw new RuntimeException JavaDoc("Token#toString(): CLOSURE "
7053                                                 +this.getMin()+", "+this.getMax());
7054              } else {
7055                  if (this.getMin() < 0 && this.getMax() < 0) {
7056                      ret = this.child.toString(options)+"*?";
7057                  } else if (this.getMin() == this.getMax()) {
7058                      ret = this.child.toString(options)+"{"+this.getMin()+"}?";
7059                  } else if (this.getMin() >= 0 && this.getMax() >= 0) {
7060                      ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?";
7061                  } else if (this.getMin() >= 0 && this.getMax() < 0) {
7062                      ret = this.child.toString(options)+"{"+this.getMin()+",}?";
7063                  } else
7064                      throw new RuntimeException JavaDoc("Token#toString(): NONGREEDYCLOSURE "
7065                                                 + this.getMin() + ", " + this.getMax());
7066        }
7067        return ret;
7068      }
7069    }
7070
7071    /**
7072     * This class represents a node in parse tree.
7073     */

7074    static class ParenToken extends Token implements java.io.Serializable JavaDoc
7075    {
7076      Token child;
7077
7078      int parennumber;
7079
7080      ParenToken(int type, Token tok, int paren)
7081      {
7082        super(type);
7083        this.child = tok;
7084        this.parennumber = paren;
7085      }
7086
7087      int size()
7088      {
7089        return 1;
7090      }
7091
7092      Token getChild(int index)
7093      {
7094        return this.child;
7095      }
7096
7097      int getParenNumber()
7098      {
7099        return this.parennumber;
7100      }
7101
7102      public String JavaDoc toString(int options)
7103      {
7104        String JavaDoc ret = null;
7105        switch (this.type)
7106        {
7107          case PAREN:
7108            if (this.parennumber == 0)
7109            {
7110              ret = "(?:" + this.child.toString(options) + ")";
7111            }
7112            else
7113            {
7114              ret = "(" + this.child.toString(options) + ")";
7115            }
7116            break;
7117
7118          case LOOKAHEAD:
7119            ret = "(?=" + this.child.toString(options) + ")";
7120            break;
7121          case NEGATIVELOOKAHEAD:
7122            ret = "(?!" + this.child.toString(options) + ")";
7123            break;
7124          case LOOKBEHIND:
7125            ret = "(?<=" + this.child.toString(options) + ")";
7126            break;
7127          case NEGATIVELOOKBEHIND:
7128            ret = "(?<!" + this.child.toString(options) + ")";
7129            break;
7130          case INDEPENDENT:
7131            ret = "(?>" + this.child.toString(options) + ")";
7132            break;
7133        }
7134        return ret;
7135      }
7136    }
7137
7138    /**
7139     * (?(condition)yes-pattern|no-pattern)
7140     */

7141    static class ConditionToken extends Token implements java.io.Serializable JavaDoc
7142    {
7143      int refNumber;
7144
7145      Token condition;
7146
7147      Token yes;
7148
7149      Token no;
7150
7151      ConditionToken(int refno, Token cond, Token yespat, Token nopat)
7152      {
7153        super(Token.CONDITION);
7154        this.refNumber = refno;
7155        this.condition = cond;
7156        this.yes = yespat;
7157        this.no = nopat;
7158      }
7159
7160      int size()
7161      {
7162        return this.no == null ? 1 : 2;
7163      }
7164
7165      Token getChild(int index)
7166      {
7167        if (index == 0)
7168          return this.yes;
7169        if (index == 1)
7170          return this.no;
7171        throw new RuntimeException JavaDoc("Internal Error: " + index);
7172      }
7173
7174      public String JavaDoc toString(int options)
7175      {
7176        String JavaDoc ret;
7177        if (refNumber > 0)
7178        {
7179          ret = "(?(" + refNumber + ")";
7180        }
7181        else if (this.condition.type == Token.ANCHOR)
7182        {
7183          ret = "(?(" + this.condition + ")";
7184        }
7185        else
7186        {
7187          ret = "(?" + this.condition;
7188        }
7189
7190        if (this.no == null)
7191        {
7192          ret += this.yes + ")";
7193        }
7194        else
7195        {
7196          ret += this.yes + "|" + this.no + ")";
7197        }
7198        return ret;
7199      }
7200    }
7201
7202    /**
7203     * (ims-ims: .... )
7204     */

7205    static class ModifierToken extends Token implements java.io.Serializable JavaDoc
7206    {
7207      Token child;
7208
7209      int add;
7210
7211      int mask;
7212
7213      ModifierToken(Token tok, int add, int mask)
7214      {
7215        super(Token.MODIFIERGROUP);
7216        this.child = tok;
7217        this.add = add;
7218        this.mask = mask;
7219      }
7220
7221      int size()
7222      {
7223        return 1;
7224      }
7225
7226      Token getChild(int index)
7227      {
7228        return this.child;
7229      }
7230
7231      int getOptions()
7232      {
7233        return this.add;
7234      }
7235
7236      int getOptionsMask()
7237      {
7238        return this.mask;
7239      }
7240
7241      public String JavaDoc toString(int options)
7242      {
7243        return "(?" + (this.add == 0 ? "" : REUtil.createOptionString(this.add))
7244            + (this.mask == 0 ? "" : REUtil.createOptionString(this.mask)) + ":" + this.child.toString(options) + ")";
7245      }
7246    }
7247
7248    /**
7249     * This class represents a node in parse tree.
7250     * for UNION or CONCAT.
7251     */

7252    static class UnionToken extends Token implements java.io.Serializable JavaDoc
7253    {
7254      Vector JavaDoc children;
7255
7256      UnionToken(int type)
7257      {
7258        super(type);
7259      }
7260
7261      void addChild(Token tok)
7262      {
7263        if (tok == null)
7264          return;
7265        if (this.children == null)
7266          this.children = new Vector JavaDoc();
7267        if (this.type == UNION)
7268        {
7269          this.children.addElement(tok);
7270          return;
7271        }
7272        // This is CONCAT, and new child is CONCAT.
7273
if (tok.type == CONCAT)
7274        {
7275          for (int i = 0; i < tok.size(); i++)
7276            this.addChild(tok.getChild(i)); // Recursion
7277
return;
7278        }
7279        int size = this.children.size();
7280        if (size == 0)
7281        {
7282          this.children.addElement(tok);
7283          return;
7284        }
7285        Token previous = (Token)this.children.elementAt(size - 1);
7286        if (!((previous.type == CHAR || previous.type == STRING) && (tok.type == CHAR || tok.type == STRING)))
7287        {
7288          this.children.addElement(tok);
7289          return;
7290        }
7291
7292        //System.err.println("Merge '"+previous+"' and '"+tok+"'.");
7293

7294        StringBuffer JavaDoc buffer;
7295        int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString().length());
7296        if (previous.type == CHAR)
7297        { // Replace previous token by STRING
7298
buffer = new StringBuffer JavaDoc(2 + nextMaxLength);
7299          int ch = previous.getChar();
7300          if (ch >= 0x10000)
7301            buffer.append(REUtil.decomposeToSurrogates(ch));
7302          else
7303            buffer.append((char)ch);
7304          previous = Token.createString(null);
7305          this.children.setElementAt(previous, size - 1);
7306        }
7307        else
7308        { // STRING
7309
buffer = new StringBuffer JavaDoc(previous.getString().length() + nextMaxLength);
7310          buffer.append(previous.getString());
7311        }
7312
7313        if (tok.type == CHAR)
7314        {
7315          int ch = tok.getChar();
7316          if (ch >= 0x10000)
7317            buffer.append(REUtil.decomposeToSurrogates(ch));
7318          else
7319            buffer.append((char)ch);
7320        }
7321        else
7322        {
7323          buffer.append(tok.getString());
7324        }
7325
7326        ((StringToken)previous).string = new String JavaDoc(buffer);
7327      }
7328
7329      int size()
7330      {
7331        return this.children == null ? 0 : this.children.size();
7332      }
7333
7334      Token getChild(int index)
7335      {
7336        return (Token)this.children.elementAt(index);
7337      }
7338
7339      public String JavaDoc toString(int options)
7340      {
7341        String JavaDoc ret;
7342        if (this.type == CONCAT)
7343        {
7344          if (this.children.size() == 2)
7345          {
7346            Token ch = this.getChild(0);
7347            Token ch2 = this.getChild(1);
7348            if (ch2.type == CLOSURE && ch2.getChild(0) == ch)
7349            {
7350              ret = ch.toString(options) + "+";
7351            }
7352            else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch)
7353            {
7354              ret = ch.toString(options) + "+?";
7355            }
7356            else
7357              ret = ch.toString(options) + ch2.toString(options);
7358          }
7359          else
7360          {
7361            StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
7362            for (int i = 0; i < this.children.size(); i++)
7363            {
7364              sb.append(((Token)this.children.elementAt(i)).toString(options));
7365            }
7366            ret = new String JavaDoc(sb);
7367          }
7368          return ret;
7369        }
7370        if (this.children.size() == 2 && this.getChild(1).type == EMPTY)
7371        {
7372          ret = this.getChild(0).toString(options) + "?";
7373        }
7374        else if (this.children.size() == 2 && this.getChild(0).type == EMPTY)
7375        {
7376          ret = this.getChild(1).toString(options) + "??";
7377        }
7378        else
7379        {
7380          StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
7381          sb.append(((Token)this.children.elementAt(0)).toString(options));
7382          for (int i = 1; i < this.children.size(); i++)
7383          {
7384            sb.append('|');
7385            sb.append(((Token)this.children.elementAt(i)).toString(options));
7386          }
7387          ret = new String JavaDoc(sb);
7388        }
7389        return ret;
7390      }
7391    }
7392  }
7393
7394  /**
7395   * A regular expression parser for the XML Shema.
7396   *
7397   * @author TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;
7398   * @version $Id: RegEx.java,v 1.8 2005/06/12 13:29:22 emerks Exp $
7399   */

7400  static class ParserForXMLSchema extends RegexParser
7401  {
7402
7403    public ParserForXMLSchema()
7404    {
7405      //this.setLocale(Locale.getDefault());
7406
}
7407
7408    public ParserForXMLSchema(Locale JavaDoc locale)
7409    {
7410      //this.setLocale(locale);
7411
}
7412
7413    Token processCaret() throws ParseException
7414    {
7415      this.next();
7416      return Token.createChar('^');
7417    }
7418
7419    Token processDollar() throws ParseException
7420    {
7421      this.next();
7422      return Token.createChar('$');
7423    }
7424
7425    Token processLookahead() throws ParseException
7426    {
7427      throw ex("parser.process.1", this.offset);
7428    }
7429
7430    Token processNegativelookahead() throws ParseException
7431    {
7432      throw ex("parser.process.1", this.offset);
7433    }
7434
7435    Token processLookbehind() throws ParseException
7436    {
7437      throw ex("parser.process.1", this.offset);
7438    }
7439
7440    Token processNegativelookbehind() throws ParseException
7441    {
7442      throw ex("parser.process.1", this.offset);
7443    }
7444
7445    Token processBacksolidus_A() throws ParseException
7446    {
7447      throw ex("parser.process.1", this.offset);
7448    }
7449
7450    Token processBacksolidus_Z() throws ParseException
7451    {
7452      throw ex("parser.process.1", this.offset);
7453    }
7454
7455    Token processBacksolidus_z() throws ParseException
7456    {
7457      throw ex("parser.process.1", this.offset);
7458    }
7459
7460    Token processBacksolidus_b() throws ParseException
7461    {
7462      throw ex("parser.process.1", this.offset);
7463    }
7464
7465    Token processBacksolidus_B() throws ParseException
7466    {
7467      throw ex("parser.process.1", this.offset);
7468    }
7469
7470    Token processBacksolidus_lt() throws ParseException
7471    {
7472      throw ex("parser.process.1", this.offset);
7473    }
7474
7475    Token processBacksolidus_gt() throws ParseException
7476    {
7477      throw ex("parser.process.1", this.offset);
7478    }
7479
7480    Token processStar(Token tok) throws ParseException
7481    {
7482      this.next();
7483      return Token.createClosure(tok);
7484    }
7485
7486    Token processPlus(Token tok) throws ParseException
7487    {
7488      // X+ -> XX*
7489
this.next();
7490      return Token.createConcat(tok, Token.createClosure(tok));
7491    }
7492
7493    Token processQuestion(Token tok) throws ParseException
7494    {
7495      // X? -> X|
7496
this.next();
7497      Token par = Token.createUnion();
7498      par.addChild(tok);
7499      par.addChild(Token.createEmpty());
7500      return par;
7501    }
7502
7503    boolean checkQuestion(int off)
7504    {
7505      return false;
7506    }
7507
7508    Token processParen() throws ParseException
7509    {
7510      this.next();
7511      Token tok = Token.createParen(this.parseRegex(), 0);
7512      if (this.read() != T_RPAREN)
7513        throw ex("parser.factor.1", this.offset - 1);
7514      this.next(); // Skips ')'
7515
return tok;
7516    }
7517
7518    Token processParen2() throws ParseException
7519    {
7520      throw ex("parser.process.1", this.offset);
7521    }
7522
7523    Token processCondition() throws ParseException
7524    {
7525      throw ex("parser.process.1", this.offset);
7526    }
7527
7528    Token processModifiers() throws ParseException
7529    {
7530      throw ex("parser.process.1", this.offset);
7531    }
7532
7533    Token processIndependent() throws ParseException
7534    {
7535      throw ex("parser.process.1", this.offset);
7536    }
7537
7538    Token processBacksolidus_c() throws ParseException
7539    {
7540      this.next();
7541      return this.getTokenForShorthand('c');
7542    }
7543
7544    Token processBacksolidus_C() throws ParseException
7545    {
7546      this.next();
7547      return this.getTokenForShorthand('C');
7548    }
7549
7550    Token processBacksolidus_i() throws ParseException
7551    {
7552      this.next();
7553      return this.getTokenForShorthand('i');
7554    }
7555
7556    Token processBacksolidus_I() throws ParseException
7557    {
7558      this.next();
7559      return this.getTokenForShorthand('I');
7560    }
7561
7562    Token processBacksolidus_g() throws ParseException
7563    {
7564      throw this.ex("parser.process.1", this.offset - 2);
7565    }
7566
7567    Token processBacksolidus_X() throws ParseException
7568    {
7569      throw ex("parser.process.1", this.offset - 2);
7570    }
7571
7572    Token processBackreference() throws ParseException
7573    {
7574      throw ex("parser.process.1", this.offset - 4);
7575    }
7576
7577    int processCIinCharacterClass(RangeToken tok, int c)
7578    {
7579      tok.mergeRanges(this.getTokenForShorthand(c));
7580      return -1;
7581    }
7582
7583    /**
7584     * Parses a character-class-expression, not a character-class-escape.
7585     *
7586     * c-c-expression ::= '[' c-group ']'
7587     * c-group ::= positive-c-group | negative-c-group | c-c-subtraction
7588     * positive-c-group ::= (c-range | c-c-escape)+
7589     * negative-c-group ::= '^' positive-c-group
7590     * c-c-subtraction ::= (positive-c-group | negative-c-group) subtraction
7591     * subtraction ::= '-' c-c-expression
7592     * c-range ::= single-range | from-to-range
7593     * single-range ::= multi-c-escape | category-c-escape | block-c-escape | <any XML char>
7594     * cc-normal-c ::= <any character except [, ], \>
7595     * from-to-range ::= cc-normal-c '-' cc-normal-c
7596     *
7597     * @param useNrage Ignored.
7598     * @return This returns no NrageToken.
7599     */

7600    protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException
7601    {
7602      this.setContext(S_INBRACKETS);
7603      this.next(); // '['
7604
boolean nrange = false;
7605      RangeToken base = null;
7606      RangeToken tok;
7607      if (this.read() == T_CHAR && this.chardata == '^')
7608      {
7609        nrange = true;
7610        this.next(); // '^'
7611
base = Token.createRange();
7612        base.addRange(0, Token.UTF16_MAX);
7613        tok = Token.createRange();
7614      }
7615      else
7616      {
7617        tok = Token.createRange();
7618      }
7619      int type;
7620      boolean firstloop = true;
7621      while ((type = this.read()) != T_EOF)
7622      { // Don't use 'cotinue' for this loop.
7623
// single-range | from-to-range | subtraction
7624
if (type == T_CHAR && this.chardata == ']' && !firstloop)
7625        {
7626          if (nrange)
7627          {
7628            base.subtractRanges(tok);
7629            tok = base;
7630          }
7631          break;
7632        }
7633        int c = this.chardata;
7634        boolean end = false;
7635        if (type == T_BACKSOLIDUS)
7636        {
7637          switch (c)
7638          {
7639            case 'd':
7640            case 'D':
7641            case 'w':
7642            case 'W':
7643            case 's':
7644            case 'S':
7645              tok.mergeRanges(this.getTokenForShorthand(c));
7646              end = true;
7647              break;
7648
7649            case 'i':
7650            case 'I':
7651            case 'c':
7652            case 'C':
7653              c = this.processCIinCharacterClass(tok, c);
7654              if (c < 0)
7655                end = true;
7656              break;
7657
7658            case 'p':
7659            case 'P':
7660              int pstart = this.offset;
7661              RangeToken tok2 = this.processBacksolidus_pP(c);
7662              if (tok2 == null)
7663                throw this.ex("parser.atom.5", pstart);
7664              tok.mergeRanges(tok2);
7665              end = true;
7666              break;
7667
7668            default:
7669              c = this.decodeEscaped();
7670          } // \ + c
7671
} // backsolidus
7672
else if (type == T_XMLSCHEMA_CC_SUBTRACTION && !firstloop)
7673        {
7674          // Subraction
7675
if (nrange)
7676          {
7677            base.subtractRanges(tok);
7678            tok = base;
7679          }
7680          RangeToken range2 = this.parseCharacterClass(false);
7681          tok.subtractRanges(range2);
7682          if (this.read() != T_CHAR || this.chardata != ']')
7683            throw this.ex("parser.cc.5", this.offset);
7684          break; // Exit this loop
7685
}
7686        this.next();
7687        if (!end)
7688        { // if not shorthands...
7689
if (type == T_CHAR)
7690          {
7691            if (c == '[')
7692              throw this.ex("parser.cc.6", this.offset - 2);
7693            if (c == ']')
7694              throw this.ex("parser.cc.7", this.offset - 2);
7695            if (c == '-')
7696              throw this.ex("parser.cc.8", this.offset - 2);
7697          }
7698          if (this.read() != T_CHAR || this.chardata != '-')
7699          { // Here is no '-'.
7700
tok.addRange(c, c);
7701          }
7702          else
7703          { // Found '-'
7704
// Is this '-' is a from-to token??
7705
this.next(); // Skips '-'
7706
if ((type = this.read()) == T_EOF)
7707              throw this.ex("parser.cc.2", this.offset);
7708            // c '-' ']' -> '-' is a single-range.
7709
if ((type == T_CHAR && this.chardata == ']') || type == T_XMLSCHEMA_CC_SUBTRACTION)
7710            {
7711              throw this.ex("parser.cc.8", this.offset - 1);
7712            }
7713            else
7714            {
7715              int rangeend = this.chardata;
7716              if (type == T_CHAR)
7717              {
7718                if (rangeend == '[')
7719                  throw this.ex("parser.cc.6", this.offset - 1);
7720                if (rangeend == ']')
7721                  throw this.ex("parser.cc.7", this.offset - 1);
7722                if (rangeend == '-')
7723                  throw this.ex("parser.cc.8", this.offset - 2);
7724              }
7725              else if (type == T_BACKSOLIDUS)
7726                rangeend = this.decodeEscaped();
7727              this.next();
7728
7729              if (c > rangeend)
7730                throw this.ex("parser.ope.3", this.offset - 1);
7731              tok.addRange(c, rangeend);
7732            }
7733          }
7734        }
7735        firstloop = false;
7736      }
7737      if (this.read() == T_EOF)
7738        throw this.ex("parser.cc.2", this.offset);
7739      tok.sortRanges();
7740      tok.compactRanges();
7741      //tok.dumpRanges();
7742
this.setContext(S_NORMAL);
7743      this.next(); // Skips ']'
7744

7745      return tok;
7746    }
7747
7748    protected RangeToken parseSetOperations() throws ParseException
7749    {
7750      throw this.ex("parser.process.1", this.offset);
7751    }
7752
7753    Token getTokenForShorthand(int ch)
7754    {
7755      switch (ch)
7756      {
7757        case 'd':
7758          return ParserForXMLSchema.getRange("xml:isDigit", true);
7759        case 'D':
7760          return ParserForXMLSchema.getRange("xml:isDigit", false);
7761        case 'w':
7762          return ParserForXMLSchema.getRange("xml:isWord", true);
7763        case 'W':
7764          return ParserForXMLSchema.getRange("xml:isWord", false);
7765        case 's':
7766          return ParserForXMLSchema.getRange("xml:isSpace", true);
7767        case 'S':
7768          return ParserForXMLSchema.getRange("xml:isSpace", false);
7769        case 'c':
7770          return ParserForXMLSchema.getRange("xml:isNameChar", true);
7771        case 'C':
7772          return ParserForXMLSchema.getRange("xml:isNameChar", false);
7773        case 'i':
7774          return ParserForXMLSchema.getRange("xml:isInitialNameChar", true);
7775        case 'I':
7776          return ParserForXMLSchema.getRange("xml:isInitialNameChar", false);
7777        default:
7778          throw new RuntimeException JavaDoc("Internal Error: shorthands: \\u" + Integer.toString(ch, 16));
7779      }
7780    }
7781
7782    int decodeEscaped() throws ParseException
7783    {
7784      if (this.read() != T_BACKSOLIDUS)
7785        throw ex("parser.next.1", this.offset - 1);
7786      int c = this.chardata;
7787      switch (c)
7788      {
7789        case 'n':
7790          c = '\n';
7791          break; // LINE FEED U+000A
7792
case 'r':
7793          c = '\r';
7794          break; // CRRIAGE RETURN U+000D
7795
case 't':
7796          c = '\t';
7797          break; // HORIZONTAL TABULATION U+0009
7798
case '\\':
7799        case '|':
7800        case '.':
7801        case '^':
7802        case '-':
7803        case '?':
7804        case '*':
7805        case '+':
7806        case '{':
7807        case '}':
7808        case '(':
7809        case ')':
7810        case '[':
7811        case ']':
7812          break; // return actucal char
7813
default:
7814          throw ex("parser.process.1", this.offset - 2);
7815      }
7816      return c;
7817    }
7818
7819    static private Hashtable JavaDoc ranges = null;
7820
7821    static private Hashtable JavaDoc ranges2 = null;
7822
7823    static synchronized protected RangeToken getRange(String JavaDoc name, boolean positive)
7824    {
7825      if (ranges == null)
7826      {
7827        ranges = new Hashtable JavaDoc();
7828        ranges2 = new Hashtable JavaDoc();
7829
7830        Token tok = Token.createRange();
7831        setupRange(tok, SPACES);
7832        ranges.put("xml:isSpace", tok);
7833        ranges2.put("xml:isSpace", Token.complementRanges(tok));
7834
7835        tok = Token.createRange();
7836        setupRange(tok, DIGITS);
7837        ranges.put("xml:isDigit", tok);
7838        ranges2.put("xml:isDigit", Token.complementRanges(tok));
7839
7840        tok = Token.createRange();
7841        setupRange(tok, DIGITS);
7842        ranges.put("xml:isDigit", tok);
7843        ranges2.put("xml:isDigit", Token.complementRanges(tok));
7844
7845        tok = Token.createRange();
7846        setupRange(tok, LETTERS);
7847        tok.mergeRanges((Token)ranges.get("xml:isDigit"));
7848        ranges.put("xml:isWord", tok);
7849        ranges2.put("xml:isWord", Token.complementRanges(tok));
7850
7851        tok = Token.createRange();
7852        setupRange(tok, NAMECHARS);
7853        ranges.put("xml:isNameChar", tok);
7854        ranges2.put("xml:isNameChar", Token.complementRanges(tok));
7855
7856        tok = Token.createRange();
7857        setupRange(tok, LETTERS);
7858        tok.addRange('_', '_');
7859        tok.addRange(':', ':');
7860        ranges.put("xml:isInitialNameChar", tok);
7861        ranges2.put("xml:isInitialNameChar", Token.complementRanges(tok));
7862      }
7863      RangeToken tok = positive ? (RangeToken)ranges.get(name) : (RangeToken)ranges2.get(name);
7864      return tok;
7865    }
7866
7867    static void setupRange(Token range, String JavaDoc src)
7868    {
7869      int len = src.length();
7870      for (int i = 0; i < len; i += 2)
7871        range.addRange(src.charAt(i), src.charAt(i + 1));
7872    }
7873
7874    private static final String JavaDoc SPACES = "\t\n\r\r ";
7875
7876    private static final String JavaDoc NAMECHARS = "\u002d\u002e\u0030\u003a\u0041\u005a\u005f\u005f\u0061\u007a\u00b7\u00b7\u00c0\u00d6"
7877        + "\u00d8\u00f6\u00f8\u0131\u0134\u013e\u0141\u0148\u014a\u017e\u0180\u01c3\u01cd\u01f0"
7878        + "\u01f4\u01f5\u01fa\u0217\u0250\u02a8\u02bb\u02c1\u02d0\u02d1\u0300\u0345\u0360\u0361"
7879        + "\u0386\u038a\u038c\u038c\u038e\u03a1\u03a3\u03ce\u03d0\u03d6\u03da\u03da\u03dc\u03dc"
7880        + "\u03de\u03de\u03e0\u03e0\u03e2\u03f3\u0401\u040c\u040e\u044f\u0451\u045c\u045e\u0481"
7881        + "\u0483\u0486\u0490\u04c4\u04c7\u04c8\u04cb\u04cc\u04d0\u04eb\u04ee\u04f5\u04f8\u04f9"
7882        + "\u0531\u0556\u0559\u0559\u0561\u0586\u0591\u05a1\u05a3\u05b9\u05bb\u05bd\u05bf\u05bf"
7883        + "\u05c1\u05c2\u05c4\u05c4\u05d0\u05ea\u05f0\u05f2\u0621\u063a\u0640\u0652\u0660\u0669"
7884        + "\u0670\u06b7\u06ba\u06be\u06c0\u06ce\u06d0\u06d3\u06d5\u06e8\u06ea\u06ed\u06f0\u06f9"
7885        + "\u0901\u0903\u0905\u0939\u093c\u094d\u0951\u0954\u0958\u0963\u0966\u096f\u0981\u0983"
7886        + "\u0985\u098c\u098f\u0990\u0993\u09a8\u09aa\u09b0\u09b2\u09b2\u09b6\u09b9\u09bc\u09bc"
7887        + "\u09be\u09c4\u09c7\u09c8\u09cb\u09cd\u09d7\u09d7\u09dc\u09dd\u09df\u09e3\u09e6\u09f1"
7888        + "\u0a02\u0a02\u0a05\u0a0a\u0a0f\u0a10\u0a13\u0a28\u0a2a\u0a30\u0a32\u0a33\u0a35\u0a36"
7889        + "\u0a38\u0a39\u0a3c\u0a3c\u0a3e\u0a42\u0a47\u0a48\u0a4b\u0a4d\u0a59\u0a5c\u0a5e\u0a5e"
7890        + "\u0a66\u0a74\u0a81\u0a83\u0a85\u0a8b\u0a8d\u0a8d\u0a8f\u0a91\u0a93\u0aa8\u0aaa\u0ab0"
7891        + "\u0ab2\u0ab3\u0ab5\u0ab9\u0abc\u0ac5\u0ac7\u0ac9\u0acb\u0acd\u0ae0\u0ae0\u0ae6\u0aef"
7892        + "\u0b01\u0b03\u0b05\u0b0c\u0b0f\u0b10\u0b13\u0b28\u0b2a\u0b30\u0b32\u0b33\u0b36\u0b39"
7893        + "\u0b3c\u0b43\u0b47\u0b48\u0b4b\u0b4d\u0b56\u0b57\u0b5c\u0b5d\u0b5f\u0b61\u0b66\u0b6f"
7894        + "\u0b82\u0b83\u0b85\u0b8a\u0b8e\u0b90\u0b92\u0b95\u0b99\u0b9a\u0b9c\u0b9c\u0b9e\u0b9f"
7895        + "\u0ba3\u0ba4\u0ba8\u0baa\u0bae\u0bb5\u0bb7\u0bb9\u0bbe\u0bc2\u0bc6\u0bc8\u0bca\u0bcd"
7896        + "\u0bd7\u0bd7\u0be7\u0bef\u0c01\u0c03\u0c05\u0c0c\u0c0e\u0c10\u0c12\u0c28\u0c2a\u0c33"
7897        + "\u0c35\u0c39\u0c3e\u0c44\u0c46\u0c48\u0c4a\u0c4d\u0c55\u0c56\u0c60\u0c61\u0c66\u0c6f"
7898        + "\u0c82\u0c83\u0c85\u0c8c\u0c8e\u0c90\u0c92\u0ca8\u0caa\u0cb3\u0cb5\u0cb9\u0cbe\u0cc4"
7899        + "\u0cc6\u0cc8\u0cca\u0ccd\u0cd5\u0cd6\u0cde\u0cde\u0ce0\u0ce1\u0ce6\u0cef\u0d02\u0d03"
7900        + "\u0d05\u0d0c\u0d0e\u0d10\u0d12\u0d28\u0d2a\u0d39\u0d3e\u0d43\u0d46\u0d48\u0d4a\u0d4d"
7901        + "\u0d57\u0d57\u0d60\u0d61\u0d66\u0d6f\u0e01\u0e2e\u0e30\u0e3a\u0e40\u0e4e\u0e50\u0e59"
7902        + "\u0e81\u0e82\u0e84\u0e84\u0e87\u0e88\u0e8a\u0e8a\u0e8d\u0e8d\u0e94\u0e97\u0e99\u0e9f"
7903        + "\u0ea1\u0ea3\u0ea5\u0ea5\u0ea7\u0ea7\u0eaa\u0eab\u0ead\u0eae\u0eb0\u0eb9\u0ebb\u0ebd"
7904        + "\u0ec0\u0ec4\u0ec6\u0ec6\u0ec8\u0ecd\u0ed0\u0ed9\u0f18\u0f19\u0f20\u0f29\u0f35\u0f35"
7905        + "\u0f37\u0f37\u0f39\u0f39\u0f3e\u0f47\u0f49\u0f69\u0f71\u0f84\u0f86\u0f8b\u0f90\u0f95"
7906        + "\u0f97\u0f97\u0f99\u0fad\u0fb1\u0fb7\u0fb9\u0fb9\u10a0\u10c5\u10d0\u10f6\u1100\u1100"
7907        + "\u1102\u1103\u1105\u1107\u1109\u1109\u110b\u110c\u110e\u1112\u113c\u113c\u113e\u113e"
7908        + "\u1140\u1140\u114c\u114c\u114e\u114e\u1150\u1150\u1154\u1155\u1159\u1159\u115f\u1161"
7909        + "\u1163\u1163\u1165\u1165\u1167\u1167\u1169\u1169\u116d\u116e\u1172\u1173\u1175\u1175"
7910        + "\u119e\u119e\u11a8\u11a8\u11ab\u11ab\u11ae\u11af\u11b7\u11b8\u11ba\u11ba\u11bc\u11c2"
7911        + "\u11eb\u11eb\u11f0\u11f0\u11f9\u11f9\u1e00\u1e9b\u1ea0\u1ef9\u1f00\u1f15\u1f18\u1f1d"
7912        + "\u1f20\u1f45\u1f48\u1f4d\u1f50\u1f57\u1f59\u1f59\u1f5b\u1f5b\u1f5d\u1f5d\u1f5f\u1f7d"
7913        + "\u1f80\u1fb4\u1fb6\u1fbc\u1fbe\u1fbe\u1fc2\u1fc4\u1fc6\u1fcc\u1fd0\u1fd3\u1fd6\u1fdb"
7914        + "\u1fe0\u1fec\u1ff2\u1ff4\u1ff6\u1ffc\u20d0\u20dc\u20e1\u20e1\u2126\u2126\u212a\u212b"
7915        + "\u212e\u212e\u2180\u2182\u3005\u3005\u3007\u3007\u3021\u302f\u3031\u3035\u3041\u3094"
7916        + "\u3099\u309a\u309d\u309e\u30a1\u30fa\u30fc\u30fe\u3105\u312c\u4e00\u9fa5\uac00\ud7a3" + "";
7917
7918    private static final String JavaDoc LETTERS = "\u0041\u005a\u0061\u007a\u00c0\u00d6\u00d8\u00f6\u00f8\u0131\u0134\u013e\u0141\u0148"
7919        + "\u014a\u017e\u0180\u01c3\u01cd\u01f0\u01f4\u01f5\u01fa\u0217\u0250\u02a8\u02bb\u02c1"
7920        + "\u0386\u0386\u0388\u038a\u038c\u038c\u038e\u03a1\u03a3\u03ce\u03d0\u03d6\u03da\u03da"
7921        + "\u03dc\u03dc\u03de\u03de\u03e0\u03e0\u03e2\u03f3\u0401\u040c\u040e\u044f\u0451\u045c"
7922        + "\u045e\u0481\u0490\u04c4\u04c7\u04c8\u04cb\u04cc\u04d0\u04eb\u04ee\u04f5\u04f8\u04f9"
7923        + "\u0531\u0556\u0559\u0559\u0561\u0586\u05d0\u05ea\u05f0\u05f2\u0621\u063a\u0641\u064a"
7924        + "\u0671\u06b7\u06ba\u06be\u06c0\u06ce\u06d0\u06d3\u06d5\u06d5\u06e5\u06e6\u0905\u0939"
7925        + "\u093d\u093d\u0958\u0961\u0985\u098c\u098f\u0990\u0993\u09a8\u09aa\u09b0\u09b2\u09b2"
7926        + "\u09b6\u09b9\u09dc\u09dd\u09df\u09e1\u09f0\u09f1\u0a05\u0a0a\u0a0f\u0a10\u0a13\u0a28"
7927        + "\u0a2a\u0a30\u0a32\u0a33\u0a35\u0a36\u0a38\u0a39\u0a59\u0a5c\u0a5e\u0a5e\u0a72\u0a74"
7928        + "\u0a85\u0a8b\u0a8d\u0a8d\u0a8f\u0a91\u0a93\u0aa8\u0aaa\u0ab0\u0ab2\u0ab3\u0ab5\u0ab9"
7929        + "\u0abd\u0abd\u0ae0\u0ae0\u0b05\u0b0c\u0b0f\u0b10\u0b13\u0b28\u0b2a\u0b30\u0b32\u0b33"
7930        + "\u0b36\u0b39\u0b3d\u0b3d\u0b5c\u0b5d\u0b5f\u0b61\u0b85\u0b8a\u0b8e\u0b90\u0b92\u0b95"
7931        + "\u0b99\u0b9a\u0b9c\u0b9c\u0b9e\u0b9f\u0ba3\u0ba4\u0ba8\u0baa\u0bae\u0bb5\u0bb7\u0bb9"
7932        + "\u0c05\u0c0c\u0c0e\u0c10\u0c12\u0c28\u0c2a\u0c33\u0c35\u0c39\u0c60\u0c61\u0c85\u0c8c"
7933        + "\u0c8e\u0c90\u0c92\u0ca8\u0caa\u0cb3\u0cb5\u0cb9\u0cde\u0cde\u0ce0\u0ce1\u0d05\u0d0c"
7934        + "\u0d0e\u0d10\u0d12\u0d28\u0d2a\u0d39\u0d60\u0d61\u0e01\u0e2e\u0e30\u0e30\u0e32\u0e33"
7935        + "\u0e40\u0e45\u0e81\u0e82\u0e84\u0e84\u0e87\u0e88\u0e8a\u0e8a\u0e8d\u0e8d\u0e94\u0e97"
7936        + "\u0e99\u0e9f\u0ea1\u0ea3\u0ea5\u0ea5\u0ea7\u0ea7\u0eaa\u0eab\u0ead\u0eae\u0eb0\u0eb0"
7937        + "\u0eb2\u0eb3\u0ebd\u0ebd\u0ec0\u0ec4\u0f40\u0f47\u0f49\u0f69\u10a0\u10c5\u10d0\u10f6"
7938        + "\u1100\u1100\u1102\u1103\u1105\u1107\u1109\u1109\u110b\u110c\u110e\u1112\u113c\u113c"
7939        + "\u113e\u113e\u1140\u1140\u114c\u114c\u114e\u114e\u1150\u1150\u1154\u1155\u1159\u1159"
7940        + "\u115f\u1161\u1163\u1163\u1165\u1165\u1167\u1167\u1169\u1169\u116d\u116e\u1172\u1173"
7941        + "\u1175\u1175\u119e\u119e\u11a8\u11a8\u11ab\u11ab\u11ae\u11af\u11b7\u11b8\u11ba\u11ba"
7942        + "\u11bc\u11c2\u11eb\u11eb\u11f0\u11f0\u11f9\u11f9\u1e00\u1e9b\u1ea0\u1ef9\u1f00\u1f15"
7943        + "\u1f18\u1f1d\u1f20\u1f45\u1f48\u1f4d\u1f50\u1f57\u1f59\u1f59\u1f5b\u1f5b\u1f5d\u1f5d"
7944        + "\u1f5f\u1f7d\u1f80\u1fb4\u1fb6\u1fbc\u1fbe\u1fbe\u1fc2\u1fc4\u1fc6\u1fcc\u1fd0\u1fd3"
7945        + "\u1fd6\u1fdb\u1fe0\u1fec\u1ff2\u1ff4\u1ff6\u1ffc\u2126\u2126\u212a\u212b\u212e\u212e"
7946        + "\u2180\u2182\u3007\u3007\u3021\u3029\u3041\u3094\u30a1\u30fa\u3105\u312c\u4e00\u9fa5" + "\uac00\ud7a3";
7947
7948    private static final String JavaDoc DIGITS = "\u0030\u0039\u0660\u0669\u06F0\u06F9\u0966\u096F\u09E6\u09EF\u0A66\u0A6F\u0AE6\u0AEF"
7949        + "\u0B66\u0B6F\u0BE7\u0BEF\u0C66\u0C6F\u0CE6\u0CEF\u0D66\u0D6F\u0E50\u0E59\u0ED0\u0ED9" + "\u0F20\u0F29";
7950  }
7951
7952}
Popular Tags