RegularExpression


1   /*
2    * The Apache Software License, Version 1.1
3    *
4    *
5    * Copyright (c) 1999,2000 The Apache Software Foundation.  All rights 
6    * reserved.
7    *
8    * Redistribution and use in source and binary forms, with or without
9    * modification, are permitted provided that the following conditions
10   * are met:
11   *
12   * 1. Redistributions of source code must retain the above copyright
13   *    notice, this list of conditions and the following disclaimer. 
14   *
15   * 2. Redistributions in binary form must reproduce the above copyright
16   *    notice, this list of conditions and the following disclaimer in
17   *    the documentation and/or other materials provided with the
18   *    distribution.
19   *
20   * 3. The end-user documentation included with the redistribution,
21   *    if any, must include the following acknowledgment:  
22   *       "This product includes software developed by the
23   *        Apache Software Foundation (http://www.apache.org/)."
24   *    Alternately, this acknowledgment may appear in the software itself,
25   *    if and wherever such third-party acknowledgments normally appear.
26   *
27   * 4. The names "Xerces" and "Apache Software Foundation" must
28   *    not be used to endorse or promote products derived from this
29   *    software without prior written permission. For written 
30   *    permission, please contact apache@apache.org.
31   *
32   * 5. Products derived from this software may not be called "Apache",
33   *    nor may "Apache" appear in their name, without prior written
34   *    permission of the Apache Software Foundation.
35   *
36   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
37   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
39   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
40   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
42   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
43   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
44   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
45   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
46   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
47   * SUCH DAMAGE.
48   * ====================================================================
49   *
50   * This software consists of voluntary contributions made by many
51   * individuals on behalf of the Apache Software Foundation and was
52   * originally based on software copyright (c) 1999, International
53   * Business Machines, Inc., http://www.apache.org.  For more
54   * information on the Apache Software Foundation, please see
55   * <http://www.apache.org/>.
56   */
57  
58  package org.enhydra.apache.xerces.utils.regex;
59  
60  
61  import java.text.CharacterIterator  ;
62  
63  /**
64   * A regular expression matching engine using Non-deterministic Finite Automaton (NFA).
65   * This engine does not conform to the POSIX regular expression.
66   *
67   * <hr width="50%">
68   * <h3>How to use</h3>
69   *
70   * <dl>
71   *   <dt>A. Standard way
72   *   <dd>
73   * <pre>
74   * RegularExpression re = new RegularExpression(<var>regex</var>);
75   * if (re.matches(text)) { ... }
76   * </pre>
77   *
78   *   <dt>B. Capturing groups
79   *   <dd>
80   * <pre>
81   * RegularExpression re = new RegularExpression(<var>regex</var>);
82   * Match match = new Match();
83   * if (re.matches(text, match)) {
84   *     ... // You can refer to captured texts with methods of the <code>Match</code> class.
85   * }
86   * </pre>
87   *
88   * </dl>
89   *
90   * <h4>Case-insensitive matching</h4>
91   * <pre>
92   * RegularExpression re = new RegularExpression(<var>regex</var>, "i");
93   * if (re.matches(text)) { ... }
94   * </pre>
95   *
96   * <h4>Options</h4>
97   * <p>You can specify options to <a HREF="#RegularExpression(java.lang.String, java.lang.String)"><code>RegularExpression(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>
98   *    or <a HREF="#setPattern(java.lang.String, java.lang.String)"><code>setPattern(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>.
99   *    This <var>options</var> parameter consists of the following characters.
100  * </p>
101  * <dl>
102  *   <dt><a name="I_OPTION"><code>"i"</code></a>
103  *   <dd>This option indicates case-insensitive matching.
104  *   <dt><a name="M_OPTION"><code>"m"</code></a>
105  *   <dd class="REGEX"><kbd>^</kbd> and <kbd>$</kbd> consider the EOL characters within the text.
106  *   <dt><a name="S_OPTION"><code>"s"</code></a>
107  *   <dd class="REGEX"><kbd>.</kbd> matches any one character.
108  *   <dt><a name="U_OPTION"><code>"u"</code></a>
109  *   <dd class="REGEX">Redefines <Kbd>\d \D \w \W \s \S \b \B \&lt; \></kbd> according to Unicode.
110  *   <dt><a name="W_OPTION"><code>"w"</code></a>
111  *   <dd class="REGEX">By this option, <kbd>\b \B \&lt; \></kbd> are processed with the method of
112  *      'Unicode Regular Expression Guidelines' Revision 4.
113  *      When "w" and "u" are specified at the same time,
114  *      <kbd>\b \B \&lt; \></kbd> are processed for the "w" option.
115  *   <dt><a name="COMMA_OPTION"><code>","</code></a>
116  *   <dd>The parser treats a comma in a character class as a range separator.
117  *      <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option.
118  *      <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option.
119  *
120  *   <dt><a name="X_OPTION"><code>"X"</code></a>
121  *   <dd class="REGEX">
122  *       With this option, the engine conforms to <a HREF="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>.
123  *       The <code>matches()</code> method does not do substring matching
124  *       but entire string matching.
125  *
126  * </dl>
127  * 
128  * <hr width="50%">
129  * <h3>Syntax</h3>
130  * <table border="1" bgcolor="#ddeeff">
131  *   <tr>
132  *    <td>
133  *     <h4>Differences from the Perl 5 regular expression</h4>
134  *     <ul>
135  *      <li>There is a 6-digit hexadecimal character representation (<kbd>\v</kbd><var>HHHHHH</var>).
136  *      <li>Supports subtraction, union, and intersection operations for character classes.
137  *      <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations),
138  *          <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>,
139  *          <kbd>\ u</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>,
140  *          <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>,
141  *          <kbd>(?{</kbd><var>code</var><kbd>})</kbd>, <kbd>(??{</kbd><var>code</var><kbd>})</kbd>
142  *     </ul>
143  *    </td>
144  *   </tr>
145  * </table>
146  *
147  * <P>Meta characters are `<KBD>. * + ? { [ ( ) | \ ^ $</KBD>'.</P>
148  * <ul>
149  *   <li>Character
150  *     <dl>
151  *       <dt class="REGEX"><kbd>.</kbd> (A period)
152  *       <dd>Matches any one character except the following characters.
153  *       <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D),
154  *           PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028)
155  *       <dd>This expression matches one code point in Unicode. It can match a pair of surrogates.
156  *       <dd>When <a HREF="#S_OPTION">the "s" option</a> is specified,
157  *           it matches any character including the above four characters.
158  *
159  *       <dt class="REGEX"><Kbd>\e \f \n \r \t</kbd>
160  *       <dd>Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A),
161  *           CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009)
162  *
163  *       <dt class="REGEX"><kbd>\c</kbd><var>C</var>
164  *       <dd>Matches a control character.
165  *           The <var>C</var> must be one of '<kbd>@</kbd>', '<kbd>A</kbd>'-'<kbd>Z</kbd>',
166  *           '<kbd>[</kbd>', '<kbd>\</kbd>', '<kbd>]</kbd>', '<kbd>^</kbd>', '<kbd>_</kbd>'.
167  *           It matches a control character of which the character code is less than
168  *           the character code of the <var>C</var> by 0x0040.
169  *       <dd class="REGEX">For example, a <kbd>\cJ</kbd> matches a LINE FEED (U+000A),
170  *           and a <kbd>\c[</kbd> matches an ESCAPE (U+001B).
171  *
172  *       <dt class="REGEX">a non-meta character
173  *       <dd>Matches the character.
174  *
175  *       <dt class="REGEX"><KBD>\</KBD> + a meta character
176  *       <dd>Matches the meta character.
177  *
178  *       <dt class="REGEX"><kbd>\x</kbd><var>HH</var> <kbd>\x{</kbd><var>HHHH</var><kbd>}</kbd>
179  *       <dd>Matches a character of which code point is <var>HH</var> (Hexadecimal) in Unicode.
180  *           You can write just 2 digits for <kbd>\x</kbd><var>HH</var>, and
181  *           variable length digits for <kbd>\x{</kbd><var>HHHH</var><kbd>}</kbd>.
182  *
183  *       <!--
184  *       <dt class="REGEX"><kbd>\ u</kbd><var>HHHH</var>
185  *       <dd>Matches a character of which code point is <var>HHHH</var> (Hexadecimal) in Unicode.
186  *       -->
187  *
188  *       <dt class="REGEX"><kbd>\v</kbd><var>HHHHHH</var>
189  *       <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode.
190  *
191  *       <dt class="REGEX"><kbd>\g</kbd>
192  *       <dd>Matches a grapheme.
193  *       <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd>
194  *
195  *       <dt class="REGEX"><kbd>\X</kbd>
196  *       <dd class="REGEX">Matches a combining character sequence.
197  *       It is equivalent to <kbd>(?:\PM\pM*)</kbd>
198  *     </dl>
199  *   </li>
200  *
201  *   <li>Character class
202  *     <dl>
203 + *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without <a HREF="#COMMA_OPTION">"," option</a>)
204 + *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with <a HREF="#COMMA_OPTION">"," option</a>)
205  *       <dd>Positive character class.  It matches a character in ranges.
206  *       <dd><var>R<sub>n</sub></var>:
207  *       <ul>
208  *         <li class="REGEX">A character (including <Kbd>\e \f \n \r \t</kbd> <kbd>\x</kbd><var>HH</var> <kbd>\x{</kbd><var>HHHH</var><kbd>}</kbd> <!--kbd>\ u</kbd><var>HHHH</var--> <kbd>\v</kbd><var>HHHHHH</var>)
209  *             <p>This range matches the character.
210  *         <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var>
211  *             <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and &lt;= <var>C<sub>2</sub></var>'s code point.
212 + *         <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>,
213 + *             and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd>
214  *             <p>...
215  *         <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd>
216  *             <p>These expressions specify the same ranges as the following expressions.
217  *       </ul>
218  *       <p class="REGEX">Enumerated ranges are merged (union operation).
219  *          <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd>
220  *
221  *       <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a HREF="#COMMA_OPTION">"," option</a>)
222  *       <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a HREF="#COMMA_OPTION">"," option</a>)
223  *       <dd>Negative character class.  It matches a character not in ranges.
224  *
225  *       <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd>
226  *       (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.)
227  *       <dd>Subtraction or union or intersection for character classes.
228  *       <dd class="REGEX">For example, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>.
229  *       <dd>The result of these operations is a <u>positive character class</u>
230  *           even if an expression includes any negative character classes.
231  *           You have to take care with this in case-insensitive matching.
232  *           For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>,
233  *           which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching.
234  *           But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because
235  *           it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>'
236  *           though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>.
237  *
238  *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a HREF="#X_OPTION">"X" option</a>)</dt>
239  *       <dd>Character class subtraction for the XML Schema.
240  *           You can use this syntax when you specify an <a HREF="#X_OPTION">"X" option</a>.
241  *           
242  *       <dt class="REGEX"><kbd>\d</kbd>
243  *       <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>.
244  *       <dd>When <a HREF="#U_OPTION">a "u" option</a> is set, it is equivalent to
245  *           <span class="REGEX"><kbd>\p{Nd}</kbd></span>.
246  *
247  *       <dt class="REGEX"><kbd>\D</kbd>
248  *       <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd>
249  *       <dd>When <a HREF="#U_OPTION">a "u" option</a> is set, it is equivalent to
250  *           <span class="REGEX"><kbd>\P{Nd}</kbd></span>.
251  *
252  *       <dt class="REGEX"><kbd>\s</kbd>
253  *       <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd>
254  *       <dd>When <a HREF="#U_OPTION">a "u" option</a> is set, it is equivalent to
255  *           <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>.
256  *
257  *       <dt class="REGEX"><kbd>\S</kbd>
258  *       <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd>
259  *       <dd>When <a HREF="#U_OPTION">a "u" option</a> is set, it is equivalent to
260  *           <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>.
261  *
262  *       <dt class="REGEX"><kbd>\w</kbd>
263  *       <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd>
264  *       <dd>When <a HREF="#U_OPTION">a "u" option</a> is set, it is equivalent to
265  *           <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
266  *
267  *       <dt class="REGEX"><kbd>\W</kbd>
268  *       <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd>
269  *       <dd>When <a HREF="#U_OPTION">a "u" option</a> is set, it is equivalent to
270  *           <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
271  *
272  *       <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd>
273  *       <dd>Matches one character in the specified General Category (the second field in <a HREF="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a HREF="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>.
274  *       The following names are available:
275  *       <dl>
276  *         <dt>Unicode General Categories:
277  *         <dd><kbd>
278  *       L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs, Zl, Zp,
279  *       Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So,
280  *         </kbd>
281  *         <dd>(Currently the Cn category includes U+10000-U+10FFFF characters)
282  *         <dt>Unicode Blocks:
283  *         <dd><kbd>
284  *       Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B,
285  *       IPA Extensions, Spacing Modifier Letters, Combining Diacritical Marks, Greek,
286  *       Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi, Gujarati,
287  *       Oriya, Tamil, Telugu, Kannada, Malayalam, Thai, Lao, Tibetan, Georgian,
288  *       Hangul Jamo, Latin Extended Additional, Greek Extended, General Punctuation,
289  *       Superscripts and Subscripts, Currency Symbols, Combining Marks for Symbols,
290  *       Letterlike Symbols, Number Forms, Arrows, Mathematical Operators,
291  *       Miscellaneous Technical, Control Pictures, Optical Character Recognition,
292  *       Enclosed Alphanumerics, Box Drawing, Block Elements, Geometric Shapes,
293  *       Miscellaneous Symbols, Dingbats, CJK Symbols and Punctuation, Hiragana,
294  *       Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun,
295  *       Enclosed CJK Letters and Months, CJK Compatibility, CJK Unified Ideographs,
296  *       Hangul Syllables, High Surrogates, High Private Use Surrogates, Low Surrogates,
297  *       Private Use, CJK Compatibility Ideographs, Alphabetic Presentation Forms,
298  *       Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility Forms,
299  *       Small Form Variants, Arabic Presentation Forms-B, Specials,
300  *       Halfwidth and Fullwidth Forms
301  *         </kbd>
302  *         <dt>Others:
303  *         <dd><kbd>ALL</kbd> (Equivalent to <kbd>[\ u0000-\v10FFFF]</kbd>)
304  *         <dd><kbd>ASSIGNED</kbd> (<kbd>\p{ASSIGNED}</kbd> is equivalent to <kbd>\P{Cn}</kbd>)
305  *         <dd><kbd>UNASSIGNED</kbd>
306  *             (<kbd>\p{UNASSIGNED}</kbd> is equivalent to <kbd>\p{Cn}</kbd>)
307  *       </dl>
308  *
309  *       <dt class="REGEX"><kbd>\P{</kbd><var>name</var><kbd>}</kbd>
310  *       <dd>Matches one character not in the specified General Category or the specified Block.
311  *     </dl>
312  *   </li>
313  *
314  *   <li>Selection and Quantifier
315  *     <dl>
316  *       <dt class="REGEX"><VAR>X</VAR><kbd>|</kbd><VAR>Y</VAR>
317  *       <dd>...
318  *
319  *       <dt class="REGEX"><VAR>X</VAR><kbd>*</KBD>
320  *       <dd>Matches 0 or more <var>X</var>.
321  *
322  *       <dt class="REGEX"><VAR>X</VAR><kbd>+</KBD>
323  *       <dd>Matches 1 or more <var>X</var>.
324  *
325  *       <dt class="REGEX"><VAR>X</VAR><kbd>?</KBD>
326  *       <dd>Matches 0 or 1 <var>X</var>.
327  *
328  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>number</var><kbd>}</kbd>
329  *       <dd>Matches <var>number</var> times.
330  *
331  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}</kbd>
332  *       <dd>...
333  *
334  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}</kbd>
335  *       <dd>...
336  *
337  *       <dt class="REGEX"><VAR>X</VAR><kbd>*?</kbd>
338  *       <dt class="REGEX"><VAR>X</VAR><kbd>+?</kbd>
339  *       <dt class="REGEX"><VAR>X</VAR><kbd>??</kbd>
340  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}?</kbd>
341  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}?</kbd>
342  *       <dd>Non-greedy matching.
343  *     </dl>
344  *   </li>
345  *
346  *   <li>Grouping, Capturing, and Back-reference
347  *     <dl>
348  *       <dt class="REGEX"><KBD>(?:</kbd><VAR>X</VAR><kbd>)</KBD>
349  *       <dd>Grouping. "<KBD>foo+</KBD>" matches "<KBD>foo</KBD>" or "<KBD>foooo</KBD>".
350  *       If you want it to match "<KBD>foofoo</KBD>" or "<KBD>foofoofoo</KBD>",
351  *       you have to write "<KBD>(?:foo)+</KBD>".
352  *
353  *       <dt class="REGEX"><KBD>(</kbd><VAR>X</VAR><kbd>)</KBD>
354  *       <dd>Grouping with capturing.
355  * It makes a group, and applications can find out
356  * where in the target text a group matched with methods of a <code>Match</code> instance
357  * after <code><a HREF="#matches(java.lang.String, org.enhydra.apache.xerces.utils.regex.Match)">matches(String,Match)</a></code>.
358  * The 0th group means the whole of this regular expression.
359  * The <VAR>N</VAR>th group is the inside of the <VAR>N</VAR>th left parenthesis.
360  * 
361  *   <p>For instance, a regular expression is
362  *   "<FONT color=blue><KBD> *([^&lt;:]*) +&lt;([^&gt;]*)&gt; *</KBD></FONT>"
363  *   and target text is
364  *   "<FONT color=red><KBD>From: TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>":
365  *   <ul>
366  *     <li><code>Match.getCapturedText(0)</code>:
367  *     "<FONT color=red><KBD> TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>"
368  *     <li><code>Match.getCapturedText(1)</code>: "<FONT color=red><KBD>TAMURA Kent</KBD></FONT>"
369  *     <li><code>Match.getCapturedText(2)</code>: "<FONT color=red><KBD>kent@trl.ibm.co.jp</KBD></FONT>"
370  *   </ul>
371  *
372  *       <dt class="REGEX"><kbd>\1 \2 \3 \4 \5 \6 \7 \8 \9</kbd>
373  *       <dd>
374  *
375  *       <dt class="REGEX"><kbd>(?></kbd><var>X</var><kbd>)</kbd>
376  *       <dd>Independent expression group. ................
377  *
378  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
379  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
380  *       <dd>............................
381  *       <dd>The <var>options</var> or the <var>options2</var> consist of 'i' 'm' 's' 'w'.
382  *           Note that it can not contain 'u'.
383  *
384  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>)</kbd>
385  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>)</kbd>
386  *       <dd>......
387  *       <dd>These expressions must be at the beginning of a group.
388  *     </dl>
389  *   </li>
390  *
391  *   <li>Anchor
392  *     <dl>
393  *       <dt class="REGEX"><kbd>\A</kbd>
394  *       <dd>Matches the beginning of the text.
395  *
396  *       <dt class="REGEX"><kbd>\Z</kbd>
397  *       <dd>Matches the end of the text, or before an EOL character at the end of the text,
398  *           or CARRIAGE RETURN + LINE FEED at the end of the text.
399  *
400  *       <dt class="REGEX"><kbd>\z</kbd>
401  *       <dd>Matches the end of the text.
402  *
403  *       <dt class="REGEX"><kbd>^</kbd>
404  *       <dd>Matches the beginning of the text.  It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>.
405  *       <dd>When <a HREF="#M_OPTION">a "m" option</a> is set,
406  *           it matches the beginning of the text, or after one of EOL characters (
407  *           LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028),
408  *           PARAGRAPH SEPARATOR (U+2029).)
409  *
410  *       <dt class="REGEX"><kbd>$</kbd>
411  *       <dd>Matches the end of the text, or before an EOL character at the end of the text,
412  *           or CARRIAGE RETURN + LINE FEED at the end of the text.
413  *       <dd>When <a HREF="#M_OPTION">a "m" option</a> is set,
414  *           it matches the end of the text, or before an EOL character.
415  *
416  *       <dt class="REGEX"><kbd>\b</kbd>
417  *       <dd>Matches word boundary.
418  *           (See <a HREF="#W_OPTION">a "w" option</a>)
419  *
420  *       <dt class="REGEX"><kbd>\B</kbd>
421  *       <dd>Matches non word boundary.
422  *           (See <a HREF="#W_OPTION">a "w" option</a>)
423  *
424  *       <dt class="REGEX"><kbd>\&lt;</kbd>
425  *       <dd>Matches the beginning of a word.
426  *           (See <a HREF="#W_OPTION">a "w" option</a>)
427  *
428  *       <dt class="REGEX"><kbd>\&gt;</kbd>
429  *       <dd>Matches the end of a word.
430  *           (See <a HREF="#W_OPTION">a "w" option</a>)
431  *     </dl>
432  *   </li>
433  *   <li>Lookahead and lookbehind
434  *     <dl>
435  *       <dt class="REGEX"><kbd>(?=</kbd><var>X</var><kbd>)</kbd>
436  *       <dd>Lookahead.
437  *
438  *       <dt class="REGEX"><kbd>(?!</kbd><var>X</var><kbd>)</kbd>
439  *       <dd>Negative lookahead.
440  *
441  *       <dt class="REGEX"><kbd>(?&lt;=</kbd><var>X</var><kbd>)</kbd>
442  *       <dd>Lookbehind.
443  *       <dd>(Note for text capturing......)
444  *
445  *       <dt class="REGEX"><kbd>(?&lt;!</kbd><var>X</var><kbd>)</kbd>
446  *       <dd>Negative lookbehind.
447  *     </dl>
448  *   </li>
449  *
450  *   <li>Misc.
451  *     <dl>
452  *       <dt class="REGEX"><kbd>(?(</Kbd><var>condition</var><Kbd>)</kbd><var>yes-pattern</var><kbd>|</kbd><var>no-pattern</var><kbd>)</kbd>,
453  *       <dt class="REGEX"><kbd>(?(</kbd><var>condition</var><kbd>)</kbd><var>yes-pattern</var><kbd>)</kbd>
454  *       <dd>......
455  *       <dt class="REGEX"><kbd>(?#</kbd><var>comment</var><kbd>)</kbd>
456  *       <dd>Comment.  A comment string consists of characters except '<kbd>)</kbd>'.
457  *           You can not write comments in character classes and before quantifiers.
458  *     </dl>
459  *   </li>
460  * </ul>
461  *
462  *
463  * <hr width="50%">
464  * <h3>BNF for the regular expression</h3>
465  * <pre>
466  * regex ::= ('(?' options ')')? term ('|' term)*
467  * term ::= factor+
468  * factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )?
469  *            | '(?#' [^)]* ')'
470  * minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}'
471  * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
472  *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X'
473  *          | '(?>' regex ')' | '(?' options ':' regex ')'
474  *          | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')'
475  * options ::= [imsw]* ('-' [imsw]+)?
476  * anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\&lt;' | '\>'
477  * looks ::= '(?=' regex ')'  | '(?!' regex ')'
478  *           | '(?&lt;=' regex ')' | '(?&lt;!' regex ')'
479  * char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1
480  * category-block ::= '\' [pP] category-symbol-1
481  *                    | ('\p{' | '\P{') (category-symbol | block-name
482  *                                       | other-properties) '}'
483  * category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S'
484  * category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | 'Lo'
485  *                     | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No'
486  *                     | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs'
487  *                     | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po'
488  *                     | 'Sm' | 'Sc' | 'Sk' | 'So'
489  * block-name ::= (See above)
490  * other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED'
491  * character-1 ::= (any character except meta-characters)
492  *
493  * char-class ::= '[' ranges ']'
494  *                | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')'
495  * ranges ::= '^'? (range <a HREF="#COMMA_OPTION">','?</a>)+
496  * range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block
497  *           | range-char | range-char '-' range-char
498  * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2
499  * code-point ::= '\x' hex-char hex-char
500  *                | '\x{' hex-char+ '}'
501  * <!--               | '\ u' hex-char hex-char hex-char hex-char
502  * -->               | '\v' hex-char hex-char hex-char hex-char hex-char hex-char
503  * hex-char ::= [0-9a-fA-F]
504  * character-2 ::= (any character except \[]-,)
505  * </pre>
506  *
507  * <hr width="50%">
508  * <h3>TODO</h3>
509  * <ul>
510  *   <li><a HREF="http://www.unicode.org/unicode/reports/tr18/">Unicode Regular Expression Guidelines</a>
511  *     <ul>
512  *       <li>2.4 Canonical Equivalents
513  *       <li>Level 3
514  *     </ul>
515  *   <li>Parsing performance
516  * </ul>
517  *
518  * <hr width="50%">
519  *
520  * @author TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;
521  */
522 public class RegularExpression implements java.io.Serializable   {
523     static final boolean DEBUG = false; // NOTE(review): presumably a compile-time switch for debug tracing; no use is visible in this part of the file -- confirm
524 
525     /**
526      * Builds the operation flow for a token tree; a no-op once <code>operations</code> is set.
527      */
528     private synchronized void compile(Token tok) {
529         if (operations == null) {
530             numberOfClosures = 0;
531             operations = compile(tok, null, false);
532         }
533     }
534 
535     /**
536      * Converts a token to an operation.
537      */
538     private Op compile(Token tok, Op next, boolean reverse) {
539         Op ret;
540         switch (tok.type) {
541         case Token.DOT:
542             ret = Op.createDot();
543             ret.next = next;
544             break;
545 
546         case Token.CHAR:
547             ret = Op.createChar(tok.getChar());
548             ret.next = next;
549             break;
550 
551         case Token.ANCHOR:
552             ret = Op.createAnchor(tok.getChar());
553             ret.next = next;
554             break;
555 
556         case Token.RANGE:
557         case Token.NRANGE:
558             ret = Op.createRange(tok);
559             ret.next = next;
560             break;
561 
562         case Token.CONCAT:
563             ret = next;
564             if (!reverse) {
565                 for (int i = tok.size()-1;  i >= 0;  i --) {
566                     ret = compile(tok.getChild(i), ret, false);
567                 }
568             } else {
569                 for (int i = 0;  i < tok.size();  i ++) {
570                     ret = compile(tok.getChild(i), ret, true);
571                 }
572             }
573             break;
574 
575         case Token.UNION:
576             Op.UnionOp uni = Op.createUnion(tok.size());
577             for (int i = 0;  i < tok.size();  i ++) {
578                 uni.addElement(compile(tok.getChild(i), next, reverse));
579             }
580             ret = uni;                          // ret.next is null.
581             break;
582 
583         case Token.CLOSURE:
584         case Token.NONGREEDYCLOSURE:
585             Token child = tok.getChild(0);
586             int min = tok.getMin();
587             int max = tok.getMax();
588             if (min >= 0 && min == max) { // {n}
589                 ret = next;
590                 for (int i = 0; i < min;  i ++) {
591                     ret = compile(child, ret, reverse);
592                 }
593                 break;
594             }
595             if (min > 0 && max > 0)
596                 max -= min;
597             if (max > 0) {
598                 // X{2,6} -> XX(X(X(XX?)?)?)?
599                 ret = next;
600                 for (int i = 0;  i < max;  i ++) {
601                     Op.ChildOp q = Op.createQuestion(tok.type == Token.NONGREEDYCLOSURE);
602                     q.next = next;
603                     q.setChild(compile(child, ret, reverse));
604                     ret = q;
605                 }
606             } else {
607                 Op.ChildOp op;
608                 if (tok.type == Token.NONGREEDYCLOSURE) {
609                     op = Op.createNonGreedyClosure();
610                 } else {                        // Token.CLOSURE
611                     if (child.getMinLength() == 0)
612                         op = Op.createClosure(this.numberOfClosures++);
613                     else
614                         op = Op.createClosure(-1);
615                 }
616                 op.next = next;
617                 op.setChild(compile(child, op, reverse));
618                 ret = op;
619             }
620             if (min > 0) {
621                 for (int i = 0;  i < min;  i ++) {
622                     ret = compile(child, ret, reverse);
623                 }
624             }
625             break;
626 
627         case Token.EMPTY:
628             ret = next;
629             break;
630 
631         case Token.STRING:
632             ret = Op.createString(tok.getString());
633             ret.next = next;
634             break;
635 
636         case Token.BACKREFERENCE:
637             ret = Op.createBackReference(tok.getReferenceNumber());
638             ret.next = next;
639             break;
640 
641         case Token.PAREN:
642             if (tok.getParenNumber() == 0) {
643                 ret = compile(tok.getChild(0), next, reverse);
644             } else if (reverse) {
645                 next = Op.createCapture(tok.getParenNumber(), next);
646                 next = compile(tok.getChild(0), next, reverse);
647                 ret = Op.createCapture(-tok.getParenNumber(), next);
648             } else {
649                 next = Op.createCapture(-tok.getParenNumber(), next);
650                 next = compile(tok.getChild(0), next, reverse);
651                 ret = Op.createCapture(tok.getParenNumber(), next);
652             }
653             break;
654 
655         case Token.LOOKAHEAD:
656             ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok.getChild(0), null, false));
657             break;
658         case Token.NEGATIVELOOKAHEAD:
659             ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok.getChild(0), null, false));
660             break;
661         case Token.LOOKBEHIND:
662             ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok.getChild(0), null, true));
663             break;
664         case Token.NEGATIVELOOKBEHIND:
665             ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(tok.getChild(0), null, true));
666             break;
667 
668         case Token.INDEPENDENT:
669             ret = Op.createIndependent(next, compile(tok.getChild(0), null, reverse));
670             break;
671 
672         case Token.MODIFIERGROUP:
673             ret = Op.createModifier(next, compile(tok.getChild(0), null, reverse),
674                                     ((Token.ModifierToken)tok).getOptions(),
675                                     ((Token.ModifierToken)tok).getOptionsMask());
676             break;
677 
678         case Token.CONDITION:
679             Token.ConditionToken ctok = (Token.ConditionToken)tok;
680             int ref = ctok.refNumber;
681             Op condition = ctok.condition == null ? null : compile(ctok.condition, null, reverse);
682             Op yes = compile(ctok.yes, next, reverse);
683             Op no = ctok.no == null ? null : compile(ctok.no, next, reverse);
684             ret = Op.createCondition(next, ref, condition, yes, no);
685             break;
686 
687         default:
688             throw new RuntimeException  ("Unknown token type: "+tok.type);
689         } // switch (tok.type)
690         return ret;
691     }
692 
693 
694 //Public
695 
696     /**
         * Checks whether the whole <var>target</var> array <strong>contains</strong> this pattern.
         * Delegates to {@link #matches(char[], int, int, Match)} over the range
         * 0 to <code>target.length</code> without a Match instance.
         *
         * @param target Characters to be searched.
         * @return true if the target is matched to this regular expression.
         */
700     public boolean matches(char[]  target) {
701         final Match noResult = null;
702         return this.matches(target, 0, target.length, noResult);
703     }
704 
705     /**
         * Checks whether the given range of <var>target</var> <strong>contains</strong>
         * this pattern. Delegates to {@link #matches(char[], int, int, Match)}
         * without a Match instance.
         *
         * @param target Characters to be searched.
         * @param start Start offset of the range.
         * @param end  End offset +1 of the range.
         * @return true if the target is matched to this regular expression.
         */
712     public boolean matches(char[]  target, int start, int end) {
713         final Match noResult = null;
714         return this.matches(target, start, end, noResult);
715     }
716 
717     /**
         * Checks whether the whole <var>target</var> array <strong>contains</strong> this pattern.
         * Delegates to {@link #matches(char[], int, int, Match)} over the range
         * 0 to <code>target.length</code>.
         *
         * @param target Characters to be searched.
         * @param match A Match instance for storing matching result.
         * @return true if the target is matched to this regular expression.
         */
722     public boolean matches(char[]  target, Match match) {
723         final int wholeLength = target.length;
724         return this.matches(target, 0, wholeLength, match);
725     }
726 
727 
728     /**
729      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
730      * in specified range or not.
         * <p>When the <code>XMLSCHEMA_MODE</code> option is set, a single attempt is made at the
         * context's start offset, and it succeeds only if it ends exactly at the context's limit.
731      *
         * @param target Characters to be searched.
732      * @param start Start offset of the range.
733      * @param end  End offset +1 of the range.
734      * @param match A Match instance for storing matching result; may be null.
735      * @return true if the target is matched to this regular expression.
736      */
737     public boolean matches(char[]  target, int start, int end, Match match) {
738 
739         synchronized (this) {
                // Lazily prepare the operation list and create the shared matching context.
740             if (this.operations == null)
741                 this.prepare();
742             if (this.context == null)
743                 this.context = new Context();
744         }
745         Context con = null;
746         synchronized (this.context) {
                // Use a throw-away context while the shared one is flagged as in use.
747             con = this.context.inuse ? new Context() : this.context;
748             con.reset(target, start, end, this.numberOfClosures);
749         }
750         if (match != null) {
751             match.setNumberOfGroups(this.nofparen);
752             match.setSource(target);
753         } else if (this.hasBackReferences) {
754             match = new Match();
755             match.setNumberOfGroups(this.nofparen);
756             // Need not to call setSource() because
757             // a caller can not access this match instance.
758         }
759         con.match = match;
760 
761         if (isSet(this.options, XMLSCHEMA_MODE)) {
762             int matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options);
763             //System.err.println("DEBUG: matchEnd="+matchEnd);
764             if (matchEnd == con.limit) {
765                 if (con.match != null) {
766                     con.match.setBeginning(0, con.start);
767                     con.match.setEnd(0, matchEnd);
768                 }
769                 con.inuse = false;
770                 return true;
771             }
                // Release the context on a non-match as well, like every other exit of this
                // method; previously this path left the context flagged as in use.
                con.inuse = false;
772             return false;
773         }
774 
775         /*
776          * The pattern has only fixed string.
777          * The engine uses Boyer-Moore.
778          */
779         if (this.fixedStringOnly) {
780             //System.err.println("DEBUG: fixed-only: "+this.fixedString);
781             int o = this.fixedStringTable.matches(target, con.start, con.limit);
782             if (o >= 0) {
783                 if (con.match != null) {
784                     con.match.setBeginning(0, o);
785                     con.match.setEnd(0, o+this.fixedString.length());
786                 }
787                 con.inuse = false;
788                 return true;
789             }
790             con.inuse = false;
791             return false;
792         }
793 
794         /*
795          * The pattern contains a fixed string.
796          * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
797          * If not, it return with false.
798          */
799         if (this.fixedString != null) {
800             int o = this.fixedStringTable.matches(target, con.start, con.limit);
801             if (o < 0) {
802                 //System.err.println("Non-match in fixed-string search.");
803                 con.inuse = false;
804                 return false;
805             }
806         }
807 
            // Last start position worth trying: a match needs at least minlength characters.
808         int limit = con.limit-this.minlength;
809         int matchStart;
810         int matchEnd = -1;
811 
812         /*
813          * Checks whether the expression starts with ".*".
814          */
815         if (this.operations != null
816             && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
817             if (isSet(this.options, SINGLE_LINE)) {
818                 matchStart = con.start;
819                 matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options);
820             } else {
                    // Only positions that begin a line need to be tried.
821                 boolean previousIsEOL = true;
822                 for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
                        // matchStart reaches con.limit when minlength is 0; there is no character
                        // to read at that position, so it is treated as a non-EOL position
                        // instead of indexing past the range.
823                     boolean atEOL = matchStart < con.limit && isEOLChar( target [  matchStart ] );
824                     if (atEOL) {
825                         previousIsEOL = true;
826                     } else {
827                         if (previousIsEOL) {
828                             if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
829                                                                        matchStart, 1, this.options)))
830                                 break;
831                         }
832                         previousIsEOL = false;
833                     }
834                 }
835             }
836         }
837 
838         /*
839          * Optimization against the first character.
840          */
841         else if (this.firstChar != null) {
842             //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
                // NOTE(review): both loops below read target[matchStart] for every
                // matchStart <= limit; presumably minlength >= 1 whenever firstChar is
                // set, so the index stays below con.limit -- confirm.
843             RangeToken range = this.firstChar;
844             if (isSet(this.options, IGNORE_CASE)) {
845                 range = this.firstChar.getCaseInsensitiveToken();
846                 for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
847                     int ch =  target [  matchStart ] ;
848                     if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
849                         ch = REUtil.composeFromSurrogates(ch,  target [  matchStart+1 ] );
850                         if (!range.match(ch))  continue;
851                     } else {
852                         if (!range.match(ch)) {
                                // Retry with the upper-case and then the lower-case form.
853                             char ch1 = Character.toUpperCase((char)ch);
854                             if (!range.match(ch1))
855                                 if (!range.match(Character.toLowerCase(ch1)))
856                                     continue;
857                         }
858                     }
859                     if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
860                                                                matchStart, 1, this.options)))
861                         break;
862                 }
863             } else {
864                 for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
865                     int ch =  target [  matchStart ] ;
866                     if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
867                         ch = REUtil.composeFromSurrogates(ch,  target [  matchStart+1 ] );
868                     if (!range.match(ch))  continue;
869                     if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
870                                                                matchStart, 1, this.options)))
871                         break;
872                 }
873             }
874         }
875 
876         /*
877          * Straightforward matching.
878          */
879         else {
880             for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
881                 if (0 <= (matchEnd = this. matchCharArray (con, this.operations, matchStart, 1, this.options)))
882                     break;
883             }
884         }
885 
886         if (matchEnd >= 0) {
                // Group 0 records the span of the whole match.
887             if (con.match != null) {
888                 con.match.setBeginning(0, matchStart);
889                 con.match.setEnd(0, matchEnd);
890             }
891             con.inuse = false;
892             return true;
893         } else {
894             con.inuse = false;
895             return false;
896         }
897     }
898 
899 /**
900  * @return -1 when not match; offset of the end of matched string when match.
901  */
902     private int matchCharArray (Context con, Op op, int offset, int dx, int opts) {
903 
904         char[] target = con.charTarget;
905 
906 
907         while (true) {
908             if (op == null)
909                 return offset;
910             if (offset > con.limit || offset < con.start)
911                 return -1;
912             switch (op.type) {
913             case Op.CHAR:
914                 if (isSet(opts, IGNORE_CASE)) {
915                     int ch = op.getData();
916                     if (dx > 0) {
917                         if (offset >= con.limit || !matchIgnoreCase(ch,  target [  offset ] ))
918                             return -1;
919                         offset ++;
920                     } else {
921                         int o1 = offset-1;
922                         if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch,  target [  o1 ] ))
923                             return -1;
924                         offset = o1;
925                     }
926                 } else {
927                     int ch = op.getData();
928                     if (dx > 0) {
929                         if (offset >= con.limit || ch !=  target [  offset ] )
930                             return -1;
931                         offset ++;
932                     } else {
933                         int o1 = offset-1;
934                         if (o1 >= con.limit || o1 < 0 || ch !=  target [  o1 ] )
935                             return -1;
936                         offset = o1;
937                     }
938                 }
939                 op = op.next;
940                 break;
941 
942             case Op.DOT:
943                 if (dx > 0) {
944                     if (offset >= con.limit)
945                         return -1;
946                     int ch =  target [  offset ] ;
947                     if (isSet(opts, SINGLE_LINE)) {
948                         if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
949                             offset ++;
950                     } else {
951                         if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
952                             ch = REUtil.composeFromSurrogates(ch,  target [  ++offset ] );
953                         if (isEOLChar(ch))
954                             return -1;
955                     }
956                     offset ++;
957                 } else {
958                     int o1 = offset-1;
959                     if (o1 >= con.limit || o1 < 0)
960                         return -1;
961                     int ch =  target [  o1 ] ;
962                     if (isSet(opts, SINGLE_LINE)) {
963                         if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
964                             o1 --;
965                     } else {
966                         if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
967                             ch = REUtil.composeFromSurrogates( target [  --o1 ] , ch);
968                         if (!isEOLChar(ch))
969                             return -1;
970                     }
971                     offset = o1;
972                 }
973                 op = op.next;
974                 break;
975 
976             case Op.RANGE:
977             case Op.NRANGE:
978                 if (dx > 0) {
979                     if (offset >= con.limit)
980                         return -1;
981                     int ch =  target [  offset ] ;
982                     if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
983                         ch = REUtil.composeFromSurrogates(ch,  target [  ++offset ] );
984                     RangeToken tok = op.getToken();
985                     if (isSet(opts, IGNORE_CASE)) {
986                         tok = tok.getCaseInsensitiveToken();
987                         if (!tok.match(ch)) {
988                             if (ch >= 0x10000)  return -1;
989                             char uch;
990                             if (!tok.match(uch = Character.toUpperCase((char)ch))
991                                 && !tok.match(Character.toLowerCase(uch)))
992                                 return -1;
993                         }
994                     } else {
995                         if (!tok.match(ch))  return -1;
996                     }
997                     offset ++;
998                 } else {
999                     int o1 = offset-1;
1000                    if (o1 >= con.limit || o1 < 0)
1001                        return -1;
1002                    int ch =  target [  o1 ] ;
1003                    if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
1004                        ch = REUtil.composeFromSurrogates( target [  --o1 ] , ch);
1005                    RangeToken tok = op.getToken();
1006                    if (isSet(opts, IGNORE_CASE)) {
1007                        tok = tok.getCaseInsensitiveToken();
1008                        if (!tok.match(ch)) {
1009                            if (ch >= 0x10000)  return -1;
1010                            char uch;
1011                            if (!tok.match(uch = Character.toUpperCase((char)ch))
1012                                && !tok.match(Character.toLowerCase(uch)))
1013                                return -1;
1014                        }
1015                    } else {
1016                        if (!tok.match(ch))  return -1;
1017                    }
1018                    offset = o1;
1019                }
1020                op = op.next;
1021                break;
1022
1023            case Op.ANCHOR:
1024                boolean go = false;
1025                switch (op.getData()) {
1026                case '^':
1027                    if (isSet(opts, MULTIPLE_LINES)) {
1028                        if (!(offset == con.start
1029                              || offset > con.start && isEOLChar( target [  offset-1 ] )))
1030                            return -1;
1031                    } else {
1032                        if (offset != con.start)
1033                            return -1;
1034                    }
1035                    break;
1036
1037                case '@':                         // Internal use only.
1038                    // The @ always matches line beginnings.
1039                    if (!(offset == con.start
1040                          || offset > con.start && isEOLChar( target [  offset-1 ] )))
1041                        return -1;
1042                    break;
1043
1044                case '$':
1045                    if (isSet(opts, MULTIPLE_LINES)) {
1046                        if (!(offset == con.limit
1047                              || offset < con.limit && isEOLChar( target [  offset ] )))
1048                            return -1;
1049                    } else {
1050                        if (!(offset == con.limit
1051                              || offset+1 == con.limit && isEOLChar( target [  offset ] )
1052                              || offset+2 == con.limit &&  target [  offset ]  == CARRIAGE_RETURN
1053                              &&  target [  offset+1 ]  == LINE_FEED))
1054                            return -1;
1055                    }
1056                    break;
1057
1058                case 'A':
1059                    if (offset != con.start)  return -1;
1060                    break;
1061
1062                case 'Z':
1063                    if (!(offset == con.limit
1064                          || offset+1 == con.limit && isEOLChar( target [  offset ] )
1065                          || offset+2 == con.limit &&  target [  offset ]  == CARRIAGE_RETURN
1066                          &&  target [  offset+1 ]  == LINE_FEED))
1067                        return -1;
1068                    break;
1069
1070                case 'z':
1071                    if (offset != con.limit)  return -1;
1072                    break;
1073
1074                case 'b':
1075                    if (con.length == 0)  return -1;
1076                    {
1077                        int after = getWordType(target, con.start, con.limit, offset, opts);
1078                        if (after == WT_IGNORE)  return -1;
1079                        int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
1080                        if (after == before)  return -1;
1081                    }
1082                    break;
1083
1084                case 'B':
1085                    if (con.length == 0)
1086                        go = true;
1087                    else {
1088                        int after = getWordType(target, con.start, con.limit, offset, opts);
1089                        go = after == WT_IGNORE
1090                             || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
1091                    }
1092                    if (!go)  return -1;
1093                    break;
1094
1095                case '<':
1096                    if (con.length == 0 || offset == con.limit)  return -1;
1097                    if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
1098                        || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
1099                        return -1;
1100                    break;
1101
1102                case '>':
1103                    if (con.length == 0 || offset == con.start)  return -1;
1104                    if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
1105                        || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
1106                        return -1;
1107                    break;
1108                } // switch anchor type
1109                op = op.next;
1110                break;
1111
1112            case Op.BACKREFERENCE:
1113                {
1114                    int refno = op.getData();
1115                    if (refno <= 0 || refno >= this.nofparen)
1116                        throw new RuntimeException  ("Internal Error: Reference number must be more than zero: "+refno);
1117                    if (con.match.getBeginning(refno) < 0
1118                        || con.match.getEnd(refno) < 0)
1119                        return -1;                // ********
1120                    int o2 = con.match.getBeginning(refno);
1121                    int literallen = con.match.getEnd(refno)-o2;
1122                    if (!isSet(opts, IGNORE_CASE)) {
1123                        if (dx > 0) {
1124                            if (!regionMatches(target, offset, con.limit, o2, literallen))
1125                                return -1;
1126                            offset += literallen;
1127                        } else {
1128                            if (!regionMatches(target, offset-literallen, con.limit, o2, literallen))
1129                                return -1;
1130                            offset -= literallen;
1131                        }
1132                    } else {
1133                        if (dx > 0) {
1134                            if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen))
1135                                return -1;
1136                            offset += literallen;
1137                        } else {
1138                            if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
1139                                                         o2, literallen))
1140                                return -1;
1141                            offset -= literallen;
1142                        }
1143                    }
1144                }
1145                op = op.next;
1146                break;
1147            case Op.STRING:
1148                {
1149                    String   literal = op.getString();
1150                    int literallen = literal.length();
1151                    if (!isSet(opts, IGNORE_CASE)) {
1152                        if (dx > 0) {
1153                            if (!regionMatches(target, offset, con.limit, literal, literallen))
1154                                return -1;
1155                            offset += literallen;
1156                        } else {
1157                            if (!regionMatches(target, offset-literallen, con.limit, literal, literallen))
1158                                return -1;
1159                            offset -= literallen;
1160                        }
1161                    } else {
1162                        if (dx > 0) {
1163                            if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen))
1164                                return -1;
1165                            offset += literallen;
1166                        } else {
1167                            if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
1168                                                         literal, literallen))
1169                                return -1;
1170                            offset -= literallen;
1171                        }
1172                    }
1173                }
1174                op = op.next;
1175                break;
1176
1177            case Op.CLOSURE:
1178                {
1179                    /*
1180                     * Saves current position to avoid
1181                     * zero-width repeats.
1182                     */
1183                    int id = op.getData();
1184                    if (id >= 0) {
1185                        int previousOffset = con.offsets[id];
1186                        if (previousOffset < 0 || previousOffset != offset) {
1187                            con.offsets[id] = offset;
1188                        } else {
1189                            con.offsets[id] = -1;
1190                            op = op.next;
1191                            break;
1192                        }
1193                    }
1194
1195                    int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
1196                    if (id >= 0)  con.offsets[id] = -1;
1197                    if (ret >= 0)  return ret;
1198                    op = op.next;
1199                }
1200                break;
1201
1202            case Op.QUESTION:
1203                {
1204                    int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
1205                    if (ret >= 0)  return ret;
1206                    op = op.next;
1207                }
1208                break;
1209
1210            case Op.NONGREEDYCLOSURE:
1211            case Op.NONGREEDYQUESTION:
1212                {
1213                    int ret = this. matchCharArray (con, op.next, offset, dx, opts);
1214                    if (ret >= 0)  return ret;
1215                    op = op.getChild();
1216                }
1217                break;
1218
1219            case Op.UNION:
1220                for (int i = 0;  i < op.size();  i ++) {
1221                    int ret = this. matchCharArray (con, op.elementAt(i), offset, dx, opts);
1222                    if (DEBUG) {
1223                        System.err.println("UNION: "+i+", ret="+ret);
1224                    }
1225                    if (ret == con.length )  return ret;
1226                }
1227                return -1;
1228
1229            case Op.CAPTURE:
1230                int refno = op.getData();
1231                if (con.match != null && refno > 0) {
1232                    int save = con.match.getBeginning(refno);
1233                    con.match.setBeginning(refno, offset);
1234                    int ret = this. matchCharArray (con, op.next, offset, dx, opts);
1235                    if (ret < 0)  con.match.setBeginning(refno, save);
1236                    return ret;
1237                } else if (con.match != null && refno < 0) {
1238                    int index = -refno;
1239                    int save = con.match.getEnd(index);
1240                    con.match.setEnd(index, offset);
1241                    int ret = this. matchCharArray (con, op.next, offset, dx, opts);
1242                    if (ret < 0)  con.match.setEnd(index, save);
1243                    return ret;
1244                }
1245                op = op.next;
1246                break;
1247
1248            case Op.LOOKAHEAD:
1249                if (0 > this. matchCharArray (con, op.getChild(), offset, 1, opts))  return -1;
1250                op = op.next;
1251                break;
1252            case Op.NEGATIVELOOKAHEAD:
1253                if (0 <= this. matchCharArray (con, op.getChild(), offset, 1, opts))  return -1;
1254                op = op.next;
1255                break;
1256            case Op.LOOKBEHIND:
1257                if (0 > this. matchCharArray (con, op.getChild(), offset, -1, opts))  return -1;
1258                op = op.next;
1259                break;
1260            case Op.NEGATIVELOOKBEHIND:
1261                if (0 <= this. matchCharArray (con, op.getChild(), offset, -1, opts))  return -1;
1262                op = op.next;
1263                break;
1264
1265            case Op.INDEPENDENT:
1266                {
1267                    int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
1268                    if (ret < 0)  return ret;
1269                    offset = ret;
1270                    op = op.next;
1271                }
1272                break;
1273
1274            case Op.MODIFIER:
1275                {
1276                    int localopts = opts;
1277                    localopts |= op.getData();
1278                    localopts &= ~op.getData2();
1279                    //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16));
1280                    int ret = this. matchCharArray (con, op.getChild(), offset, dx, localopts);
1281                    if (ret < 0)  return ret;
1282                    offset = ret;
1283                    op = op.next;
1284                }
1285                break;
1286
1287            case Op.CONDITION:
1288                {
1289                    Op.ConditionOp cop = (Op.ConditionOp)op;
1290                    boolean matchp = false;
1291                    if (cop.refNumber > 0) {
1292                        if (cop.refNumber >= this.nofparen)
1293                            throw new RuntimeException  ("Internal Error: Reference number must be more than zero: "+cop.refNumber);
1294                        matchp = con.match.getBeginning(cop.refNumber) >= 0
1295                                 && con.match.getEnd(cop.refNumber) >= 0;
1296                    } else {
1297                        matchp = 0 <= this. matchCharArray (con, cop.condition, offset, dx, opts);
1298                    }
1299
1300                    if (matchp) {
1301                        op = cop.yes;
1302                    } else if (cop.no != null) {
1303                        op = cop.no;
1304                    } else {
1305                        op = cop.next;
1306                    }
1307                }
1308                break;
1309
1310            default:
1311                throw new RuntimeException  ("Unknown operation type: "+op.type);
1312            } // switch (op.type)
1313        } // while
1314    }
1315
1316    private static final int getPreviousWordType(char[]  target, int begin, int end,
1317                                                 int offset, int opts) {
1318        int ret = getWordType(target, begin, end, --offset, opts);
1319        while (ret == WT_IGNORE)
1320            ret = getWordType(target, begin, end, --offset, opts);
1321        return ret;
1322    }
1323
1324    private static final int getWordType(char[]  target, int begin, int end,
1325                                         int offset, int opts) {
1326        if (offset < begin || offset >= end)  return WT_OTHER;
1327        return getWordType0( target [  offset ] , opts);
1328    }
1329
1330
1331
1332    private static final boolean regionMatches(char[]  target, int offset, int limit,
1333                                               String   part, int partlen) {
1334        if (offset < 0)  return false;
1335        if (limit-offset < partlen)
1336            return false;
1337        int i = 0;
1338        while (partlen-- > 0) {
1339            if ( target [  offset++ ]  != part.charAt(i++))
1340                return false;
1341        }
1342        return true;
1343    }
1344
1345    private static final boolean regionMatches(char[]  target, int offset, int limit,
1346                                               int offset2, int partlen) {
1347        if (offset < 0)  return false;
1348        if (limit-offset < partlen)
1349            return false;
1350        int i = offset2;
1351        while (partlen-- > 0) {
1352            if ( target [  offset++ ]  !=  target [  i++ ] )
1353                return false;
1354        }
1355        return true;
1356    }
1357
1358/**
1359 * @see java.lang.String#regionMatches
1360 */
1361    private static final boolean regionMatchesIgnoreCase(char[]  target, int offset, int limit,
1362                                                         String   part, int partlen) {
1363        if (offset < 0)  return false;
1364        if (limit-offset < partlen)
1365            return false;
1366        int i = 0;
1367        while (partlen-- > 0) {
1368            char ch1 =  target [  offset++ ] ;
1369            char ch2 = part.charAt(i++);
1370            if (ch1 == ch2)
1371                continue;
1372            char uch1 = Character.toUpperCase(ch1);
1373            char uch2 = Character.toUpperCase(ch2);
1374            if (uch1 == uch2)
1375                continue;
1376            if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
1377                return false;
1378        }
1379        return true;
1380    }
1381
1382    private static final boolean regionMatchesIgnoreCase(char[]  target, int offset, int limit,
1383                                                         int offset2, int partlen) {
1384        if (offset < 0)  return false;
1385        if (limit-offset < partlen)
1386            return false;
1387        int i = offset2;
1388        while (partlen-- > 0) {
1389            char ch1 =  target [  offset++ ] ;
1390            char ch2 =  target [  i++ ] ;
1391            if (ch1 == ch2)
1392                continue;
1393            char uch1 = Character.toUpperCase(ch1);
1394            char uch2 = Character.toUpperCase(ch2);
1395            if (uch1 == uch2)
1396                continue;
1397            if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
1398                return false;
1399        }
1400        return true;
1401    }
1402
1403
1404
1405
1406    /**
1407     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
1408     *
1409     * @return true if the target is matched to this regular expression.
1410     */
1411    public boolean matches(String    target) {
1412        return this.matches(target, 0,  target .length() , (Match)null);
1413    }
1414
1415    /**
1416     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
1417     * in specified range or not.
1418     *
1419     * @param start Start offset of the range.
1420     * @param end  End offset +1 of the range.
1421     * @return true if the target is matched to this regular expression.
1422     */
1423    public boolean matches(String    target, int start, int end) {
1424        return this.matches(target, start, end, (Match)null);
1425    }
1426
1427    /**
1428     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
1429     *
1430     * @param match A Match instance for storing matching result.
1431     * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
1432     */
1433    public boolean matches(String    target, Match match) {
1434        return this.matches(target, 0,  target .length() , match);
1435    }
1436
1437    /**
1438     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
1439     * in specified range or not.
1440     *
1441     * @param start Start offset of the range.
1442     * @param end  End offset +1 of the range.
1443     * @param match A Match instance for storing matching result.
1444     * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
1445     */
1446    public boolean matches(String    target, int start, int end, Match match) {
1447
1448        synchronized (this) {
1449            if (this.operations == null)
1450                this.prepare();
1451            if (this.context == null)
1452                this.context = new Context();
1453        }
1454        Context con = null;
1455        synchronized (this.context) {
1456            con = this.context.inuse ? new Context() : this.context;
1457            con.reset(target, start, end, this.numberOfClosures);
1458        }
1459        if (match != null) {
1460            match.setNumberOfGroups(this.nofparen);
1461            match.setSource(target);
1462        } else if (this.hasBackReferences) {
1463            match = new Match();
1464            match.setNumberOfGroups(this.nofparen);
1465            // Need not to call setSource() because
1466            // a caller can not access this match instance.
1467        }
1468        con.match = match;
1469
1470        if (isSet(this.options, XMLSCHEMA_MODE)) {
1471            if (DEBUG) {
1472                System.err.println("target string="+target);
1473            }
1474            int matchEnd = this. matchString (con, this.operations, con.start, 1, this.options);
1475            if (DEBUG) {
1476                System.err.println("matchEnd="+matchEnd);
1477                System.err.println("con.limit="+con.limit);
1478            }
1479            if (matchEnd == con.limit) {
1480                if (con.match != null) {
1481                    con.match.setBeginning(0, con.start);
1482                    con.match.setEnd(0, matchEnd);
1483                }
1484                con.inuse = false;
1485                return true;
1486            }
1487            return false;
1488        }
1489
1490        /*
1491         * The pattern has only fixed string.
1492         * The engine uses Boyer-Moore.
1493         */
1494        if (this.fixedStringOnly) {
1495            //System.err.println("DEBUG: fixed-only: "+this.fixedString);
1496            int o = this.fixedStringTable.matches(target, con.start, con.limit);
1497            if (o >= 0) {
1498                if (con.match != null) {
1499                    con.match.setBeginning(0, o);
1500                    con.match.setEnd(0, o+this.fixedString.length());
1501                }
1502                con.inuse = false;
1503                return true;
1504            }
1505            con.inuse = false;
1506            return false;
1507        }
1508
1509        /*
1510         * The pattern contains a fixed string.
1511         * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
1512         * If not, it return with false.
1513         */
1514        if (this.fixedString != null) {
1515            int o = this.fixedStringTable.matches(target, con.start, con.limit);
1516            if (o < 0) {
1517                //System.err.println("Non-match in fixed-string search.");
1518                con.inuse = false;
1519                return false;
1520            }
1521        }
1522
1523        int limit = con.limit-this.minlength;
1524        int matchStart;
1525        int matchEnd = -1;
1526
1527        /*
1528         * Checks whether the expression starts with ".*".
1529         */
1530        if (this.operations != null
1531            && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
1532            if (isSet(this.options, SINGLE_LINE)) {
1533                matchStart = con.start;
1534                matchEnd = this. matchString (con, this.operations, con.start, 1, this.options);
1535            } else {
1536                boolean previousIsEOL = true;
1537                for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1538                    int ch =  target .charAt(  matchStart ) ;
1539                    if (isEOLChar(ch)) {
1540                        previousIsEOL = true;
1541                    } else {
1542                        if (previousIsEOL) {
1543                            if (0 <= (matchEnd = this. matchString (con, this.operations,
1544                                                                    matchStart, 1, this.options)))
1545                                break;
1546                        }
1547                        previousIsEOL = false;
1548                    }
1549                }
1550            }
1551        }
1552
1553        /*
1554         * Optimization against the first character.
1555         */
1556        else if (this.firstChar != null) {
1557            //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
1558            RangeToken range = this.firstChar;
1559            if (isSet(this.options, IGNORE_CASE)) {
1560                range = this.firstChar.getCaseInsensitiveToken();
1561                for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1562                    int ch =  target .charAt(  matchStart ) ;
1563                    if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
1564                        ch = REUtil.composeFromSurrogates(ch,  target .charAt(  matchStart+1 ) );
1565                        if (!range.match(ch))  continue;
1566                    } else {
1567                        if (!range.match(ch)) {
1568                            char ch1 = Character.toUpperCase((char)ch);
1569                            if (!range.match(ch1))
1570                                if (!range.match(Character.toLowerCase(ch1)))
1571                                    continue;
1572                        }
1573                    }
1574                    if (0 <= (matchEnd = this. matchString (con, this.operations,
1575                                                            matchStart, 1, this.options)))
1576                        break;
1577                }
1578            } else {
1579                for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1580                    int ch =  target .charAt(  matchStart ) ;
1581                    if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
1582                        ch = REUtil.composeFromSurrogates(ch,  target .charAt(  matchStart+1 ) );
1583                    if (!range.match(ch))  continue;
1584                    if (0 <= (matchEnd = this. matchString (con, this.operations,
1585                                                            matchStart, 1, this.options)))
1586                        break;
1587                }
1588            }
1589        }
1590
1591        /*
1592         * Straightforward matching.
1593         */
1594        else {
1595            for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1596                if (0 <= (matchEnd = this. matchString (con, this.operations, matchStart, 1, this.options)))
1597                    break;
1598            }
1599        }
1600
1601        if (matchEnd >= 0) {
1602            if (con.match != null) {
1603                con.match.setBeginning(0, matchStart);
1604                con.match.setEnd(0, matchEnd);
1605            }
1606            con.inuse = false;
1607            return true;
1608        } else {
1609            con.inuse = false;
1610            return false;
1611        }
1612    }
1613
1614    /**
         * Matches the chain of operations starting at <var>op</var> against
         * <code>con.strTarget</code>, beginning at <var>offset</var>.
         *
         * Each pass of the loop handles one operation and then moves to the operation to
         * try next. Closures, unions, captures, look-around and similar operations call
         * this method recursively for their sub-chains. Reaching the end of the chain
         * (<code>op == null</code>) means the match succeeded.
         *
         * @param con Matching context; supplies the target string, the <code>start</code>,
         *            <code>limit</code> and <code>length</code> values, the <code>match</code>
         *            holding group positions and the per-closure <code>offsets</code> table.
         * @param op First operation to evaluate.
         * @param offset Position in the target where matching begins.
         * @param dx Direction: a positive value consumes characters forwards from
         *           <var>offset</var>; otherwise the characters before <var>offset</var> are
         *           consumed (the look-behind operations pass -1).
         * @param opts Option flags in effect, tested with <code>isSet()</code>.
1615     * @return -1 when not match; offset of the end of matched string when match.
         * @throws RuntimeException if a back reference or condition refers to a group number
         *         out of range, or if an operation has an unknown type.
1616     */
1617    private int matchString (Context con, Op op, int offset, int dx, int opts) {
1618
1619
1620
1621
1622        String   target = con.strTarget;
1623
1624
1625
1626
1627        while (true) {
                // End of the chain: everything matched.
1628            if (op == null)
1629                return offset;
                // The position has left the range con.start..con.limit.
1630            if (offset > con.limit || offset < con.start)
1631                return -1;
1632            switch (op.type) {
                // A single literal character, op.getData().
1633            case Op.CHAR:
1634                if (isSet(opts, IGNORE_CASE)) {
1635                    int ch = op.getData();
1636                    if (dx > 0) {
1637                        if (offset >= con.limit || !matchIgnoreCase(ch,  target .charAt(  offset ) ))
1638                            return -1;
1639                        offset ++;
1640                    } else {
                            // Backwards: test the character just before offset.
1641                        int o1 = offset-1;
1642                        if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch,  target .charAt(  o1 ) ))
1643                            return -1;
1644                        offset = o1;
1645                    }
1646                } else {
1647                    int ch = op.getData();
1648                    if (dx > 0) {
1649                        if (offset >= con.limit || ch !=  target .charAt(  offset ) )
1650                            return -1;
1651                        offset ++;
1652                    } else {
1653                        int o1 = offset-1;
1654                        if (o1 >= con.limit || o1 < 0 || ch !=  target .charAt(  o1 ) )
1655                            return -1;
1656                        offset = o1;
1657                    }
1658                }
1659                op = op.next;
1660                break;
1661
                // Any character; a surrogate pair is consumed as one character.
1662            case Op.DOT:
1663                if (dx > 0) {
1664                    if (offset >= con.limit)
1665                        return -1;
1666                    int ch =  target .charAt(  offset ) ;
1667                    if (isSet(opts, SINGLE_LINE)) {
                            // SINGLE_LINE: every character is accepted, only the pair is skipped.
1668                        if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
1669                            offset ++;
1670                    } else {
1671                        if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
1672                            ch = REUtil.composeFromSurrogates(ch,  target .charAt(  ++offset ) );
                            // Without SINGLE_LINE an end-of-line character is rejected.
1673                        if (isEOLChar(ch))
1674                            return -1;
1675                    }
1676                    offset ++;
1677                } else {
1678                    int o1 = offset-1;
1679                    if (o1 >= con.limit || o1 < 0)
1680                        return -1;
1681                    int ch =  target .charAt(  o1 ) ;
1682                    if (isSet(opts, SINGLE_LINE)) {
1683                        if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
1684                            o1 --;
1685                    } else {
1686                        if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
1687                            ch = REUtil.composeFromSurrogates( target .charAt(  --o1 ) , ch);
                            // NOTE(review): the forward branch above fails on an end-of-line
                            // character, but this backward branch fails when the character is
                            // NOT an end-of-line character. That looks inverted -- confirm the
                            // intended behavior of '.' inside look-behind.
1688                        if (!isEOLChar(ch))
1689                            return -1;
1690                    }
1691                    offset = o1;
1692                }
1693                op = op.next;
1694                break;
1695
                // A character class; both types are tested against op.getToken().
1696            case Op.RANGE:
1697            case Op.NRANGE:
1698                if (dx > 0) {
1699                    if (offset >= con.limit)
1700                        return -1;
1701                    int ch =  target .charAt(  offset ) ;
1702                    if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
1703                        ch = REUtil.composeFromSurrogates(ch,  target .charAt(  ++offset ) );
1704                    RangeToken tok = op.getToken();
1705                    if (isSet(opts, IGNORE_CASE)) {
1706                        tok = tok.getCaseInsensitiveToken();
1707                        if (!tok.match(ch)) {
                                // Characters of 0x10000 and above get no case retry.
1708                            if (ch >= 0x10000)  return -1;
1709                            char uch;
                                // Retry with the upper-case form, then with its lower-case form.
1710                            if (!tok.match(uch = Character.toUpperCase((char)ch))
1711                                && !tok.match(Character.toLowerCase(uch)))
1712                                return -1;
1713                        }
1714                    } else {
1715                        if (!tok.match(ch))  return -1;
1716                    }
1717                    offset ++;
1718                } else {
1719                    int o1 = offset-1;
1720                    if (o1 >= con.limit || o1 < 0)
1721                        return -1;
1722                    int ch =  target .charAt(  o1 ) ;
1723                    if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
1724                        ch = REUtil.composeFromSurrogates( target .charAt(  --o1 ) , ch);
1725                    RangeToken tok = op.getToken();
1726                    if (isSet(opts, IGNORE_CASE)) {
1727                        tok = tok.getCaseInsensitiveToken();
1728                        if (!tok.match(ch)) {
1729                            if (ch >= 0x10000)  return -1;
1730                            char uch;
1731                            if (!tok.match(uch = Character.toUpperCase((char)ch))
1732                                && !tok.match(Character.toLowerCase(uch)))
1733                                return -1;
1734                        }
1735                    } else {
1736                        if (!tok.match(ch))  return -1;
1737                    }
1738                    offset = o1;
1739                }
1740                op = op.next;
1741                break;
1742
                // Zero-width assertions; op.getData() selects the kind of anchor.
1743            case Op.ANCHOR:
1744                boolean go = false;
1745                switch (op.getData()) {
                    // Start of the range, or with MULTIPLE_LINES also just after an
                    // end-of-line character.
1746                case '^':
1747                    if (isSet(opts, MULTIPLE_LINES)) {
1748                        if (!(offset == con.start
1749                              || offset > con.start && isEOLChar( target .charAt(  offset-1 ) )))
1750                            return -1;
1751                    } else {
1752                        if (offset != con.start)
1753                            return -1;
1754                    }
1755                    break;
1756
1757                case '@':                         // Internal use only.
1758                    // The @ always matches line beginnings.
1759                    if (!(offset == con.start
1760                          || offset > con.start && isEOLChar( target .charAt(  offset-1 ) )))
1761                        return -1;
1762                    break;
1763
                    // End of the range. With MULTIPLE_LINES also just before any end-of-line
                    // character; otherwise also just before a final end-of-line character
                    // or a final CARRIAGE_RETURN LINE_FEED pair.
1764                case '$':
1765                    if (isSet(opts, MULTIPLE_LINES)) {
1766                        if (!(offset == con.limit
1767                              || offset < con.limit && isEOLChar( target .charAt(  offset ) )))
1768                            return -1;
1769                    } else {
1770                        if (!(offset == con.limit
1771                              || offset+1 == con.limit && isEOLChar( target .charAt(  offset ) )
1772                              || offset+2 == con.limit &&  target .charAt(  offset )  == CARRIAGE_RETURN
1773                              &&  target .charAt(  offset+1 )  == LINE_FEED))
1774                            return -1;
1775                    }
1776                    break;
1777
                    // Start of the range only.
1778                case 'A':
1779                    if (offset != con.start)  return -1;
1780                    break;
1781
                    // End of the range, or just before a final end-of-line character or
                    // a final CARRIAGE_RETURN LINE_FEED pair.
1782                case 'Z':
1783                    if (!(offset == con.limit
1784                          || offset+1 == con.limit && isEOLChar( target .charAt(  offset ) )
1785                          || offset+2 == con.limit &&  target .charAt(  offset )  == CARRIAGE_RETURN
1786                          &&  target .charAt(  offset+1 )  == LINE_FEED))
1787                        return -1;
1788                    break;
1789
                    // End of the range only.
1790                case 'z':
1791                    if (offset != con.limit)  return -1;
1792                    break;
1793
                    // Word boundary: fails on an empty range, when the character at offset
                    // has type WT_IGNORE, or when the word types on both sides are equal.
1794                case 'b':
1795                    if (con.length == 0)  return -1;
1796                    {
1797                        int after = getWordType(target, con.start, con.limit, offset, opts);
1798                        if (after == WT_IGNORE)  return -1;
1799                        int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
1800                        if (after == before)  return -1;
1801                    }
1802                    break;
1803
                    // Not a word boundary: the complement of the 'b' test above.
1804                case 'B':
1805                    if (con.length == 0)
1806                        go = true;
1807                    else {
1808                        int after = getWordType(target, con.start, con.limit, offset, opts);
1809                        go = after == WT_IGNORE
1810                             || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
1811                    }
1812                    if (!go)  return -1;
1813                    break;
1814
                    // Start of a word: WT_LETTER at offset, WT_OTHER before it.
1815                case '<':
1816                    if (con.length == 0 || offset == con.limit)  return -1;
1817                    if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
1818                        || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
1819                        return -1;
1820                    break;
1821
                    // End of a word: WT_OTHER at offset, WT_LETTER before it.
1822                case '>':
1823                    if (con.length == 0 || offset == con.start)  return -1;
1824                    if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
1825                        || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
1826                        return -1;
1827                    break;
1828                } // switch anchor type
1829                op = op.next;
1830                break;
1831
                // Matches the text captured by group op.getData() again.
1832            case Op.BACKREFERENCE:
1833                {
1834                    int refno = op.getData();
                        // Also thrown when refno >= nofparen, although the message only
                        // mentions zero.
1835                    if (refno <= 0 || refno >= this.nofparen)
1836                        throw new RuntimeException  ("Internal Error: Reference number must be more than zero: "+refno);
                        // The group has not matched yet, so there is nothing to compare with.
1837                    if (con.match.getBeginning(refno) < 0
1838                        || con.match.getEnd(refno) < 0)
1839                        return -1;                // ********
1840                    int o2 = con.match.getBeginning(refno);
1841                    int literallen = con.match.getEnd(refno)-o2;
1842                    if (!isSet(opts, IGNORE_CASE)) {
1843                        if (dx > 0) {
1844                            if (!regionMatches(target, offset, con.limit, o2, literallen))
1845                                return -1;
1846                            offset += literallen;
1847                        } else {
                                // Backwards: the captured text must end at offset.
1848                            if (!regionMatches(target, offset-literallen, con.limit, o2, literallen))
1849                                return -1;
1850                            offset -= literallen;
1851                        }
1852                    } else {
1853                        if (dx > 0) {
1854                            if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen))
1855                                return -1;
1856                            offset += literallen;
1857                        } else {
1858                            if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
1859                                                         o2, literallen))
1860                                return -1;
1861                            offset -= literallen;
1862                        }
1863                    }
1864                }
1865                op = op.next;
1866                break;
                // A literal string, op.getString().
1867            case Op.STRING:
1868                {
1869                    String   literal = op.getString();
1870                    int literallen = literal.length();
1871                    if (!isSet(opts, IGNORE_CASE)) {
1872                        if (dx > 0) {
1873                            if (!regionMatches(target, offset, con.limit, literal, literallen))
1874                                return -1;
1875                            offset += literallen;
1876                        } else {
                                // Backwards: the literal must end at offset.
1877                            if (!regionMatches(target, offset-literallen, con.limit, literal, literallen))
1878                                return -1;
1879                            offset -= literallen;
1880                        }
1881                    } else {
1882                        if (dx > 0) {
1883                            if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen))
1884                                return -1;
1885                            offset += literallen;
1886                        } else {
1887                            if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
1888                                                         literal, literallen))
1889                                return -1;
1890                            offset -= literallen;
1891                        }
1892                    }
1893                }
1894                op = op.next;
1895                break;
1896
                // Greedy repetition: the child chain is tried first, then the rest.
1897            case Op.CLOSURE:
1898                {
1899                    /*
1900                     * Saves current position to avoid
1901                     * zero-width repeats.
1902                     */
1903                    int id = op.getData();
1904                    if (id >= 0) {
1905                        int previousOffset = con.offsets[id];
1906                        if (previousOffset < 0 || previousOffset != offset) {
1907                            con.offsets[id] = offset;
1908                        } else {
                                // Same position as the previous pass: stop repeating and
                                // continue with the rest of the chain.
1909                            con.offsets[id] = -1;
1910                            op = op.next;
1911                            break;
1912                        }
1913                    }
1914                    int ret = this. matchString (con, op.getChild(), offset, dx, opts);
1915                    if (id >= 0)  con.offsets[id] = -1;
1916                    if (ret >= 0)  return ret;
1917                    op = op.next;
1918                }
1919                break;
1920
                // Greedy optional part: the child chain is tried first, then the rest.
1921            case Op.QUESTION:
1922                {
1923                    int ret = this. matchString (con, op.getChild(), offset, dx, opts);
1924                    if (ret >= 0)  return ret;
1925                    op = op.next;
1926                }
1927                break;
1928
                // Non-greedy forms: the rest of the chain is tried first, then the child.
1929            case Op.NONGREEDYCLOSURE:
1930            case Op.NONGREEDYQUESTION:
1931                {
1932                    int ret = this. matchString (con, op.next, offset, dx, opts);
1933                    if (ret >= 0)  return ret;
1934                    op = op.getChild();
1935                }
1936                break;
1937
                // Alternation: the alternatives are tried in order.
1938            case Op.UNION:
1939                for (int i = 0;  i < op.size();  i ++) {
1940                    int ret = this. matchString (con, op.elementAt(i), offset, dx, opts);
1941                    if (DEBUG) {
1942                        System.err.println("UNION: "+i+", ret="+ret);
1943                    }
                        // NOTE(review): an alternative is accepted only when its result equals
                        // con.length, while the other branching operations in this method
                        // accept any ret >= 0. ret is an offset into the target, so this
                        // presumably accepts only alternatives that reach the end of the
                        // text -- confirm that this is intended outside XMLSCHEMA_MODE and
                        // for ranges that do not start at offset 0.
1944                    if (ret == con.length )  return ret;
1945                }
1946                return -1;
1947
                // Group boundary: a positive number records the beginning of that group,
                // a negative number records the end of group -refno. The previous value is
                // restored when the rest of the chain fails.
1948            case Op.CAPTURE:
1949                int refno = op.getData();
1950                if (con.match != null && refno > 0) {
1951                    int save = con.match.getBeginning(refno);
1952                    con.match.setBeginning(refno, offset);
1953                    int ret = this. matchString (con, op.next, offset, dx, opts);
1954                    if (ret < 0)  con.match.setBeginning(refno, save);
1955                    return ret;
1956                } else if (con.match != null && refno < 0) {
1957                    int index = -refno;
1958                    int save = con.match.getEnd(index);
1959                    con.match.setEnd(index, offset);
1960                    int ret = this. matchString (con, op.next, offset, dx, opts);
1961                    if (ret < 0)  con.match.setEnd(index, save);
1962                    return ret;
1963                }
1964                op = op.next;
1965                break;
1966
                // Look-around: the child chain is tested forwards (1) or backwards (-1)
                // and offset is left unchanged.
1967            case Op.LOOKAHEAD:
1968                if (0 > this. matchString (con, op.getChild(), offset, 1, opts))  return -1;
1969                op = op.next;
1970                break;
1971            case Op.NEGATIVELOOKAHEAD:
1972                if (0 <= this. matchString (con, op.getChild(), offset, 1, opts))  return -1;
1973                op = op.next;
1974                break;
1975            case Op.LOOKBEHIND:
1976                if (0 > this. matchString (con, op.getChild(), offset, -1, opts))  return -1;
1977                op = op.next;
1978                break;
1979            case Op.NEGATIVELOOKBEHIND:
1980                if (0 <= this. matchString (con, op.getChild(), offset, -1, opts))  return -1;
1981                op = op.next;
1982                break;
1983
                // The child chain is matched on its own; matching then continues from
                // the position where it ended.
1984            case Op.INDEPENDENT:
1985                {
1986                    int ret = this. matchString (con, op.getChild(), offset, dx, opts);
1987                    if (ret < 0)  return ret;
1988                    offset = ret;
1989                    op = op.next;
1990                }
1991                break;
1992
                // The child chain is matched with the getData() flags switched on and
                // the getData2() flags switched off.
1993            case Op.MODIFIER:
1994                {
1995                    int localopts = opts;
1996                    localopts |= op.getData();
1997                    localopts &= ~op.getData2();
1998                    //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16));
1999                    int ret = this. matchString (con, op.getChild(), offset, dx, localopts);
2000                    if (ret < 0)  return ret;
2001                    offset = ret;
2002                    op = op.next;
2003                }
2004                break;
2005
                // Conditional: the test is either "group refNumber has matched" or
                // "the condition chain matches here"; it selects the yes or no branch.
2006            case Op.CONDITION:
2007                {
2008                    Op.ConditionOp cop = (Op.ConditionOp)op;
2009                    boolean matchp = false;
2010                    if (cop.refNumber > 0) {
                            // The message only mentions zero although the test is against nofparen.
2011                        if (cop.refNumber >= this.nofparen)
2012                            throw new RuntimeException  ("Internal Error: Reference number must be more than zero: "+cop.refNumber);
2013                        matchp = con.match.getBeginning(cop.refNumber) >= 0
2014                                 && con.match.getEnd(cop.refNumber) >= 0;
2015                    } else {
2016                        matchp = 0 <= this. matchString (con, cop.condition, offset, dx, opts);
2017                    }
2018
2019                    if (matchp) {
2020                        op = cop.yes;
2021                    } else if (cop.no != null) {
2022                        op = cop.no;
2023                    } else {
                            // No "no" branch: continue with the operation after the condition.
2024                        op = cop.next;
2025                    }
2026                }
2027                break;
2028
2029            default:
2030                throw new RuntimeException  ("Unknown operation type: "+op.type);
2031            } // switch (op.type)
2032        } // while
2033    }
2034
2035    private static final int getPreviousWordType(String    target, int begin, int end,
2036                                                 int offset, int opts) {
2037        int ret = getWordType(target, begin, end, --offset, opts);
2038        while (ret == WT_IGNORE)
2039            ret = getWordType(target, begin, end, --offset, opts);
2040        return ret;
2041    }
2042
2043    private static final int getWordType(String    target, int begin, int end,
2044                                         int offset, int opts) {
2045        if (offset < begin || offset >= end)  return WT_OTHER;
2046        return getWordType0( target .charAt(  offset ) , opts);
2047    }
2048
2049
2050    private static final boolean regionMatches(String   text, int offset, int limit,
2051                                               String   part, int partlen) {
2052        if (limit-offset < partlen)  return false;
2053        return text.regionMatches(offset, part, 0, partlen);
2054    }
2055
2056    private static final boolean regionMatches(String   text, int offset, int limit,
2057                                               int offset2, int partlen) {
2058        if (limit-offset < partlen)  return false;
2059        return text.regionMatches(offset, text, offset2, partlen);
2060    }
2061
2062    private static final boolean regionMatchesIgnoreCase(String   text, int offset, int limit,
2063                                                         String   part, int partlen) {
2064        return text.regionMatches(true, offset, part, 0, partlen);
2065    }
2066
2067    private static final boolean regionMatchesIgnoreCase(String   text, int offset, int limit,
2068                                                         int offset2, int partlen) {
2069        if (limit-offset < partlen)  return false;
2070        return text.regionMatches(true, offset, text, offset2, partlen);
2071    }
2072
2073
2074
2075
2076
2077
2078
2079    /**
2080     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
2081     *
2082     * @return true if the target is matched to this regular expression.
2083     */
2084    public boolean matches(CharacterIterator   target) {
2085        return this.matches(target, (Match)null);
2086    }
2087
2088
2089    /**
2090     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
2091     *
2092     * @param match A Match instance for storing matching result.
2093     * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
2094     */
2095    public boolean matches(CharacterIterator    target, Match match) {
2096        int start = target.getBeginIndex();
2097        int end = target.getEndIndex();
2098
2099
2100
2101        synchronized (this) {
2102            if (this.operations == null)
2103                this.prepare();
2104            if (this.context == null)
2105                this.context = new Context();
2106        }
2107        Context con = null;
2108        synchronized (this.context) {
2109            con = this.context.inuse ? new Context() : this.context;
2110            con.reset(target, start, end, this.numberOfClosures);
2111        }
2112        if (match != null) {
2113            match.setNumberOfGroups(this.nofparen);
2114            match.setSource(target);
2115        } else if (this.hasBackReferences) {
2116            match = new Match();
2117            match.setNumberOfGroups(this.nofparen);
2118            // Need not to call setSource() because
2119            // a caller can not access this match instance.
2120        }
2121        con.match = match;
2122
2123        if (isSet(this.options, XMLSCHEMA_MODE)) {
2124            int matchEnd = this. matchCharacterIterator (con, this.operations, con.start, 1, this.options);
2125            //System.err.println("DEBUG: matchEnd="+matchEnd);
2126            if (matchEnd == con.limit) {
2127                if (con.match != null) {
2128                    con.match.setBeginning(0, con.start);
2129                    con.match.setEnd(0, matchEnd);
2130                }
2131                con.inuse = false;
2132                return true;
2133            }
2134            return false;
2135        }
2136
2137        /*
2138         * The pattern has only fixed string.
2139         * The engine uses Boyer-Moore.
2140         */
2141        if (this.fixedStringOnly) {
2142            //System.err.println("DEBUG: fixed-only: "+this.fixedString);
2143            int o = this.fixedStringTable.matches(target, con.start, con.limit);
2144            if (o >= 0) {
2145                if (con.match != null) {
2146                    con.match.setBeginning(0, o);
2147                    con.match.setEnd(0, o+this.fixedString.length());
2148                }
2149                con.inuse = false;
2150                return true;
2151            }
2152            con.inuse = false;
2153            return false;
2154        }
2155
2156        /*
2157         * The pattern contains a fixed string.
2158         * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
2159         * If not, it return with false.
2160         */
2161        if (this.fixedString != null) {
2162            int o = this.fixedStringTable.matches(target, con.start, con.limit);
2163            if (o < 0) {
2164                //System.err.println("Non-match in fixed-string search.");
2165                con.inuse = false;
2166                return false;
2167            }
2168        }
2169
2170        int limit = con.limit-this.minlength;
2171        int matchStart;
2172        int matchEnd = -1;
2173
2174        /*
2175         * Checks whether the expression starts with ".*".
2176         */
2177        if (this.operations != null
2178            && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
2179            if (isSet(this.options, SINGLE_LINE)) {
2180                matchStart = con.start;
2181                matchEnd = this. matchCharacterIterator (con, this.operations, con.start, 1, this.options);
2182            } else {
2183                boolean previousIsEOL = true;
2184                for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
2185                    int ch =  target .setIndex(  matchStart ) ;
2186                    if (isEOLChar(ch)) {
2187                        previousIsEOL = true;
2188                    } else {
2189                        if (previousIsEOL) {
2190                            if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations,
2191                                                                               matchStart, 1, this.options)))
2192                                break;
2193                        }
2194                        previousIsEOL = false;
2195                    }
2196                }
2197            }
2198        }
2199
2200        /*
2201         * Optimization against the first character.
2202         */
2203        else if (this.firstChar != null) {
2204            //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
2205            RangeToken range = this.firstChar;
2206            if (isSet(this.options, IGNORE_CASE)) {
2207                range = this.firstChar.getCaseInsensitiveToken();
2208                for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
2209                    int ch =  target .setIndex(  matchStart ) ;
2210                    if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
2211                        ch = REUtil.composeFromSurrogates(ch,  target .setIndex(  matchStart+1 ) );
2212                        if (!range.match(ch))  continue;
2213                    } else {
2214                        if (!range.match(ch)) {
2215                            char ch1 = Character.toUpperCase((char)ch);
2216                            if (!range.match(ch1))
2217                                if (!range.match(Character.toLowerCase(ch1)))
2218                                    continue;
2219                        }
2220                    }
2221                    if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations,
2222                                                                       matchStart, 1, this.options)))
2223                        break;
2224                }
2225            } else {
2226                for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
2227                    int ch =  target .setIndex(  matchStart ) ;
2228                    if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
2229                        ch = REUtil.composeFromSurrogates(ch,  target .setIndex(  matchStart+1 ) );
2230                    if (!range.match(ch))  continue;
2231                    if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations,
2232                                                                       matchStart, 1, this.options)))
2233                        break;
2234                }
2235            }
2236        }
2237
2238        /*
2239         * Straightforward matching.
2240         */
2241        else {
2242            for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
2243                if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations, matchStart, 1, this.options)))
2244                    break;
2245            }
2246        }
2247
2248        if (matchEnd >= 0) {
2249            if (con.match != null) {
2250                con.match.setBeginning(0, matchStart);
2251                con.match.setEnd(0, matchEnd);
2252            }
2253            con.inuse = false;
2254            return true;
2255        } else {
2256            con.inuse = false;
2257            return false;
2258        }
2259    }
2260
2261    /**
2262     * @return -1 when not match; offset of the end of matched string when match.
2263     */
2264    private int matchCharacterIterator (Context con, Op op, int offset, int dx, int opts) {
2265
2266
2267        CharacterIterator   target = con.ciTarget;
2268
2269
2270
2271
2272
2273
2274        while (true) {
2275            if (op == null)
2276                return offset;
2277            if (offset > con.limit || offset < con.start)
2278                return -1;
2279            switch (op.type) {
2280            case Op.CHAR:
2281                if (isSet(opts, IGNORE_CASE)) {
2282                    int ch = op.getData();
2283                    if (dx > 0) {
2284                        if (offset >= con.limit || !matchIgnoreCase(ch,  target .setIndex(  offset ) ))
2285                            return -1;
2286                        offset ++;
2287                    } else {
2288                        int o1 = offset-1;
2289                        if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch,  target .setIndex(  o1 ) ))
2290                            return -1;
2291                        offset = o1;
2292                    }
2293                } else {
2294                    int ch = op.getData();
2295                    if (dx > 0) {
2296                        if (offset >= con.limit || ch !=  target .setIndex(  offset ) )
2297                            return -1;
2298                        offset ++;
2299                    } else {
2300                        int o1 = offset-1;
2301                        if (o1 >= con.limit || o1 < 0 || ch !=  target .setIndex(  o1 ) )
2302                            return -1;
2303                        offset = o1;
2304                    }
2305                }
2306                op = op.next;
2307                break;
2308
2309            case Op.DOT:
2310                if (dx > 0) {
2311                    if (offset >= con.limit)
2312                        return -1;
2313                    int ch =  target .setIndex(  offset ) ;
2314                    if (isSet(opts, SINGLE_LINE)) {
2315                        if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
2316                            offset ++;
2317                    } else {
2318                        if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
2319                            ch = REUtil.composeFromSurrogates(ch,  target .setIndex(  ++offset ) );
2320                        if (isEOLChar(ch))
2321                            return -1;
2322                    }
2323                    offset ++;
2324                } else {
2325                    int o1 = offset-1;
2326                    if (o1 >= con.limit || o1 < 0)
2327                        return -1;
2328                    int ch =  target .setIndex(  o1 ) ;
2329                    if (isSet(opts, SINGLE_LINE)) {
2330                        if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
2331                            o1 --;
2332                    } else {
2333                        if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
2334                            ch = REUtil.composeFromSurrogates( target .setIndex(  --o1 ) , ch);
2335                        if (!isEOLChar(ch))
2336                            return -1;
2337                    }
2338                    offset = o1;
2339                }
2340                op = op.next;
2341                break;
2342
2343            case Op.RANGE:
2344            case Op.NRANGE:
2345                if (dx > 0) {
2346                    if (offset >= con.limit)
2347                        return -1;
2348                    int ch =  target .setIndex(  offset ) ;
2349                    if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
2350                        ch = REUtil.composeFromSurrogates(ch,  target .setIndex(  ++offset ) );
2351                    RangeToken tok = op.getToken();
2352                    if (isSet(opts, IGNORE_CASE)) {
2353                        tok = tok.getCaseInsensitiveToken();
2354                        if (!tok.match(ch)) {
2355                            if (ch >= 0x10000)  return -1;
2356                            char uch;
2357                            if (!tok.match(uch = Character.toUpperCase((char)ch))
2358                                && !tok.match(Character.toLowerCase(uch)))
2359                                return -1;
2360                        }
2361                    } else {
2362                        if (!tok.match(ch))  return -1;
2363                    }
2364                    offset ++;
2365                } else {
2366                    int o1 = offset-1;
2367                    if (o1 >= con.limit || o1 < 0)
2368                        return -1;
2369                    int ch =  target .setIndex(  o1 ) ;
2370                    if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
2371                        ch = REUtil.composeFromSurrogates( target .setIndex(  --o1 ) , ch);
2372                    RangeToken tok = op.getToken();
2373                    if (isSet(opts, IGNORE_CASE)) {
2374                        tok = tok.getCaseInsensitiveToken();
2375                        if (!tok.match(ch)) {
2376                            if (ch >= 0x10000)  return -1;
2377                            char uch;
2378                            if (!tok.match(uch = Character.toUpperCase((char)ch))
2379                                && !tok.match(Character.toLowerCase(uch)))
2380                                return -1;
2381                        }
2382                    } else {
2383                        if (!tok.match(ch))  return -1;
2384                    }
2385                    offset = o1;
2386                }
2387                op = op.next;
2388                break;
2389
2390            case Op.ANCHOR:
2391                boolean go = false;
2392                switch (op.getData()) {
2393                case '^':
2394                    if (isSet(opts, MULTIPLE_LINES)) {
2395                        if (!(offset == con.start
2396                              || offset > con.start && isEOLChar( target .setIndex(  offset-1 ) )))
2397                            return -1;
2398                    } else {
2399                        if (offset != con.start)
2400                            return -1;
2401                    }
2402                    break;
2403
2404                case '@':                         // Internal use only.
2405                    // The @ always matches line beginnings.
2406                    if (!(offset == con.start
2407                          || offset > con.start && isEOLChar( target .setIndex(  offset-1 ) )))
2408                        return -1;
2409                    break;
2410
2411                case '$':
2412                    if (isSet(opts, MULTIPLE_LINES)) {
2413                        if (!(offset == con.limit
2414                              || offset < con.limit && isEOLChar( target .setIndex(  offset ) )))
2415                            return -1;
2416                    } else {
2417                        if (!(offset == con.limit
2418                              || offset+1 == con.limit && isEOLChar( target .setIndex(  offset ) )
2419                              || offset+2 == con.limit &&  target .setIndex(  offset )  == CARRIAGE_RETURN
2420                              &&  target .setIndex(  offset+1 )  == LINE_FEED))
2421                            return -1;
2422                    }
2423                    break;
2424
2425                case 'A':
2426                    if (offset != con.start)  return -1;
2427                    break;
2428
2429                case 'Z':
2430                    if (!(offset == con.limit
2431                          || offset+1 == con.limit && isEOLChar( target .setIndex(  offset ) )
2432                          || offset+2 == con.limit &&  target .setIndex(  offset )  == CARRIAGE_RETURN
2433                          &&  target .setIndex(  offset+1 )  == LINE_FEED))
2434                        return -1;
2435                    break;
2436
2437                case 'z':
2438                    if (offset != con.limit)  return -1;
2439                    break;
2440
2441                case 'b':
2442                    if (con.length == 0)  return -1;
2443                    {
2444                        int after = getWordType(target, con.start, con.limit, offset, opts);
2445                        if (after == WT_IGNORE)  return -1;
2446                        int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
2447                        if (after == before)  return -1;
2448                    }
2449                    break;
2450
2451                case 'B':
2452                    if (con.length == 0)
2453                        go = true;
2454                    else {
2455                        int after = getWordType(target, con.start, con.limit, offset, opts);
2456                        go = after == WT_IGNORE
2457                             || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
2458                    }
2459                    if (!go)  return -1;
2460                    break;
2461
2462                case '<':
2463                    if (con.length == 0 || offset == con.limit)  return -1;
2464                    if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
2465                        || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
2466                        return -1;
2467                    break;
2468
2469                case '>':
2470                    if (con.length == 0 || offset == con.start)  return -1;
2471                    if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
2472                        || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
2473                        return -1;
2474                    break;
2475                } // switch anchor type
2476                op = op.next;
2477                break;
2478
2479            case Op.BACKREFERENCE:
2480                {
2481                    int refno = op.getData();
2482                    if (refno <= 0 || refno >= this.nofparen)
2483                        throw new RuntimeException  ("Internal Error: Reference number must be more than zero: "+refno);
2484                    if (con.match.getBeginning(refno) < 0
2485                        || con.match.getEnd(refno) < 0)
2486                        return -1;                // ********
2487                    int o2 = con.match.getBeginning(refno);
2488                    int literallen = con.match.getEnd(refno)-o2;
2489                    if (!isSet(opts, IGNORE_CASE)) {
2490                        if (dx > 0) {
2491                            if (!regionMatches(target, offset, con.limit, o2, literallen))
2492                                return -1;
2493                            offset += literallen;
2494                        } else {
2495                            if (!regionMatches(target, offset-literallen, con.limit, o2, literallen))
2496                                return -1;
2497                            offset -= literallen;
2498                        }
2499                    } else {
2500                        if (dx > 0) {
2501                            if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen))
2502                                return -1;
2503                            offset += literallen;
2504                        } else {
2505                            if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
2506                                                         o2, literallen))
2507                                return -1;
2508                            offset -= literallen;
2509                        }
2510                    }
2511                }
2512                op = op.next;
2513                break;
2514            case Op.STRING:
2515                {
2516                    String   literal = op.getString();
2517                    int literallen = literal.length();
2518                    if (!isSet(opts, IGNORE_CASE)) {
2519                        if (dx > 0) {
2520                            if (!regionMatches(target, offset, con.limit, literal, literallen))
2521                                return -1;
2522                            offset += literallen;
2523                        } else {
2524                            if (!regionMatches(target, offset-literallen, con.limit, literal, literallen))
2525                                return -1;
2526                            offset -= literallen;
2527                        }
2528                    } else {
2529                        if (dx > 0) {
2530                            if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen))
2531                                return -1;
2532                            offset += literallen;
2533                        } else {
2534                            if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
2535                                                         literal, literallen))
2536                                return -1;
2537                            offset -= literallen;
2538                        }
2539                    }
2540                }
2541                op = op.next;
2542                break;
2543
2544            case Op.CLOSURE:
2545                {
2546                    /*
2547                     * Saves current position to avoid
2548                     * zero-width repeats.
2549                     */
2550                    int id = op.getData();
2551                    if (id >= 0) {
2552                        int previousOffset = con.offsets[id];
2553                        if (previousOffset < 0 || previousOffset != offset) {
2554                            con.offsets[id] = offset;
2555                        } else {
2556                            con.offsets[id] = -1;
2557                            op = op.next;
2558                            break;
2559                        }
2560                    }
2561                    
2562                    int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts);
2563                    if (id >= 0)  con.offsets[id] = -1;
2564                    if (ret >= 0)  return ret;
2565                    op = op.next;
2566                }
2567                break;
2568
2569            case Op.QUESTION:
2570                {
2571                    int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts);
2572                    if (ret >= 0)  return ret;
2573                    op = op.next;
2574                }
2575                break;
2576
2577            case Op.NONGREEDYCLOSURE:
2578            case Op.NONGREEDYQUESTION:
2579                {
2580                    int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts);
2581                    if (ret >= 0)  return ret;
2582                    op = op.getChild();
2583                }
2584                break;
2585
2586            case Op.UNION:
2587                for (int i = 0;  i < op.size();  i ++) {
2588                    int ret = this. matchCharacterIterator (con, op.elementAt(i), offset, dx, opts);
2589                    if (DEBUG) {
2590                        System.err.println("UNION: "+i+", ret="+ret);
2591                    }
2592                    if (ret == con.length)  return ret;
2593                }
2594                return -1;
2595
2596            case Op.CAPTURE:
2597                int refno = op.getData();
2598                if (con.match != null && refno > 0) {
2599                    int save = con.match.getBeginning(refno);
2600                    con.match.setBeginning(refno, offset);
2601                    int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts);
2602                    if (ret < 0)  con.match.setBeginning(refno, save);
2603                    return ret;
2604                } else if (con.match != null && refno < 0) {
2605                    int index = -refno;
2606                    int save = con.match.getEnd(index);
2607                    con.match.setEnd(index, offset);
2608                    int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts);
2609                    if (ret < 0)  con.match.setEnd(index, save);
2610                    return ret;
2611                }
2612                op = op.next;
2613                break;
2614
2615            case Op.LOOKAHEAD:
2616                if (0 > this. matchCharacterIterator (con, op.getChild(), offset, 1, opts))  return -1;
2617                op = op.next;
2618                break;
2619            case Op.NEGATIVELOOKAHEAD:
2620                if (0 <= this. matchCharacterIterator (con, op.getChild(), offset, 1, opts))  return -1;
2621                op = op.next;
2622                break;
2623            case Op.LOOKBEHIND:
2624                if (0 > this. matchCharacterIterator (con, op.getChild(), offset, -1, opts))  return -1;
2625                op = op.next;
2626                break;
2627            case Op.NEGATIVELOOKBEHIND:
2628                if (0 <= this. matchCharacterIterator (con, op.getChild(), offset, -1, opts))  return -1;
2629                op = op.next;
2630                break;
2631
2632            case Op.INDEPENDENT:
2633                {
2634                    int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts);
2635                    if (ret < 0)  return ret;
2636                    offset = ret;
2637                    op = op.next;
2638                }
2639                break;
2640
2641            case Op.MODIFIER:
2642                {
2643                    int localopts = opts;
2644                    localopts |= op.getData();
2645                    localopts &= ~op.getData2();
2646                    //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16));
2647                    int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, localopts);
2648                    if (ret < 0)  return ret;
2649                    offset = ret;
2650                    op = op.next;
2651                }
2652                break;
2653
2654            case Op.CONDITION:
2655                {
2656                    Op.ConditionOp cop = (Op.ConditionOp)op;
2657                    boolean matchp = false;
2658                    if (cop.refNumber > 0) {
2659                        if (cop.refNumber >= this.nofparen)
2660                            throw new RuntimeException  ("Internal Error: Reference number must be more than zero: "+cop.refNumber);
2661                        matchp = con.match.getBeginning(cop.refNumber) >= 0
2662                                 && con.match.getEnd(cop.refNumber) >= 0;
2663                    } else {
2664                        matchp = 0 <= this. matchCharacterIterator (con, cop.condition, offset, dx, opts);
2665                    }
2666
2667                    if (matchp) {
2668                        op = cop.yes;
2669                    } else if (cop.no != null) {
2670                        op = cop.no;
2671                    } else {
2672                        op = cop.next;
2673                    }
2674                }
2675                break;
2676
2677            default:
2678                throw new RuntimeException  ("Unknown operation type: "+op.type);
2679            } // switch (op.type)
2680        } // while
2681    }
2682
2683    private static final int getPreviousWordType(CharacterIterator    target, int begin, int end,
2684                                                 int offset, int opts) {
2685        int ret = getWordType(target, begin, end, --offset, opts);
2686        while (ret == WT_IGNORE)
2687            ret = getWordType(target, begin, end, --offset, opts);
2688        return ret;
2689    }
2690
2691    private static final int getWordType(CharacterIterator    target, int begin, int end,
2692                                         int offset, int opts) {
2693        if (offset < begin || offset >= end)  return WT_OTHER;
2694        return getWordType0( target .setIndex(  offset ) , opts);
2695    }
2696
2697
2698
2699    private static final boolean regionMatches(CharacterIterator    target, int offset, int limit,
2700                                               String   part, int partlen) {
2701        if (offset < 0)  return false;
2702        if (limit-offset < partlen)
2703            return false;
2704        int i = 0;
2705        while (partlen-- > 0) {
2706            if ( target .setIndex(  offset++ )  != part.charAt(i++))
2707                return false;
2708        }
2709        return true;
2710    }
2711
2712    private static final boolean regionMatches(CharacterIterator    target, int offset, int limit,
2713                                               int offset2, int partlen) {
2714        if (offset < 0)  return false;
2715        if (limit-offset < partlen)
2716            return false;
2717        int i = offset2;
2718        while (partlen-- > 0) {
2719            if ( target .setIndex(  offset++ )  !=  target .setIndex(  i++ ) )
2720                return false;
2721        }
2722        return true;
2723    }
2724
2725    /**
2726     * @see java.lang.String#regionMatches
2727     */
2728    private static final boolean regionMatchesIgnoreCase(CharacterIterator    target, int offset, int limit,
2729                                                         String   part, int partlen) {
2730        if (offset < 0)  return false;
2731        if (limit-offset < partlen)
2732            return false;
2733        int i = 0;
2734        while (partlen-- > 0) {
2735            char ch1 =  target .setIndex(  offset++ ) ;
2736            char ch2 = part.charAt(i++);
2737            if (ch1 == ch2)
2738                continue;
2739            char uch1 = Character.toUpperCase(ch1);
2740            char uch2 = Character.toUpperCase(ch2);
2741            if (uch1 == uch2)
2742                continue;
2743            if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
2744                return false;
2745        }
2746        return true;
2747    }
2748
2749    private static final boolean regionMatchesIgnoreCase(CharacterIterator    target, int offset, int limit,
2750                                                         int offset2, int partlen) {
2751        if (offset < 0)  return false;
2752        if (limit-offset < partlen)
2753            return false;
2754        int i = offset2;
2755        while (partlen-- > 0) {
2756            char ch1 =  target .setIndex(  offset++ ) ;
2757            char ch2 =  target .setIndex(  i++ ) ;
2758            if (ch1 == ch2)
2759                continue;
2760            char uch1 = Character.toUpperCase(ch1);
2761            char uch2 = Character.toUpperCase(ch2);
2762            if (uch1 == uch2)
2763                continue;
2764            if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
2765                return false;
2766        }
2767        return true;
2768    }
2769
2770
2771
2772
2773    // ================================================================
2774
    /**
     * A regular expression.
     * Assigned by the constructors and by <code>setPattern()</code>, and
     * returned unchanged by <code>getPattern()</code>.
     * @serial
     */
    String   regex;
    /**
     * Bit set of option flags (<code>IGNORE_CASE</code>, <code>SINGLE_LINE</code>, ...)
     * that belongs to the current pattern.
     * @serial
     */
    int options;

    /**
     * The number of parenthesis in the regular expression.
     * Copied from the parser's <code>parennumber</code> when a pattern is set,
     * and returned by <code>getNumberOfGroups()</code>.
     * @serial
     */
    int nofparen;
    /**
     * Internal representation of the regular expression.
     * This is the token tree returned by the parser.
     * @serial
     */
    Token tokentree;

    /** Copied from the parser's <code>hasBackReferences</code> value when a pattern is set. */
    boolean hasBackReferences = false;

    // The transient fields below are working state that is not serialized.
    // prepare() fills in minlength, firstChar and the fixedString* group.

    /** Result of <code>tokentree.getMinLength()</code>, stored by <code>prepare()</code>. */
    transient int minlength;
    /**
     * Head of the operation chain inspected by <code>prepare()</code>.
     * Reset to <code>null</code> whenever a new pattern is set; presumably
     * assigned by <code>compile()</code>, which is not shown here (TODO confirm).
     */
    transient Op operations = null;
    /** NOTE(review): not written in this part of the file; presumably maintained by <code>compile()</code> -- confirm. */
    transient int numberOfClosures;
    /** Matching context; reset to <code>null</code> whenever a new pattern is set. */
    transient Context context = null;
    /**
     * Range of possible first characters, or <code>null</code> when
     * <code>prepare()</code> did not obtain a terminal result from
     * <code>analyzeFirstCharacter()</code> or the analysis is disabled by option.
     */
    transient RangeToken firstChar = null;

    /** Fixed string found in the pattern by <code>prepare()</code>, or <code>null</code>. */
    transient String   fixedString = null;
    /** Option flags that apply to {@link #fixedString}. */
    transient int fixedStringOptions;
    /** Search table built from {@link #fixedString} by <code>prepare()</code>. */
    transient BMPattern fixedStringTable = null;
    /** Set to <code>true</code> by <code>prepare()</code> when the whole pattern is a single string or character operation. */
    transient boolean fixedStringOnly = false;
2808
2809
2810    static final class Context {
2811        CharacterIterator   ciTarget;
2812        String   strTarget;
2813        char[] charTarget;
2814        int start;
2815        int limit;
2816        int length;
2817        Match match;
2818        boolean inuse = false;
2819        int[] offsets;
2820
2821        Context() {
2822        }
2823
2824        private void resetCommon(int nofclosures) {
2825            this.length = this.limit-this.start;
2826            this.inuse = true;
2827            this.match = null;
2828            if (this.offsets == null || this.offsets.length != nofclosures)
2829                this.offsets = new int[nofclosures];
2830            for (int i = 0;  i < nofclosures;  i ++)  this.offsets[i] = -1;
2831        }
2832        void reset(CharacterIterator   target, int start, int limit, int nofclosures) {
2833            this.ciTarget = target;
2834            this.start = start;
2835            this.limit = limit;
2836            this.resetCommon(nofclosures);
2837        }
2838        void reset(String   target, int start, int limit, int nofclosures) {
2839            this.strTarget = target;
2840            this.start = start;
2841            this.limit = limit;
2842            this.resetCommon(nofclosures);
2843        }
2844        void reset(char[] target, int start, int limit, int nofclosures) {
2845            this.charTarget = target;
2846            this.start = start;
2847            this.limit = limit;
2848            this.resetCommon(nofclosures);
2849        }
2850    }
2851
2852    /**
2853     * Prepares for matching.  This method is called just before starting matching.
2854     */
2855    void prepare() {
2856        if (Op.COUNT)  Op.nofinstances = 0;
2857        this.compile(this.tokentree);
2858        /*
2859        if  (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { // .*
2860            Op anchor = Op.createAnchor(isSet(this.options, SINGLE_LINE) ? 'A' : '@');
2861            anchor.next = this.operations;
2862            this.operations = anchor;
2863        }
2864        */
2865        if (Op.COUNT)  System.err.println("DEBUG: The number of operations: "+Op.nofinstances);
2866
2867        this.minlength = this.tokentree.getMinLength();
2868
2869        this.firstChar = null;
2870        if (!isSet(this.options, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION)
2871            && !isSet(this.options, XMLSCHEMA_MODE)) {
2872            RangeToken firstChar = Token.createRange();
2873            int fresult = this.tokentree.analyzeFirstCharacter(firstChar, this.options);
2874            if (fresult == Token.FC_TERMINAL) {
2875                firstChar.compactRanges();
2876                this.firstChar = firstChar;
2877                if (DEBUG)
2878                    System.err.println("DEBUG: Use the first character optimization: "+firstChar);
2879            }
2880        }
2881
2882        if (this.operations != null
2883            && (this.operations.type == Op.STRING || this.operations.type == Op.CHAR)
2884            && this.operations.next == null) {
2885            if (DEBUG)
2886                System.err.print(" *** Only fixed string! *** ");
2887            this.fixedStringOnly = true;
2888            if (this.operations.type == Op.STRING)
2889                this.fixedString = this.operations.getString();
2890            else if (this.operations.getData() >= 0x10000) { // Op.CHAR
2891                this.fixedString = REUtil.decomposeToSurrogates(this.operations.getData());
2892            } else {
2893                char[] ac = new char[1];
2894                ac[0] = (char)this.operations.getData();
2895                this.fixedString = new String  (ac);
2896            }
2897            this.fixedStringOptions = this.options;
2898            this.fixedStringTable = new BMPattern(this.fixedString, 256,
2899                                                  isSet(this.fixedStringOptions, IGNORE_CASE));
2900        } else if (!isSet(this.options, PROHIBIT_FIXED_STRING_OPTIMIZATION)
2901                   && !isSet(this.options, XMLSCHEMA_MODE)) {
2902            Token.FixedStringContainer container = new Token.FixedStringContainer();
2903            this.tokentree.findFixedString(container, this.options);
2904            this.fixedString = container.token == null ? null : container.token.getString();
2905            this.fixedStringOptions = container.options;
2906            if (this.fixedString != null && this.fixedString.length() < 2)
2907                this.fixedString = null;
2908            // This pattern has a fixed string of which length is more than one.
2909            if (this.fixedString != null) {
2910                this.fixedStringTable = new BMPattern(this.fixedString, 256,
2911                                                      isSet(this.fixedStringOptions, IGNORE_CASE));
2912                if (DEBUG) {
2913                    System.err.println("DEBUG: The longest fixed string: "+this.fixedString.length()
2914                                       +"/" //+this.fixedString
2915                                       +"/"+REUtil.createOptionString(this.fixedStringOptions));
2916                    System.err.print("String: ");
2917                    REUtil.dumpString(this.fixedString);
2918                }
2919            }
2920        }
2921    }
2922
    /*
     * A disabled option: the declaration below sits inside this comment, so
     * bit 0 is not defined by this block.
     * If you specify this option, <span class="REGEX"><kbd>(</kbd><var>X</var><kbd>)</kbd></span>
     * captures matched text, and <span class="REGEX"><kbd>(?:</kbd><var>X</var><kbd>)</kbd></span>
     * does not capture.
     *
     * @see #RegularExpression(java.lang.String,java.lang.String)
     * @see #setPattern(java.lang.String,java.lang.String)
    static final int MARK_PARENS = 1<<0;
     */

    /**
     * "i".
     * <code>prepare()</code> passes the state of this flag to the fixed-string
     * search table as its ignore-case setting.
     */
    static final int IGNORE_CASE = 1<<1;

    /**
     * "s"
     */
    static final int SINGLE_LINE = 1<<2;

    /**
     * "m"
     */
    static final int MULTIPLE_LINES = 1<<3;

    /**
     * "x"
     */
    static final int EXTENDED_COMMENT = 1<<4;

    /**
     * This option redefines <span class="REGEX"><kbd>\d \D \w \W \s \S</kbd></span>.
     * When it is set and <code>UNICODE_WORD_BOUNDARY</code> is not, word
     * characters are looked up in the "IsWord" range instead of the legacy
     * ASCII test.
     *
     * @see #RegularExpression(java.lang.String,java.lang.String)
     * @see #setPattern(java.lang.String,java.lang.String)
     * @see #UNICODE_WORD_BOUNDARY
     */
    static final int USE_UNICODE_CATEGORY = 1<<5; // "u"

    /**
     * An option.
     * This enables to process locale-independent word boundary for <span class="REGEX"><kbd>\b \B \&lt; \&gt;</kbd></span>.
     * <p>By default, the engine considers a position between a word character
     * (<span class="REGEX"><kbd>\w</kbd></span>) and a non word character
     * is a word boundary.
     * <p>By this option, the engine checks word boundaries with the method of
     * 'Unicode Regular Expression Guidelines' Revision 4.
     *
     * @see #RegularExpression(java.lang.String,java.lang.String)
     * @see #setPattern(java.lang.String,java.lang.String)
     */
    static final int UNICODE_WORD_BOUNDARY = 1<<6; // "w"

    /**
     * "H".
     * When set, <code>prepare()</code> skips the first-character analysis.
     */
    static final int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 1<<7;
    /**
     * "F".
     * When set, <code>prepare()</code> does not search the token tree for a
     * fixed string.  A pattern that consists of a single string or character
     * operation still gets its search table.
     */
    static final int PROHIBIT_FIXED_STRING_OPTIMIZATION = 1<<8;
    /**
     * "X". XML Schema mode.
     * Selects <code>ParserForXMLSchema</code> when a pattern is set, and makes
     * <code>prepare()</code> skip the first-character analysis and the
     * fixed-string search.
     */
    static final int XMLSCHEMA_MODE = 1<<9;
    /**
     * ",".
     */
    static final int SPECIAL_COMMA = 1<<10;
2993
2994
2995    private static final boolean isSet(int options, int flag) {
2996        return (options & flag) == flag;
2997    }
2998
2999    /**
3000     * Creates a new RegularExpression instance.
3001     *
3002     * @param regex A regular expression
3003     * @exception org.enhydra.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
3004     */
3005    public RegularExpression(String   regex) throws ParseException {
3006        this.setPattern(regex, null);
3007    }
3008
3009    /**
3010     * Creates a new RegularExpression instance with options.
3011     *
3012     * @param regex A regular expression
3013     * @param options A String consisted of "i" "m" "s" "u" "w" "," "X"
3014     * @exception org.enhydra.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
3015     */
3016    public RegularExpression(String   regex, String   options) throws ParseException {
3017        this.setPattern(regex, options);
3018    }
3019
3020    RegularExpression(String   regex, Token tok, int parens, boolean hasBackReferences, int options) {
3021        this.regex = regex;
3022        this.tokentree = tok;
3023        this.nofparen = parens;
3024        this.options = options;
3025        this.hasBackReferences = hasBackReferences;
3026    }
3027
3028    /**
3029     *
3030     */
3031    public void setPattern(String   newPattern) throws ParseException {
3032        this.setPattern(newPattern, this.options);
3033    }
3034
3035    private void setPattern(String   newPattern, int options) throws ParseException {
3036        this.regex = newPattern;
3037        this.options = options;
3038        RegexParser rp = isSet(this.options, RegularExpression.XMLSCHEMA_MODE)
3039                         ? new ParserForXMLSchema() : new RegexParser();
3040        this.tokentree = rp.parse(this.regex, this.options);
3041        this.nofparen = rp.parennumber;
3042        this.hasBackReferences = rp.hasBackReferences;
3043
3044        this.operations = null;
3045        this.context = null;
3046    }
3047    /**
3048     *
3049     */
3050    public void setPattern(String   newPattern, String   options) throws ParseException {
3051        this.setPattern(newPattern, REUtil.parseOptions(options));
3052    }
3053
3054    /**
3055     *
3056     */
3057    public String   getPattern() {
3058        return this.regex;
3059    }
3060
3061    /**
3062     * Represents this instence in String.
3063     */
3064    public String   toString() {
3065        return this.tokentree.toString(this.options);
3066    }
3067
3068    /**
3069     * Returns a option string.
3070     * The order of letters in it may be different from a string specified
3071     * in a constructor or <code>setPattern()</code>.
3072     *
3073     * @see #RegularExpression(java.lang.String,java.lang.String)
3074     * @see #setPattern(java.lang.String,java.lang.String)
3075     */
3076    public String   getOptions() {
3077        return REUtil.createOptionString(this.options);
3078    }
3079
3080    /**
3081     *  Return true if patterns are the same and the options are equivalent.
3082     */
3083    public boolean equals(Object   obj) {
3084        if (obj == null)  return false;
3085        if (!(obj instanceof RegularExpression))
3086            return false;
3087        RegularExpression r = (RegularExpression)obj;
3088        return this.regex.equals(r.regex) && this.options == r.options;
3089    }
3090
3091    boolean equals(String   pattern, int options) {
3092        return this.regex.equals(pattern) && this.options == options;
3093    }
3094
3095    /**
3096     *
3097     */
3098    public int hashCode() {
3099        return (this.regex+"/"+this.getOptions()).hashCode();
3100    }
3101
3102    /**
3103     * Return the number of regular expression groups.
3104     * This method returns 1 when the regular expression has no capturing-parenthesis.
3105     *
3106     */
3107    public int getNumberOfGroups() {
3108        return this.nofparen;
3109    }
3110
3111    // ================================================================
3112
    // Word types returned by getWordType0().
    /** Returned for format characters, non-spacing and enclosing marks, and most control characters. */
    private static final int WT_IGNORE = 0;
    /** Returned for characters that count as word characters. */
    private static final int WT_LETTER = 1;
    /** Returned for every other character. */
    private static final int WT_OTHER = 2;
    /**
     * The "IsWord" range, fetched on first use by getWordType0().
     * NOTE(review): the assignment there is unsynchronized, and 'transient'
     * on a static field looks redundant -- confirm before relying on either.
     */
    transient static Token wordchar = null;
3117    private static final int getWordType0(char ch, int opts) {
3118        if (!isSet(opts, UNICODE_WORD_BOUNDARY)) {
3119            if (isSet(opts, USE_UNICODE_CATEGORY)) {
3120                if (RegularExpression.wordchar == null)
3121                    RegularExpression.wordchar = Token.getRange("IsWord", true);
3122                return RegularExpression.wordchar.match(ch) ? WT_LETTER : WT_OTHER;
3123            }
3124            return isWordChar(ch) ? WT_LETTER : WT_OTHER;
3125        }
3126
3127        switch (Character.getType(ch)) {
3128        case Character.UPPERCASE_LETTER:      // L
3129        case Character.LOWERCASE_LETTER:      // L
3130        case Character.TITLECASE_LETTER:      // L
3131        case Character.MODIFIER_LETTER:       // L
3132        case Character.OTHER_LETTER:          // L
3133        case Character.LETTER_NUMBER:         // N
3134        case Character.DECIMAL_DIGIT_NUMBER:  // N
3135        case Character.OTHER_NUMBER:          // N
3136        case Character.COMBINING_SPACING_MARK: // Mc
3137            return WT_LETTER;
3138
3139        case Character.FORMAT:                // Cf
3140        case Character.NON_SPACING_MARK:      // Mn
3141        case Character.ENCLOSING_MARK:        // Mc
3142            return WT_IGNORE;
3143
3144        case Character.CONTROL:               // Cc
3145            switch (ch) {
3146            case '\t':
3147            case '\n':
3148            case '\u000B':
3149            case '\f':
3150            case '\r':
3151                return WT_OTHER;
3152            default:
3153                return WT_IGNORE;
3154            }
3155
3156        default:
3157            return WT_OTHER;
3158        }
3159    }
3160
3161    // ================================================================
3162
3163    static final int LINE_FEED = 0x000A;
3164    static final int CARRIAGE_RETURN = 0x000D;
3165    static final int LINE_SEPARATOR = 0x2028;
3166    static final int PARAGRAPH_SEPARATOR = 0x2029;
3167
3168    private static final boolean isEOLChar(int ch) {
3169        return ch == LINE_FEED || ch == CARRIAGE_RETURN || ch == LINE_SEPARATOR
3170        || ch == PARAGRAPH_SEPARATOR;
3171    }
3172
3173    private static final boolean isWordChar(int ch) { // Legacy word characters
3174        if (ch == '_')  return true;
3175        if (ch < '0')  return false;
3176        if (ch > 'z')  return false;
3177        if (ch <= '9')  return true;
3178        if (ch < 'A')  return false;
3179        if (ch <= 'Z')  return true;
3180        if (ch < 'a')  return false;
3181        return true;
3182    }
3183
3184    private static final boolean matchIgnoreCase(int chardata, int ch) {
3185        if (chardata == ch)  return true;
3186        if (chardata > 0xffff || ch > 0xffff)  return false;
3187        char uch1 = Character.toUpperCase((char)chardata);
3188        char uch2 = Character.toUpperCase((char)ch);
3189        if (uch1 == uch2)  return true;
3190        return Character.toLowerCase(uch1) == Character.toLowerCase(uch2);
3191    }
3192}
3193
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags