KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > xerces > impl > xpath > regex > RegularExpression


1 /*
2  * Copyright 1999-2002,2004,2005 The Apache Software Foundation.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */

16
17 package org.apache.xerces.impl.xpath.regex;
18
19 import java.text.CharacterIterator JavaDoc;
20
21 /**
22  * A regular expression matching engine using Non-deterministic Finite Automaton (NFA).
23  * This engine does not conform to the POSIX regular expression.
24  *
25  * <hr width="50%">
26  * <h3>How to use</h3>
27  *
28  * <dl>
29  * <dt>A. Standard way
30  * <dd>
31  * <pre>
32  * RegularExpression re = new RegularExpression(<var>regex</var>);
33  * if (re.matches(text)) { ... }
34  * </pre>
35  *
36  * <dt>B. Capturing groups
37  * <dd>
38  * <pre>
39  * RegularExpression re = new RegularExpression(<var>regex</var>);
40  * Match match = new Match();
41  * if (re.matches(text, match)) {
42  * ... // You can refer captured texts with methods of the <code>Match</code> class.
43  * }
44  * </pre>
45  *
46  * </dl>
47  *
48  * <h4>Case-insensitive matching</h4>
49  * <pre>
50  * RegularExpression re = new RegularExpression(<var>regex</var>, "i");
51  * if (re.matches(text)) { ... }
52  * </pre>
53  *
54  * <h4>Options</h4>
55  * <p>You can specify options to <a HREF="#RegularExpression(java.lang.String, java.lang.String)"><code>RegularExpression(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>
56  * or <a HREF="#setPattern(java.lang.String, java.lang.String)"><code>setPattern(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>.
57  * This <var>options</var> parameter consists of the following characters.
58  * </p>
59  * <dl>
60  * <dt><a name="I_OPTION"><code>"i"</code></a>
61  * <dd>This option indicates case-insensitive matching.
62  * <dt><a name="M_OPTION"><code>"m"</code></a>
63  * <dd class="REGEX"><kbd>^</kbd> and <kbd>$</kbd> consider the EOL characters within the text.
64  * <dt><a name="S_OPTION"><code>"s"</code></a>
65  * <dd class="REGEX"><kbd>.</kbd> matches any one character.
66  * <dt><a name="U_OPTION"><code>"u"</code></a>
67  * <dd class="REGEX">Redefines <Kbd>\d \D \w \W \s \S \b \B \&lt; \></kbd> so that they conform to Unicode.
68  * <dt><a name="W_OPTION"><code>"w"</code></a>
69  * <dd class="REGEX">By this option, <kbd>\b \B \&lt; \></kbd> are processed with the method of
70  * 'Unicode Regular Expression Guidelines' Revision 4.
71  * When "w" and "u" are specified at the same time,
72  * <kbd>\b \B \&lt; \></kbd> are processed for the "w" option.
73  * <dt><a name="COMMA_OPTION"><code>","</code></a>
74  * <dd>The parser treats a comma in a character class as a range separator.
75  * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option.
76  * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option.
77  *
78  * <dt><a name="X_OPTION"><code>"X"</code></a>
79  * <dd class="REGEX">
80  * By this option, the engine conforms to <a HREF="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>.
81  * The <code>match()</code> method does not do substring matching
82  * but entire string matching.
83  *
84  * </dl>
85  *
86  * <hr width="50%">
87  * <h3>Syntax</h3>
88  * <table border="1" bgcolor="#ddeeff">
89  * <tr>
90  * <td>
91  * <h4>Differences from the Perl 5 regular expression</h4>
92  * <ul>
93  * <li>There is 6-digit hexadecimal character representation (<kbd>\v</kbd><var>HHHHHH</var>.)
94  * <li>Supports subtraction, union, and intersection operations for character classes.
95  * <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations),
96  * <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>,
97  * <kbd>\ u</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>,
98  * <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>,
99  * <Kbd>(?{<kbd><var>code</var><kbd>})</kbd>, <Kbd>(??{<kbd><var>code</var><kbd>})</kbd>
100  * </ul>
101  * </td>
102  * </tr>
103  * </table>
104  *
105  * <P>Meta characters are `<KBD>. * + ? { [ ( ) | \ ^ $</KBD>'.</P>
106  * <ul>
107  * <li>Character
108  * <dl>
109  * <dt class="REGEX"><kbd>.</kbd> (A period)
110  * <dd>Matches any one character except the following characters.
111  * <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D),
112  * PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028)
113  * <dd>This expression matches one code point in Unicode. It can match a pair of surrogates.
114  * <dd>When <a HREF="#S_OPTION">the "s" option</a> is specified,
115  * it matches any character including the above four characters.
116  *
117  * <dt class="REGEX"><Kbd>\e \f \n \r \t</kbd>
118  * <dd>Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A),
119  * CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009)
120  *
121  * <dt class="REGEX"><kbd>\c</kbd><var>C</var>
122  * <dd>Matches a control character.
123  * The <var>C</var> must be one of '<kbd>@</kbd>', '<kbd>A</kbd>'-'<kbd>Z</kbd>',
124  * '<kbd>[</kbd>', '<kbd>\</kbd>', '<kbd>]</kbd>', '<kbd>^</kbd>', '<kbd>_</kbd>'.
125  * It matches a control character of which the character code is less than
126  * the character code of the <var>C</var> by 0x0040.
127  * <dd class="REGEX">For example, a <kbd>\cJ</kbd> matches a LINE FEED (U+000A),
128  * and a <kbd>\c[</kbd> matches an ESCAPE (U+001B).
129  *
130  * <dt class="REGEX">a non-meta character
131  * <dd>Matches the character.
132  *
133  * <dt class="REGEX"><KBD>\</KBD> + a meta character
134  * <dd>Matches the meta character.
135  *
136  * <dt class="REGEX"><kbd>\x</kbd><var>HH</var> <kbd>\x{</kbd><var>HHHH</var><kbd>}</kbd>
137  * <dd>Matches a character of which code point is <var>HH</var> (Hexadecimal) in Unicode.
138  * You can write just 2 digits for <kbd>\x</kbd><var>HH</var>, and
139  * variable length digits for <kbd>\x{</kbd><var>HHHH</var><kbd>}</kbd>.
140  *
141  * <!--
142  * <dt class="REGEX"><kbd>\ u</kbd><var>HHHH</var>
143  * <dd>Matches a character of which code point is <var>HHHH</var> (Hexadecimal) in Unicode.
144  * -->
145  *
146  * <dt class="REGEX"><kbd>\v</kbd><var>HHHHHH</var>
147  * <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode.
148  *
149  * <dt class="REGEX"><kbd>\g</kbd>
150  * <dd>Matches a grapheme.
151  * <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd>
152  *
153  * <dt class="REGEX"><kbd>\X</kbd>
154  * <dd class="REGEX">Matches a combining character sequence.
155  * It is equivalent to <kbd>(?:\PM\pM*)</kbd>
156  * </dl>
157  * </li>
158  *
159  * <li>Character class
160  * <dl>
161  * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without <a HREF="#COMMA_OPTION">"," option</a>)
162  * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with <a HREF="#COMMA_OPTION">"," option</a>)
163  * <dd>Positive character class. It matches a character in ranges.
164  * <dd><var>R<sub>n</sub></var>:
165  * <ul>
166  * <li class="REGEX">A character (including <Kbd>\e \f \n \r \t</kbd> <kbd>\x</kbd><var>HH</var> <kbd>\x{</kbd><var>HHHH</var><kbd>}</kbd> <!--kbd>\ u</kbd><var>HHHH</var--> <kbd>\v</kbd><var>HHHHHH</var>)
167  * <p>This range matches the character.
168  * <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var>
169  * <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and &lt;= <var>C<sub>2</sub></var>'s code point.
170  * <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>,
171  * and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd>
172  * <p>...
173  * <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd>
174  * <p>These expressions specify the same ranges as the following expressions.
175  * </ul>
176  * <p class="REGEX">Enumerated ranges are merged (union operation).
177  * <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd>
178  *
179  * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a HREF="#COMMA_OPTION">"," option</a>)
180  * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a HREF="#COMMA_OPTION">"," option</a>)
181  * <dd>Negative character class. It matches a character not in ranges.
182  *
183  * <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd>
184  * (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.)
185  * <dd>Subtraction or union or intersection for character classes.
186  * <dd class="REGEX">For example, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>.
187  * <dd>The result of these operations is a <u>positive character class</u>
188  * even if an expression includes any negative character classes.
189  * You have to take care on this in case-insensitive matching.
190  * For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>,
191  * which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching.
192  * But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because
193  * it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>'
194  * though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>.
195  *
196  * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a HREF="#X_OPTION">"X" option</a>)</dt>
197  * <dd>Character class subtraction for the XML Schema.
198  * You can use this syntax when you specify an <a HREF="#X_OPTION">"X" option</a>.
199  *
200  * <dt class="REGEX"><kbd>\d</kbd>
201  * <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>.
202  * <dd>When <a HREF="#U_OPTION">a "u" option</a> is set, it is equivalent to
203  * <span class="REGEX"><kbd>\p{Nd}</kbd></span>.
204  *
205  * <dt class="REGEX"><kbd>\D</kbd>
206  * <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd>
207  * <dd>When <a HREF="#U_OPTION">a "u" option</a> is set, it is equivalent to
208  * <span class="REGEX"><kbd>\P{Nd}</kbd></span>.
209  *
210  * <dt class="REGEX"><kbd>\s</kbd>
211  * <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd>
212  * <dd>When <a HREF="#U_OPTION">a "u" option</a> is set, it is equivalent to
213  * <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>.
214  *
215  * <dt class="REGEX"><kbd>\S</kbd>
216  * <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd>
217  * <dd>When <a HREF="#U_OPTION">a "u" option</a> is set, it is equivalent to
218  * <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>.
219  *
220  * <dt class="REGEX"><kbd>\w</kbd>
221  * <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd>
222  * <dd>When <a HREF="#U_OPTION">a "u" option</a> is set, it is equivalent to
223  * <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
224  *
225  * <dt class="REGEX"><kbd>\W</kbd>
226  * <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd>
227  * <dd>When <a HREF="#U_OPTION">a "u" option</a> is set, it is equivalent to
228  * <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
229  *
230  * <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd>
231  * <dd>Matches one character in the specified General Category (the second field in <a HREF="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a HREF="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>.
232  * The following names are available:
233  * <dl>
234  * <dt>Unicode General Categories:
235  * <dd><kbd>
236  * L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs, Zl, Zp,
237  * Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So,
238  * </kbd>
239  * <dd>(Currently the Cn category includes U+10000-U+10FFFF characters)
240  * <dt>Unicode Blocks:
241  * <dd><kbd>
242  * Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B,
243  * IPA Extensions, Spacing Modifier Letters, Combining Diacritical Marks, Greek,
244  * Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi, Gujarati,
245  * Oriya, Tamil, Telugu, Kannada, Malayalam, Thai, Lao, Tibetan, Georgian,
246  * Hangul Jamo, Latin Extended Additional, Greek Extended, General Punctuation,
247  * Superscripts and Subscripts, Currency Symbols, Combining Marks for Symbols,
248  * Letterlike Symbols, Number Forms, Arrows, Mathematical Operators,
249  * Miscellaneous Technical, Control Pictures, Optical Character Recognition,
250  * Enclosed Alphanumerics, Box Drawing, Block Elements, Geometric Shapes,
251  * Miscellaneous Symbols, Dingbats, CJK Symbols and Punctuation, Hiragana,
252  * Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun,
253  * Enclosed CJK Letters and Months, CJK Compatibility, CJK Unified Ideographs,
254  * Hangul Syllables, High Surrogates, High Private Use Surrogates, Low Surrogates,
255  * Private Use, CJK Compatibility Ideographs, Alphabetic Presentation Forms,
256  * Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility Forms,
257  * Small Form Variants, Arabic Presentation Forms-B, Specials,
258  * Halfwidth and Fullwidth Forms
259  * </kbd>
260  * <dt>Others:
261  * <dd><kbd>ALL</kbd> (Equivalent to <kbd>[\u0000-\v10FFFF]</kbd>)
262  * <dd><kbd>ASSIGNED</kbd> (<kbd>\p{ASSIGNED}</kbd> is equivalent to <kbd>\P{Cn}</kbd>)
263  * <dd><kbd>UNASSIGNED</kbd>
264  * (<kbd>\p{UNASSIGNED}</kbd> is equivalent to <kbd>\p{Cn}</kbd>)
265  * </dl>
266  *
267  * <dt class="REGEX"><kbd>\P{</kbd><var>name</var><kbd>}</kbd>
268  * <dd>Matches one character not in the specified General Category or the specified Block.
269  * </dl>
270  * </li>
271  *
272  * <li>Selection and Quantifier
273  * <dl>
274  * <dt class="REGEX"><VAR>X</VAR><kbd>|</kbd><VAR>Y</VAR>
275  * <dd>...
276  *
277  * <dt class="REGEX"><VAR>X</VAR><kbd>*</KBD>
278  * <dd>Matches 0 or more <var>X</var>.
279  *
280  * <dt class="REGEX"><VAR>X</VAR><kbd>+</KBD>
281  * <dd>Matches 1 or more <var>X</var>.
282  *
283  * <dt class="REGEX"><VAR>X</VAR><kbd>?</KBD>
284  * <dd>Matches 0 or 1 <var>X</var>.
285  *
286  * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>number</var><kbd>}</kbd>
287  * <dd>Matches <var>number</var> times.
288  *
289  * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}</kbd>
290  * <dd>...
291  *
292  * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}</kbd>
293  * <dd>...
294  *
295  * <dt class="REGEX"><VAR>X</VAR><kbd>*?</kbd>
296  * <dt class="REGEX"><VAR>X</VAR><kbd>+?</kbd>
297  * <dt class="REGEX"><VAR>X</VAR><kbd>??</kbd>
298  * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}?</kbd>
299  * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}?</kbd>
300  * <dd>Non-greedy matching.
301  * </dl>
302  * </li>
303  *
304  * <li>Grouping, Capturing, and Back-reference
305  * <dl>
306  * <dt class="REGEX"><KBD>(?:</kbd><VAR>X</VAR><kbd>)</KBD>
307  * <dd>Grouping. "<KBD>foo+</KBD>" matches "<KBD>foo</KBD>" or "<KBD>foooo</KBD>".
308  * If you want it to match "<KBD>foofoo</KBD>" or "<KBD>foofoofoo</KBD>",
309  * you have to write "<KBD>(?:foo)+</KBD>".
310  *
311  * <dt class="REGEX"><KBD>(</kbd><VAR>X</VAR><kbd>)</KBD>
312  * <dd>Grouping with capturing.
313  * It makes a group and applications can know
314  * where in target text a group matched with methods of a <code>Match</code> instance
315  * after <code><a HREF="#matches(java.lang.String, org.apache.xerces.utils.regex.Match)">matches(String,Match)</a></code>.
316  * The 0th group means whole of this regular expression.
317  * The <VAR>N</VAR>th group is the inside of the <VAR>N</VAR>th left parenthesis.
318  *
319  * <p>For instance, a regular expression is
320  * "<FONT color=blue><KBD> *([^&lt;:]*) +&lt;([^&gt;]*)&gt; *</KBD></FONT>"
321  * and target text is
322  * "<FONT color=red><KBD>From: TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>":
323  * <ul>
324  * <li><code>Match.getCapturedText(0)</code>:
325  * "<FONT color=red><KBD> TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>"
326  * <li><code>Match.getCapturedText(1)</code>: "<FONT color=red><KBD>TAMURA Kent</KBD></FONT>"
327  * <li><code>Match.getCapturedText(2)</code>: "<FONT color=red><KBD>kent@trl.ibm.co.jp</KBD></FONT>"
328  * </ul>
329  *
330  * <dt class="REGEX"><kbd>\1 \2 \3 \4 \5 \6 \7 \8 \9</kbd>
331  * <dd>
332  *
333  * <dt class="REGEX"><kbd>(?></kbd><var>X</var><kbd>)</kbd>
334  * <dd>Independent expression group. ................
335  *
336  * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
337  * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
338  * <dd>............................
339  * <dd>The <var>options</var> or the <var>options2</var> consists of 'i' 'm' 's' 'w'.
340  * Note that it can not contain 'u'.
341  *
342  * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>)</kbd>
343  * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>)</kbd>
344  * <dd>......
345  * <dd>These expressions must be at the beginning of a group.
346  * </dl>
347  * </li>
348  *
349  * <li>Anchor
350  * <dl>
351  * <dt class="REGEX"><kbd>\A</kbd>
352  * <dd>Matches the beginning of the text.
353  *
354  * <dt class="REGEX"><kbd>\Z</kbd>
355  * <dd>Matches the end of the text, or before an EOL character at the end of the text,
356  * or CARRIAGE RETURN + LINE FEED at the end of the text.
357  *
358  * <dt class="REGEX"><kbd>\z</kbd>
359  * <dd>Matches the end of the text.
360  *
361  * <dt class="REGEX"><kbd>^</kbd>
362  * <dd>Matches the beginning of the text. It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>.
363  * <dd>When <a HREF="#M_OPTION">a "m" option</a> is set,
364  * it matches the beginning of the text, or after one of EOL characters (
365  * LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028),
366  * PARAGRAPH SEPARATOR (U+2029).)
367  *
368  * <dt class="REGEX"><kbd>$</kbd>
369  * <dd>Matches the end of the text, or before an EOL character at the end of the text,
370  * or CARRIAGE RETURN + LINE FEED at the end of the text.
371  * <dd>When <a HREF="#M_OPTION">a "m" option</a> is set,
372  * it matches the end of the text, or before an EOL character.
373  *
374  * <dt class="REGEX"><kbd>\b</kbd>
375  * <dd>Matches word boundary.
376  * (See <a HREF="#W_OPTION">a "w" option</a>)
377  *
378  * <dt class="REGEX"><kbd>\B</kbd>
379  * <dd>Matches non word boundary.
380  * (See <a HREF="#W_OPTION">a "w" option</a>)
381  *
382  * <dt class="REGEX"><kbd>\&lt;</kbd>
383  * <dd>Matches the beginning of a word.
384  * (See <a HREF="#W_OPTION">a "w" option</a>)
385  *
386  * <dt class="REGEX"><kbd>\&gt;</kbd>
387  * <dd>Matches the end of a word.
388  * (See <a HREF="#W_OPTION">a "w" option</a>)
389  * </dl>
390  * </li>
391  * <li>Lookahead and lookbehind
392  * <dl>
393  * <dt class="REGEX"><kbd>(?=</kbd><var>X</var><kbd>)</kbd>
394  * <dd>Lookahead.
395  *
396  * <dt class="REGEX"><kbd>(?!</kbd><var>X</var><kbd>)</kbd>
397  * <dd>Negative lookahead.
398  *
399  * <dt class="REGEX"><kbd>(?&lt;=</kbd><var>X</var><kbd>)</kbd>
400  * <dd>Lookbehind.
401  * <dd>(Note for text capturing......)
402  *
403  * <dt class="REGEX"><kbd>(?&lt;!</kbd><var>X</var><kbd>)</kbd>
404  * <dd>Negative lookbehind.
405  * </dl>
406  * </li>
407  *
408  * <li>Misc.
409  * <dl>
410  * <dt class="REGEX"><kbd>(?(</Kbd><var>condition</var><Kbd>)</kbd><var>yes-pattern</var><kbd>|</kbd><var>no-pattern</var><kbd>)</kbd>,
411  * <dt class="REGEX"><kbd>(?(</kbd><var>condition</var><kbd>)</kbd><var>yes-pattern</var><kbd>)</kbd>
412  * <dd>......
413  * <dt class="REGEX"><kbd>(?#</kbd><var>comment</var><kbd>)</kbd>
414  * <dd>Comment. A comment string consists of characters except '<kbd>)</kbd>'.
415  * You can not write comments in character classes and before quantifiers.
416  * </dl>
417  * </li>
418  * </ul>
419  *
420  *
421  * <hr width="50%">
422  * <h3>BNF for the regular expression</h3>
423  * <pre>
424  * regex ::= ('(?' options ')')? term ('|' term)*
425  * term ::= factor+
426  * factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )?
427  * | '(?#' [^)]* ')'
428  * minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}'
429  * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
430  * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X'
431  * | '(?>' regex ')' | '(?' options ':' regex ')'
432  * | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')'
433  * options ::= [imsw]* ('-' [imsw]+)?
434  * anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\&lt;' | '\>'
435  * looks ::= '(?=' regex ')' | '(?!' regex ')'
436  * | '(?&lt;=' regex ')' | '(?&lt;!' regex ')'
437  * char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1
438  * category-block ::= '\' [pP] category-symbol-1
439  * | ('\p{' | '\P{') (category-symbol | block-name
440  * | other-properties) '}'
441  * category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S'
442  * category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | 'Lo'
443  * | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No'
444  * | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs'
445  * | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po'
446  * | 'Sm' | 'Sc' | 'Sk' | 'So'
447  * block-name ::= (See above)
448  * other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED'
449  * character-1 ::= (any character except meta-characters)
450  *
451  * char-class ::= '[' ranges ']'
452  * | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')'
453  * ranges ::= '^'? (range <a HREF="#COMMA_OPTION">','?</a>)+
454  * range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block
455  * | range-char | range-char '-' range-char
456  * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2
457  * code-point ::= '\x' hex-char hex-char
458  * | '\x{' hex-char+ '}'
459  * <!-- | '\ u' hex-char hex-char hex-char hex-char
460  * --> | '\v' hex-char hex-char hex-char hex-char hex-char hex-char
461  * hex-char ::= [0-9a-fA-F]
462  * character-2 ::= (any character except \[]-,)
463  * </pre>
464  *
465  * <hr width="50%">
466  * <h3>TODO</h3>
467  * <ul>
468  * <li><a HREF="http://www.unicode.org/unicode/reports/tr18/">Unicode Regular Expression Guidelines</a>
469  * <ul>
470  * <li>2.4 Canonical Equivalents
471  * <li>Level 3
472  * </ul>
473  * <li>Parsing performance
474  * </ul>
475  *
476  * <hr width="50%">
477  *
478  * @xerces.internal
479  *
480  * @author TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;
481  * @version $Id: RegularExpression.java,v 1.10 2005/03/22 03:26:24 mrglavas Exp $
482  */

483 public class RegularExpression implements java.io.Serializable JavaDoc {
484     
485     private static final long serialVersionUID = 3905241217112815923L;
486
487     static final boolean DEBUG = false;
488
489     /**
490      * Compiles a token tree into an operation flow.
491      */

492     private synchronized void compile(Token tok) {
493         if (this.operations != null)
494             return;
495         this.numberOfClosures = 0;
496         this.operations = this.compile(tok, null, false);
497     }
498
499     /**
500      * Converts a token to an operation.
501      */

502     private Op compile(Token tok, Op next, boolean reverse) {
503         Op ret;
504         switch (tok.type) {
505         case Token.DOT:
506             ret = Op.createDot();
507             ret.next = next;
508             break;
509
510         case Token.CHAR:
511             ret = Op.createChar(tok.getChar());
512             ret.next = next;
513             break;
514
515         case Token.ANCHOR:
516             ret = Op.createAnchor(tok.getChar());
517             ret.next = next;
518             break;
519
520         case Token.RANGE:
521         case Token.NRANGE:
522             ret = Op.createRange(tok);
523             ret.next = next;
524             break;
525
526         case Token.CONCAT:
527             ret = next;
528             if (!reverse) {
529                 for (int i = tok.size()-1; i >= 0; i --) {
530                     ret = compile(tok.getChild(i), ret, false);
531                 }
532             } else {
533                 for (int i = 0; i < tok.size(); i ++) {
534                     ret = compile(tok.getChild(i), ret, true);
535                 }
536             }
537             break;
538
539         case Token.UNION:
540             Op.UnionOp uni = Op.createUnion(tok.size());
541             for (int i = 0; i < tok.size(); i ++) {
542                 uni.addElement(compile(tok.getChild(i), next, reverse));
543             }
544             ret = uni; // ret.next is null.
545
break;
546
547         case Token.CLOSURE:
548         case Token.NONGREEDYCLOSURE:
549             Token child = tok.getChild(0);
550             int min = tok.getMin();
551             int max = tok.getMax();
552             if (min >= 0 && min == max) { // {n}
553
ret = next;
554                 for (int i = 0; i < min; i ++) {
555                     ret = compile(child, ret, reverse);
556                 }
557                 break;
558             }
559             if (min > 0 && max > 0)
560                 max -= min;
561             if (max > 0) {
562                 // X{2,6} -> XX(X(X(XX?)?)?)?
563
ret = next;
564                 for (int i = 0; i < max; i ++) {
565                     Op.ChildOp q = Op.createQuestion(tok.type == Token.NONGREEDYCLOSURE);
566                     q.next = next;
567                     q.setChild(compile(child, ret, reverse));
568                     ret = q;
569                 }
570             } else {
571                 Op.ChildOp op;
572                 if (tok.type == Token.NONGREEDYCLOSURE) {
573                     op = Op.createNonGreedyClosure();
574                 } else { // Token.CLOSURE
575
if (child.getMinLength() == 0)
576                         op = Op.createClosure(this.numberOfClosures++);
577                     else
578                         op = Op.createClosure(-1);
579                 }
580                 op.next = next;
581                 op.setChild(compile(child, op, reverse));
582                 ret = op;
583             }
584             if (min > 0) {
585                 for (int i = 0; i < min; i ++) {
586                     ret = compile(child, ret, reverse);
587                 }
588             }
589             break;
590
591         case Token.EMPTY:
592             ret = next;
593             break;
594
595         case Token.STRING:
596             ret = Op.createString(tok.getString());
597             ret.next = next;
598             break;
599
600         case Token.BACKREFERENCE:
601             ret = Op.createBackReference(tok.getReferenceNumber());
602             ret.next = next;
603             break;
604
605         case Token.PAREN:
606             if (tok.getParenNumber() == 0) {
607                 ret = compile(tok.getChild(0), next, reverse);
608             } else if (reverse) {
609                 next = Op.createCapture(tok.getParenNumber(), next);
610                 next = compile(tok.getChild(0), next, reverse);
611                 ret = Op.createCapture(-tok.getParenNumber(), next);
612             } else {
613                 next = Op.createCapture(-tok.getParenNumber(), next);
614                 next = compile(tok.getChild(0), next, reverse);
615                 ret = Op.createCapture(tok.getParenNumber(), next);
616             }
617             break;
618
619         case Token.LOOKAHEAD:
620             ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok.getChild(0), null, false));
621             break;
622         case Token.NEGATIVELOOKAHEAD:
623             ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok.getChild(0), null, false));
624             break;
625         case Token.LOOKBEHIND:
626             ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok.getChild(0), null, true));
627             break;
628         case Token.NEGATIVELOOKBEHIND:
629             ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(tok.getChild(0), null, true));
630             break;
631
632         case Token.INDEPENDENT:
633             ret = Op.createIndependent(next, compile(tok.getChild(0), null, reverse));
634             break;
635
636         case Token.MODIFIERGROUP:
637             ret = Op.createModifier(next, compile(tok.getChild(0), null, reverse),
638                                     ((Token.ModifierToken)tok).getOptions(),
639                                     ((Token.ModifierToken)tok).getOptionsMask());
640             break;
641
642         case Token.CONDITION:
643             Token.ConditionToken ctok = (Token.ConditionToken)tok;
644             int ref = ctok.refNumber;
645             Op condition = ctok.condition == null ? null : compile(ctok.condition, null, reverse);
646             Op yes = compile(ctok.yes, next, reverse);
647             Op no = ctok.no == null ? null : compile(ctok.no, next, reverse);
648             ret = Op.createCondition(next, ref, condition, yes, no);
649             break;
650
651         default:
652             throw new RuntimeException JavaDoc("Unknown token type: "+tok.type);
653         } // switch (tok.type)
654
return ret;
655     }
656
657
658 //Public
659

660     /**
661      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
662      *
663      * @return true if the target is matched to this regular expression.
664      */

665     public boolean matches(char[] target) {
666         return this.matches(target, 0, target .length , (Match)null);
667     }
668
669     /**
670      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
671      * in specified range or not.
672      *
673      * @param start Start offset of the range.
674      * @param end End offset +1 of the range.
675      * @return true if the target is matched to this regular expression.
676      */

677     public boolean matches(char[] target, int start, int end) {
678         return this.matches(target, start, end, (Match)null);
679     }
680
681     /**
682      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
683      *
684      * @param match A Match instance for storing matching result.
685      * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
686      */

687     public boolean matches(char[] target, Match match) {
688         return this.matches(target, 0, target .length , match);
689     }
690
691
692     /**
693      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
694      * in specified range or not.
695      *
696      * @param start Start offset of the range.
697      * @param end End offset +1 of the range.
698      * @param match A Match instance for storing matching result.
699      * @return true if the target is matched to this regular expression.
700      */

701     public boolean matches(char[] target, int start, int end, Match match) {
702
703         synchronized (this) {
704             if (this.operations == null)
705                 this.prepare();
706             if (this.context == null)
707                 this.context = new Context();
708         }
709         Context con = null;
710         synchronized (this.context) {
711             con = this.context.inuse ? new Context() : this.context;
712             con.reset(target, start, end, this.numberOfClosures);
713         }
714         if (match != null) {
715             match.setNumberOfGroups(this.nofparen);
716             match.setSource(target);
717         } else if (this.hasBackReferences) {
718             match = new Match();
719             match.setNumberOfGroups(this.nofparen);
720             // Need not to call setSource() because
721
// a caller can not access this match instance.
722
}
723         con.match = match;
724
725         if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
726             int matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options);
727             //System.err.println("DEBUG: matchEnd="+matchEnd);
728
if (matchEnd == con.limit) {
729                 if (con.match != null) {
730                     con.match.setBeginning(0, con.start);
731                     con.match.setEnd(0, matchEnd);
732                 }
733                 con.inuse = false;
734                 return true;
735             }
736             return false;
737         }
738
739         /*
740          * The pattern has only fixed string.
741          * The engine uses Boyer-Moore.
742          */

743         if (this.fixedStringOnly) {
744             //System.err.println("DEBUG: fixed-only: "+this.fixedString);
745
int o = this.fixedStringTable.matches(target, con.start, con.limit);
746             if (o >= 0) {
747                 if (con.match != null) {
748                     con.match.setBeginning(0, o);
749                     con.match.setEnd(0, o+this.fixedString.length());
750                 }
751                 con.inuse = false;
752                 return true;
753             }
754             con.inuse = false;
755             return false;
756         }
757
758         /*
759          * The pattern contains a fixed string.
760          * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
761          * If not, it return with false.
762          */

763         if (this.fixedString != null) {
764             int o = this.fixedStringTable.matches(target, con.start, con.limit);
765             if (o < 0) {
766                 //System.err.println("Non-match in fixed-string search.");
767
con.inuse = false;
768                 return false;
769             }
770         }
771
772         int limit = con.limit-this.minlength;
773         int matchStart;
774         int matchEnd = -1;
775
776         /*
777          * Checks whether the expression starts with ".*".
778          */

779         if (this.operations != null
780             && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
781             if (isSet(this.options, SINGLE_LINE)) {
782                 matchStart = con.start;
783                 matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options);
784             } else {
785                 boolean previousIsEOL = true;
786                 for (matchStart = con.start; matchStart <= limit; matchStart ++) {
787                     int ch = target [ matchStart ] ;
788                     if (isEOLChar(ch)) {
789                         previousIsEOL = true;
790                     } else {
791                         if (previousIsEOL) {
792                             if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
793                                                                        matchStart, 1, this.options)))
794                                 break;
795                         }
796                         previousIsEOL = false;
797                     }
798                 }
799             }
800         }
801
802         /*
803          * Optimization against the first character.
804          */

805         else if (this.firstChar != null) {
806             //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
807
RangeToken range = this.firstChar;
808             if (RegularExpression.isSet(this.options, IGNORE_CASE)) {
809                 range = this.firstChar.getCaseInsensitiveToken();
810                 for (matchStart = con.start; matchStart <= limit; matchStart ++) {
811                     int ch = target [ matchStart ] ;
812                     if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
813                         ch = REUtil.composeFromSurrogates(ch, target [ matchStart+1 ] );
814                         if (!range.match(ch)) continue;
815                     } else {
816                         if (!range.match(ch)) {
817                             char ch1 = Character.toUpperCase((char)ch);
818                             if (!range.match(ch1))
819                                 if (!range.match(Character.toLowerCase(ch1)))
820                                     continue;
821                         }
822                     }
823                     if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
824                                                                matchStart, 1, this.options)))
825                         break;
826                 }
827             } else {
828                 for (matchStart = con.start; matchStart <= limit; matchStart ++) {
829                     int ch = target [ matchStart ] ;
830                     if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
831                         ch = REUtil.composeFromSurrogates(ch, target [ matchStart+1 ] );
832                     if (!range.match(ch)) continue;
833                     if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
834                                                                matchStart, 1, this.options)))
835                         break;
836                 }
837             }
838         }
839
840         /*
841          * Straightforward matching.
842          */

843         else {
844             for (matchStart = con.start; matchStart <= limit; matchStart ++) {
845                 if (0 <= (matchEnd = this. matchCharArray (con, this.operations, matchStart, 1, this.options)))
846                     break;
847             }
848         }
849
850         if (matchEnd >= 0) {
851             if (con.match != null) {
852                 con.match.setBeginning(0, matchStart);
853                 con.match.setEnd(0, matchEnd);
854             }
855             con.inuse = false;
856             return true;
857         } else {
858             con.inuse = false;
859             return false;
860         }
861     }
862
863 /**
864  * @return -1 when not match; offset of the end of matched string when match.
865  */

866     private int matchCharArray (Context con, Op op, int offset, int dx, int opts) {
867
868         char[] target = con.charTarget;
869
870
871         while (true) {
872             if (op == null)
873                 return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
874             if (offset > con.limit || offset < con.start)
875                 return -1;
876             switch (op.type) {
877             case Op.CHAR:
878                 if (isSet(opts, IGNORE_CASE)) {
879                     int ch = op.getData();
880                     if (dx > 0) {
881                         if (offset >= con.limit || !matchIgnoreCase(ch, target [ offset ] ))
882                             return -1;
883                         offset ++;
884                     } else {
885                         int o1 = offset-1;
886                         if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch, target [ o1 ] ))
887                             return -1;
888                         offset = o1;
889                     }
890                 } else {
891                     int ch = op.getData();
892                     if (dx > 0) {
893                         if (offset >= con.limit || ch != target [ offset ] )
894                             return -1;
895                         offset ++;
896                     } else {
897                         int o1 = offset-1;
898                         if (o1 >= con.limit || o1 < 0 || ch != target [ o1 ] )
899                             return -1;
900                         offset = o1;
901                     }
902                 }
903                 op = op.next;
904                 break;
905
906             case Op.DOT:
907                 if (dx > 0) {
908                     if (offset >= con.limit)
909                         return -1;
910                     int ch = target [ offset ] ;
911                     if (isSet(opts, SINGLE_LINE)) {
912                         if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
913                             offset ++;
914                     } else {
915                         if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
916                             ch = REUtil.composeFromSurrogates(ch, target [ ++offset ] );
917                         if (isEOLChar(ch))
918                             return -1;
919                     }
920                     offset ++;
921                 } else {
922                     int o1 = offset-1;
923                     if (o1 >= con.limit || o1 < 0)
924                         return -1;
925                     int ch = target [ o1 ] ;
926                     if (isSet(opts, SINGLE_LINE)) {
927                         if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
928                             o1 --;
929                     } else {
930                         if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
931                             ch = REUtil.composeFromSurrogates( target [ --o1 ] , ch);
932                         if (!isEOLChar(ch))
933                             return -1;
934                     }
935                     offset = o1;
936                 }
937                 op = op.next;
938                 break;
939
940             case Op.RANGE:
941             case Op.NRANGE:
942                 if (dx > 0) {
943                     if (offset >= con.limit)
944                         return -1;
945                     int ch = target [ offset ] ;
946                     if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
947                         ch = REUtil.composeFromSurrogates(ch, target [ ++offset ] );
948                     RangeToken tok = op.getToken();
949                     if (isSet(opts, IGNORE_CASE)) {
950                         tok = tok.getCaseInsensitiveToken();
951                         if (!tok.match(ch)) {
952                             if (ch >= 0x10000) return -1;
953                             char uch;
954                             if (!tok.match(uch = Character.toUpperCase((char)ch))
955                                 && !tok.match(Character.toLowerCase(uch)))
956                                 return -1;
957                         }
958                     } else {
959                         if (!tok.match(ch)) return -1;
960                     }
961                     offset ++;
962                 } else {
963                     int o1 = offset-1;
964                     if (o1 >= con.limit || o1 < 0)
965                         return -1;
966                     int ch = target [ o1 ] ;
967                     if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
968                         ch = REUtil.composeFromSurrogates( target [ --o1 ] , ch);
969                     RangeToken tok = op.getToken();
970                     if (isSet(opts, IGNORE_CASE)) {
971                         tok = tok.getCaseInsensitiveToken();
972                         if (!tok.match(ch)) {
973                             if (ch >= 0x10000) return -1;
974                             char uch;
975                             if (!tok.match(uch = Character.toUpperCase((char)ch))
976                                 && !tok.match(Character.toLowerCase(uch)))
977                                 return -1;
978                         }
979                     } else {
980                         if (!tok.match(ch)) return -1;
981                     }
982                     offset = o1;
983                 }
984                 op = op.next;
985                 break;
986
987             case Op.ANCHOR:
988                 boolean go = false;
989                 switch (op.getData()) {
990                 case '^':
991                     if (isSet(opts, MULTIPLE_LINES)) {
992                         if (!(offset == con.start
993                               || offset > con.start && isEOLChar( target [ offset-1 ] )))
994                             return -1;
995                     } else {
996                         if (offset != con.start)
997                             return -1;
998                     }
999                     break;
1000
1001                case '@': // Internal use only.
1002
// The @ always matches line beginnings.
1003
if (!(offset == con.start
1004                          || offset > con.start && isEOLChar( target [ offset-1 ] )))
1005                        return -1;
1006                    break;
1007
1008                case '$':
1009                    if (isSet(opts, MULTIPLE_LINES)) {
1010                        if (!(offset == con.limit
1011                              || offset < con.limit && isEOLChar( target [ offset ] )))
1012                            return -1;
1013                    } else {
1014                        if (!(offset == con.limit
1015                              || offset+1 == con.limit && isEOLChar( target [ offset ] )
1016                              || offset+2 == con.limit && target [ offset ] == CARRIAGE_RETURN
1017                              && target [ offset+1 ] == LINE_FEED))
1018                            return -1;
1019                    }
1020                    break;
1021
1022                case 'A':
1023                    if (offset != con.start) return -1;
1024                    break;
1025
1026                case 'Z':
1027                    if (!(offset == con.limit
1028                          || offset+1 == con.limit && isEOLChar( target [ offset ] )
1029                          || offset+2 == con.limit && target [ offset ] == CARRIAGE_RETURN
1030                          && target [ offset+1 ] == LINE_FEED))
1031                        return -1;
1032                    break;
1033
1034                case 'z':
1035                    if (offset != con.limit) return -1;
1036                    break;
1037
1038                case 'b':
1039                    if (con.length == 0) return -1;
1040                    {
1041                        int after = getWordType(target, con.start, con.limit, offset, opts);
1042                        if (after == WT_IGNORE) return -1;
1043                        int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
1044                        if (after == before) return -1;
1045                    }
1046                    break;
1047
1048                case 'B':
1049                    if (con.length == 0)
1050                        go = true;
1051                    else {
1052                        int after = getWordType(target, con.start, con.limit, offset, opts);
1053                        go = after == WT_IGNORE
1054                             || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
1055                    }
1056                    if (!go) return -1;
1057                    break;
1058
1059                case '<':
1060                    if (con.length == 0 || offset == con.limit) return -1;
1061                    if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
1062                        || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
1063                        return -1;
1064                    break;
1065
1066                case '>':
1067                    if (con.length == 0 || offset == con.start) return -1;
1068                    if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
1069                        || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
1070                        return -1;
1071                    break;
1072                } // switch anchor type
1073
op = op.next;
1074                break;
1075
1076            case Op.BACKREFERENCE:
1077                {
1078                    int refno = op.getData();
1079                    if (refno <= 0 || refno >= this.nofparen)
1080                        throw new RuntimeException JavaDoc("Internal Error: Reference number must be more than zero: "+refno);
1081                    if (con.match.getBeginning(refno) < 0
1082                        || con.match.getEnd(refno) < 0)
1083                        return -1; // ********
1084
int o2 = con.match.getBeginning(refno);
1085                    int literallen = con.match.getEnd(refno)-o2;
1086                    if (!isSet(opts, IGNORE_CASE)) {
1087                        if (dx > 0) {
1088                            if (!regionMatches(target, offset, con.limit, o2, literallen))
1089                                return -1;
1090                            offset += literallen;
1091                        } else {
1092                            if (!regionMatches(target, offset-literallen, con.limit, o2, literallen))
1093                                return -1;
1094                            offset -= literallen;
1095                        }
1096                    } else {
1097                        if (dx > 0) {
1098                            if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen))
1099                                return -1;
1100                            offset += literallen;
1101                        } else {
1102                            if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
1103                                                         o2, literallen))
1104                                return -1;
1105                            offset -= literallen;
1106                        }
1107                    }
1108                }
1109                op = op.next;
1110                break;
1111            case Op.STRING:
1112                {
1113                    String JavaDoc literal = op.getString();
1114                    int literallen = literal.length();
1115                    if (!isSet(opts, IGNORE_CASE)) {
1116                        if (dx > 0) {
1117                            if (!regionMatches(target, offset, con.limit, literal, literallen))
1118                                return -1;
1119                            offset += literallen;
1120                        } else {
1121                            if (!regionMatches(target, offset-literallen, con.limit, literal, literallen))
1122                                return -1;
1123                            offset -= literallen;
1124                        }
1125                    } else {
1126                        if (dx > 0) {
1127                            if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen))
1128                                return -1;
1129                            offset += literallen;
1130                        } else {
1131                            if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
1132                                                         literal, literallen))
1133                                return -1;
1134                            offset -= literallen;
1135                        }
1136                    }
1137                }
1138                op = op.next;
1139                break;
1140
1141            case Op.CLOSURE:
1142                {
1143                    /*
1144                     * Saves current position to avoid
1145                     * zero-width repeats.
1146                     */

1147                    int id = op.getData();
1148                    if (id >= 0) {
1149                        int previousOffset = con.offsets[id];
1150                        if (previousOffset < 0 || previousOffset != offset) {
1151                            con.offsets[id] = offset;
1152                        } else {
1153                            con.offsets[id] = -1;
1154                            op = op.next;
1155                            break;
1156                        }
1157                    }
1158
1159                    int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
1160                    if (id >= 0) con.offsets[id] = -1;
1161                    if (ret >= 0) return ret;
1162                    op = op.next;
1163                }
1164                break;
1165
1166            case Op.QUESTION:
1167                {
1168                    int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
1169                    if (ret >= 0) return ret;
1170                    op = op.next;
1171                }
1172                break;
1173
1174            case Op.NONGREEDYCLOSURE:
1175            case Op.NONGREEDYQUESTION:
1176                {
1177                    int ret = this. matchCharArray (con, op.next, offset, dx, opts);
1178                    if (ret >= 0) return ret;
1179                    op = op.getChild();
1180                }
1181                break;
1182
1183            case Op.UNION:
1184                for (int i = 0; i < op.size(); i ++) {
1185                    int ret = this. matchCharArray (con, op.elementAt(i), offset, dx, opts);
1186                    if (DEBUG) {
1187                        System.err.println("UNION: "+i+", ret="+ret);
1188                    }
1189                    if (ret >= 0) return ret;
1190                }
1191                return -1;
1192
1193            case Op.CAPTURE:
1194                int refno = op.getData();
1195                if (con.match != null && refno > 0) {
1196                    int save = con.match.getBeginning(refno);
1197                    con.match.setBeginning(refno, offset);
1198                    int ret = this. matchCharArray (con, op.next, offset, dx, opts);
1199                    if (ret < 0) con.match.setBeginning(refno, save);
1200                    return ret;
1201                } else if (con.match != null && refno < 0) {
1202                    int index = -refno;
1203                    int save = con.match.getEnd(index);
1204                    con.match.setEnd(index, offset);
1205                    int ret = this. matchCharArray (con, op.next, offset, dx, opts);
1206                    if (ret < 0) con.match.setEnd(index, save);
1207                    return ret;
1208                }
1209                op = op.next;
1210                break;
1211
1212            case Op.LOOKAHEAD:
1213                if (0 > this. matchCharArray (con, op.getChild(), offset, 1, opts)) return -1;
1214                op = op.next;
1215                break;
1216            case Op.NEGATIVELOOKAHEAD:
1217                if (0 <= this. matchCharArray (con, op.getChild(), offset, 1, opts)) return -1;
1218                op = op.next;
1219                break;
1220            case Op.LOOKBEHIND:
1221                if (0 > this. matchCharArray (con, op.getChild(), offset, -1, opts)) return -1;
1222                op = op.next;
1223                break;
1224            case Op.NEGATIVELOOKBEHIND:
1225                if (0 <= this. matchCharArray (con, op.getChild(), offset, -1, opts)) return -1;
1226                op = op.next;
1227                break;
1228
1229            case Op.INDEPENDENT:
1230                {
1231                    int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
1232                    if (ret < 0) return ret;
1233                    offset = ret;
1234                    op = op.next;
1235                }
1236                break;
1237
1238            case Op.MODIFIER:
1239                {
1240                    int localopts = opts;
1241                    localopts |= op.getData();
1242                    localopts &= ~op.getData2();
1243                    //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16));
1244
int ret = this. matchCharArray (con, op.getChild(), offset, dx, localopts);
1245                    if (ret < 0) return ret;
1246                    offset = ret;
1247                    op = op.next;
1248                }
1249                break;
1250
1251            case Op.CONDITION:
1252                {
1253                    Op.ConditionOp cop = (Op.ConditionOp)op;
1254                    boolean matchp = false;
1255                    if (cop.refNumber > 0) {
1256                        if (cop.refNumber >= this.nofparen)
1257                            throw new RuntimeException JavaDoc("Internal Error: Reference number must be more than zero: "+cop.refNumber);
1258                        matchp = con.match.getBeginning(cop.refNumber) >= 0
1259                                 && con.match.getEnd(cop.refNumber) >= 0;
1260                    } else {
1261                        matchp = 0 <= this. matchCharArray (con, cop.condition, offset, dx, opts);
1262                    }
1263
1264                    if (matchp) {
1265                        op = cop.yes;
1266                    } else if (cop.no != null) {
1267                        op = cop.no;
1268                    } else {
1269                        op = cop.next;
1270                    }
1271                }
1272                break;
1273
1274            default:
1275                throw new RuntimeException JavaDoc("Unknown operation type: "+op.type);
1276            } // switch (op.type)
1277
} // while
1278
}
1279
1280    private static final int getPreviousWordType(char[] target, int begin, int end,
1281                                                 int offset, int opts) {
1282        int ret = getWordType(target, begin, end, --offset, opts);
1283        while (ret == WT_IGNORE)
1284            ret = getWordType(target, begin, end, --offset, opts);
1285        return ret;
1286    }
1287
1288    private static final int getWordType(char[] target, int begin, int end,
1289                                         int offset, int opts) {
1290        if (offset < begin || offset >= end) return WT_OTHER;
1291        return getWordType0( target [ offset ] , opts);
1292    }
1293
1294
1295
1296    private static final boolean regionMatches(char[] target, int offset, int limit,
1297                                               String JavaDoc part, int partlen) {
1298        if (offset < 0) return false;
1299        if (limit-offset < partlen)
1300            return false;
1301        int i = 0;
1302        while (partlen-- > 0) {
1303            if ( target [ offset++ ] != part.charAt(i++))
1304                return false;
1305        }
1306        return true;
1307    }
1308
1309    private static final boolean regionMatches(char[] target, int offset, int limit,
1310                                               int offset2, int partlen) {
1311        if (offset < 0) return false;
1312        if (limit-offset < partlen)
1313            return false;
1314        int i = offset2;
1315        while (partlen-- > 0) {
1316            if ( target [ offset++ ] != target [ i++ ] )
1317                return false;
1318        }
1319        return true;
1320    }
1321
1322/**
1323 * @see java.lang.String#regionMatches
1324 */

1325    private static final boolean regionMatchesIgnoreCase(char[] target, int offset, int limit,
1326                                                         String JavaDoc part, int partlen) {
1327        if (offset < 0) return false;
1328        if (limit-offset < partlen)
1329            return false;
1330        int i = 0;
1331        while (partlen-- > 0) {
1332            char ch1 = target [ offset++ ] ;
1333            char ch2 = part.charAt(i++);
1334            if (ch1 == ch2)
1335                continue;
1336            char uch1 = Character.toUpperCase(ch1);
1337            char uch2 = Character.toUpperCase(ch2);
1338            if (uch1 == uch2)
1339                continue;
1340            if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
1341                return false;
1342        }
1343        return true;
1344    }
1345
1346    private static final boolean regionMatchesIgnoreCase(char[] target, int offset, int limit,
1347                                                         int offset2, int partlen) {
1348        if (offset < 0) return false;
1349        if (limit-offset < partlen)
1350            return false;
1351        int i = offset2;
1352        while (partlen-- > 0) {
1353            char ch1 = target [ offset++ ] ;
1354            char ch2 = target [ i++ ] ;
1355            if (ch1 == ch2)
1356                continue;
1357            char uch1 = Character.toUpperCase(ch1);
1358            char uch2 = Character.toUpperCase(ch2);
1359            if (uch1 == uch2)
1360                continue;
1361            if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
1362                return false;
1363        }
1364        return true;
1365    }
1366
1367
1368
1369
1370    /**
1371     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
1372     *
1373     * @return true if the target is matched to this regular expression.
1374     */

1375    public boolean matches(String JavaDoc target) {
1376        return this.matches(target, 0, target .length() , (Match)null);
1377    }
1378
1379    /**
1380     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
1381     * in specified range or not.
1382     *
1383     * @param start Start offset of the range.
1384     * @param end End offset +1 of the range.
1385     * @return true if the target is matched to this regular expression.
1386     */

1387    public boolean matches(String JavaDoc target, int start, int end) {
1388        return this.matches(target, start, end, (Match)null);
1389    }
1390
1391    /**
1392     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
1393     *
1394     * @param match A Match instance for storing matching result.
1395     * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
1396     */

1397    public boolean matches(String JavaDoc target, Match match) {
1398        return this.matches(target, 0, target .length() , match);
1399    }
1400
1401    /**
1402     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
1403     * in specified range or not.
1404     *
1405     * @param start Start offset of the range.
1406     * @param end End offset +1 of the range.
1407     * @param match A Match instance for storing matching result.
1408     * @return true if the target is matched to this regular expression.
1409     */

1410    public boolean matches(String JavaDoc target, int start, int end, Match match) {
1411
1412        synchronized (this) {
1413            if (this.operations == null)
1414                this.prepare();
1415            if (this.context == null)
1416                this.context = new Context();
1417        }
1418        Context con = null;
1419        synchronized (this.context) {
1420            con = this.context.inuse ? new Context() : this.context;
1421            con.reset(target, start, end, this.numberOfClosures);
1422        }
1423        if (match != null) {
1424            match.setNumberOfGroups(this.nofparen);
1425            match.setSource(target);
1426        } else if (this.hasBackReferences) {
1427            match = new Match();
1428            match.setNumberOfGroups(this.nofparen);
1429            // Need not to call setSource() because
1430
// a caller can not access this match instance.
1431
}
1432        con.match = match;
1433
1434        if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
1435            if (DEBUG) {
1436                System.err.println("target string="+target);
1437            }
1438            int matchEnd = this. matchString (con, this.operations, con.start, 1, this.options);
1439            if (DEBUG) {
1440                System.err.println("matchEnd="+matchEnd);
1441                System.err.println("con.limit="+con.limit);
1442            }
1443            if (matchEnd == con.limit) {
1444                if (con.match != null) {
1445                    con.match.setBeginning(0, con.start);
1446                    con.match.setEnd(0, matchEnd);
1447                }
1448                con.inuse = false;
1449                return true;
1450            }
1451            return false;
1452        }
1453
1454        /*
1455         * The pattern has only fixed string.
1456         * The engine uses Boyer-Moore.
1457         */

1458        if (this.fixedStringOnly) {
1459            //System.err.println("DEBUG: fixed-only: "+this.fixedString);
1460
int o = this.fixedStringTable.matches(target, con.start, con.limit);
1461            if (o >= 0) {
1462                if (con.match != null) {
1463                    con.match.setBeginning(0, o);
1464                    con.match.setEnd(0, o+this.fixedString.length());
1465                }
1466                con.inuse = false;
1467                return true;
1468            }
1469            con.inuse = false;
1470            return false;
1471        }
1472
1473        /*
1474         * The pattern contains a fixed string.
1475         * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
1476         * If not, it return with false.
1477         */

1478        if (this.fixedString != null) {
1479            int o = this.fixedStringTable.matches(target, con.start, con.limit);
1480            if (o < 0) {
1481                //System.err.println("Non-match in fixed-string search.");
1482
con.inuse = false;
1483                return false;
1484            }
1485        }
1486
1487        int limit = con.limit-this.minlength;
1488        int matchStart;
1489        int matchEnd = -1;
1490
1491        /*
1492         * Checks whether the expression starts with ".*".
1493         */

1494        if (this.operations != null
1495            && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
1496            if (isSet(this.options, SINGLE_LINE)) {
1497                matchStart = con.start;
1498                matchEnd = this. matchString (con, this.operations, con.start, 1, this.options);
1499            } else {
1500                boolean previousIsEOL = true;
1501                for (matchStart = con.start; matchStart <= limit; matchStart ++) {
1502                    int ch = target .charAt( matchStart ) ;
1503                    if (isEOLChar(ch)) {
1504                        previousIsEOL = true;
1505                    } else {
1506                        if (previousIsEOL) {
1507                            if (0 <= (matchEnd = this. matchString (con, this.operations,
1508                                                                    matchStart, 1, this.options)))
1509                                break;
1510                        }
1511                        previousIsEOL = false;
1512                    }
1513                }
1514            }
1515        }
1516
1517        /*
1518         * Optimization against the first character.
1519         */

1520        else if (this.firstChar != null) {
1521            //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
1522
RangeToken range = this.firstChar;
1523            if (RegularExpression.isSet(this.options, IGNORE_CASE)) {
1524                range = this.firstChar.getCaseInsensitiveToken();
1525                for (matchStart = con.start; matchStart <= limit; matchStart ++) {
1526                    int ch = target .charAt( matchStart ) ;
1527                    if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
1528                        ch = REUtil.composeFromSurrogates(ch, target .charAt( matchStart+1 ) );
1529                        if (!range.match(ch)) continue;
1530                    } else {
1531                        if (!range.match(ch)) {
1532                            char ch1 = Character.toUpperCase((char)ch);
1533                            if (!range.match(ch1))
1534                                if (!range.match(Character.toLowerCase(ch1)))
1535                                    continue;
1536                        }
1537                    }
1538                    if (0 <= (matchEnd = this. matchString (con, this.operations,
1539                                                            matchStart, 1, this.options)))
1540                        break;
1541                }
1542            } else {
1543                for (matchStart = con.start; matchStart <= limit; matchStart ++) {
1544                    int ch = target .charAt( matchStart ) ;
1545                    if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
1546                        ch = REUtil.composeFromSurrogates(ch, target .charAt( matchStart+1 ) );
1547                    if (!range.match(ch)) continue;
1548                    if (0 <= (matchEnd = this. matchString (con, this.operations,
1549                                                            matchStart, 1, this.options)))
1550                        break;
1551                }
1552            }
1553        }
1554
1555        /*
1556         * Straightforward matching.
1557         */

1558        else {
1559            for (matchStart = con.start; matchStart <= limit; matchStart ++) {
1560                if (0 <= (matchEnd = this. matchString (con, this.operations, matchStart, 1, this.options)))
1561                    break;
1562            }
1563        }
1564
1565        if (matchEnd >= 0) {
1566            if (con.match != null) {
1567                con.match.setBeginning(0, matchStart);
1568                con.match.setEnd(0, matchEnd);
1569            }
1570            con.inuse = false;
1571            return true;
1572        } else {
1573            con.inuse = false;
1574            return false;
1575        }
1576    }
1577
1578    /**
1579     * @return -1 when not match; offset of the end of matched string when match.
1580     */

1581    private int matchString (Context con, Op op, int offset, int dx, int opts) {
1582
1583
1584
1585
1586        String JavaDoc target = con.strTarget;
1587
1588
1589
1590
1591        while (true) {
1592            if (op == null)
1593                return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
1594            if (offset > con.limit || offset < con.start)
1595                return -1;
1596            switch (op.type) {
1597            case Op.CHAR:
1598                if (isSet(opts, IGNORE_CASE)) {
1599                    int ch = op.getData();
1600                    if (dx > 0) {
1601                        if (offset >= con.limit || !matchIgnoreCase(ch, target .charAt( offset ) ))
1602                            return -1;
1603                        offset ++;
1604                    } else {
1605                        int o1 = offset-1;
1606                        if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch, target .charAt( o1 ) ))
1607                            return -1;
1608                        offset = o1;
1609                    }
1610                } else {
1611                    int ch = op.getData();
1612                    if (dx > 0) {
1613                        if (offset >= con.limit || ch != target .charAt( offset ) )
1614                            return -1;
1615                        offset ++;
1616                    } else {
1617                        int o1 = offset-1;
1618                        if (o1 >= con.limit || o1 < 0 || ch != target .charAt( o1 ) )
1619                            return -1;
1620                        offset = o1;
1621                    }
1622                }
1623                op = op.next;
1624                break;
1625
1626            case Op.DOT:
1627                if (dx > 0) {
1628                    if (offset >= con.limit)
1629                        return -1;
1630                    int ch = target .charAt( offset ) ;
1631                    if (isSet(opts, SINGLE_LINE)) {
1632                        if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
1633                            offset ++;
1634                    } else {
1635                        if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
1636                            ch = REUtil.composeFromSurrogates(ch, target .charAt( ++offset ) );
1637                        if (isEOLChar(ch))
1638                            return -1;
1639                    }
1640                    offset ++;
1641                } else {
1642                    int o1 = offset-1;
1643                    if (o1 >= con.limit || o1 < 0)
1644                        return -1;
1645                    int ch = target .charAt( o1 ) ;
1646                    if (isSet(opts, SINGLE_LINE)) {
1647                        if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
1648                            o1 --;
1649                    } else {
1650                        if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
1651                            ch = REUtil.composeFromSurrogates( target .charAt( --o1 ) , ch);
1652                        if (!isEOLChar(ch))
1653                            return -1;
1654                    }
1655                    offset = o1;
1656                }
1657                op = op.next;
1658                break;
1659
1660            case Op.RANGE:
1661            case Op.NRANGE:
1662                if (dx > 0) {
1663                    if (offset >= con.limit)
1664                        return -1;
1665                    int ch = target .charAt( offset ) ;
1666                    if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
1667                        ch = REUtil.composeFromSurrogates(ch, target .charAt( ++offset ) );
1668                    RangeToken tok = op.getToken();
1669                    if (isSet(opts, IGNORE_CASE)) {
1670                        tok = tok.getCaseInsensitiveToken();
1671                        if (!tok.match(ch)) {
1672                            if (ch >= 0x10000) return -1;
1673                            char uch;
1674                            if (!tok.match(uch = Character.toUpperCase((char)ch))
1675                                && !tok.match(Character.toLowerCase(uch)))
1676                                return -1;
1677                        }
1678                    } else {
1679                        if (!tok.match(ch)) return -1;
1680                    }
1681                    offset ++;
1682                } else {
1683                    int o1 = offset-1;
1684                    if (o1 >= con.limit || o1 < 0)
1685                        return -1;
1686                    int ch = target .charAt( o1 ) ;
1687                    if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
1688                        ch = REUtil.composeFromSurrogates( target .charAt( --o1 ) , ch);
1689                    RangeToken tok = op.getToken();
1690                    if (isSet(opts, IGNORE_CASE)) {
1691                        tok = tok.getCaseInsensitiveToken();
1692                        if (!tok.match(ch)) {
1693                            if (ch >= 0x10000) return -1;
1694                            char uch;
1695                            if (!tok.match(uch = Character.toUpperCase((char)ch))
1696                                && !tok.match(Character.toLowerCase(uch)))
1697                                return -1;
1698                        }
1699                    } else {
1700                        if (!tok.match(ch)) return -1;
1701                    }
1702                    offset = o1;
1703                }
1704                op = op.next;
1705                break;
1706
1707            case Op.ANCHOR:
1708                boolean go = false;
1709                switch (op.getData()) {
1710                case '^':
1711                    if (isSet(opts, MULTIPLE_LINES)) {
1712                        if (!(offset == con.start
1713                              || offset > con.start && isEOLChar( target .charAt( offset-1 ) )))
1714                            return -1;
1715                    } else {
1716                        if (offset != con.start)
1717                            return -1;
1718                    }
1719                    break;
1720
1721                case '@': // Internal use only.
1722
// The @ always matches line beginnings.
1723
if (!(offset == con.start
1724                          || offset > con.start && isEOLChar( target .charAt( offset-1 ) )))
1725                        return -1;
1726                    break;
1727
1728                case '$':
1729                    if (isSet(opts, MULTIPLE_LINES)) {
1730                        if (!(offset == con.limit
1731                              || offset < con.limit && isEOLChar( target .charAt( offset ) )))
1732                            return -1;
1733                    } else {
1734                        if (!(offset == con.limit
1735                              || offset+1 == con.limit && isEOLChar( target .charAt( offset ) )
1736                              || offset+2 == con.limit && target .charAt( offset ) == CARRIAGE_RETURN
1737                              && target .charAt( offset+1 ) == LINE_FEED))
1738                            return -1;
1739                    }
1740                    break;
1741
1742                case 'A':
1743                    if (offset != con.start) return -1;
1744                    break;
1745
1746                case 'Z':
1747                    if (!(offset == con.limit
1748                          || offset+1 == con.limit && isEOLChar( target .charAt( offset ) )
1749                          || offset+2 == con.limit && target .charAt( offset ) == CARRIAGE_RETURN
1750                          && target .charAt( offset+1 ) == LINE_FEED))
1751                        return -1;
1752                    break;
1753
1754                case 'z':
1755                    if (offset != con.limit) return -1;
1756                    break;
1757
1758                case 'b':
1759                    if (con.length == 0) return -1;
1760                    {
1761                        int after = getWordType(target, con.start, con.limit, offset, opts);
1762                        if (after == WT_IGNORE) return -1;
1763                        int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
1764                        if (after == before) return -1;
1765                    }
1766                    break;
1767
1768                case 'B':
1769                    if (con.length == 0)
1770                        go = true;
1771                    else {
1772                        int after = getWordType(target, con.start, con.limit, offset, opts);
1773                        go = after == WT_IGNORE
1774                             || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
1775                    }
1776                    if (!go) return -1;
1777                    break;
1778
1779                case '<':
1780                    if (con.length == 0 || offset == con.limit) return -1;
1781                    if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
1782                        || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
1783                        return -1;
1784                    break;
1785
1786                case '>':
1787                    if (con.length == 0 || offset == con.start) return -1;
1788                    if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
1789                        || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
1790                        return -1;
1791                    break;
1792                } // switch anchor type
1793
op = op.next;
1794                break;
1795
1796            case Op.BACKREFERENCE:
1797                {
1798                    int refno = op.getData();
1799                    if (refno <= 0 || refno >= this.nofparen)
1800                        throw new RuntimeException JavaDoc("Internal Error: Reference number must be more than zero: "+refno);
1801                    if (con.match.getBeginning(refno) < 0
1802                        || con.match.getEnd(refno) < 0)
1803                        return -1; // ********
1804
int o2 = con.match.getBeginning(refno);
1805                    int literallen = con.match.getEnd(refno)-o2;
1806                    if (!isSet(opts, IGNORE_CASE)) {
1807                        if (dx > 0) {
1808                            if (!regionMatches(target, offset, con.limit, o2, literallen))
1809                                return -1;
1810                            offset += literallen;
1811                        } else {
1812                            if (!regionMatches(target, offset-literallen, con.limit, o2, literallen))
1813                                return -1;
1814                            offset -= literallen;
1815                        }
1816                    } else {
1817                        if (dx > 0) {
1818                            if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen))
1819                                return -1;
1820                            offset += literallen;
1821                        } else {
1822                            if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
1823                                                         o2, literallen))
1824                                return -1;
1825                            offset -= literallen;
1826                        }
1827                    }
1828                }
1829                op = op.next;
1830                break;
1831            case Op.STRING:
1832                {
1833                    String JavaDoc literal = op.getString();
1834                    int literallen = literal.length();
1835                    if (!isSet(opts, IGNORE_CASE)) {
1836                        if (dx > 0) {
1837                            if (!regionMatches(target, offset, con.limit, literal, literallen))
1838                                return -1;
1839                            offset += literallen;
1840                        } else {
1841                            if (!regionMatches(target, offset-literallen, con.limit, literal, literallen))
1842                                return -1;
1843                            offset -= literallen;
1844                        }
1845                    } else {
1846                        if (dx > 0) {
1847                            if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen))
1848                                return -1;
1849                            offset += literallen;
1850                        } else {
1851                            if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
1852                                                         literal, literallen))
1853                                return -1;
1854                            offset -= literallen;
1855                        }
1856                    }
1857                }
1858                op = op.next;
1859                break;
1860
1861            case Op.CLOSURE:
1862                {
1863                    /*
1864                     * Saves current position to avoid
1865                     * zero-width repeats.
1866                     */

1867                    int id = op.getData();
1868                    if (id >= 0) {
1869                        int previousOffset = con.offsets[id];
1870                        if (previousOffset < 0 || previousOffset != offset) {
1871                            con.offsets[id] = offset;
1872                        } else {
1873                            con.offsets[id] = -1;
1874                            op = op.next;
1875                            break;
1876                        }
1877                    }
1878                    int ret = this. matchString (con, op.getChild(), offset, dx, opts);
1879                    if (id >= 0) con.offsets[id] = -1;
1880                    if (ret >= 0) return ret;
1881                    op = op.next;
1882                }
1883                break;
1884
1885            case Op.QUESTION:
1886                {
1887                    int ret = this. matchString (con, op.getChild(), offset, dx, opts);
1888                    if (ret >= 0) return ret;
1889                    op = op.next;
1890                }
1891                break;
1892
1893            case Op.NONGREEDYCLOSURE:
1894            case Op.NONGREEDYQUESTION:
1895                {
1896                    int ret = this. matchString (con, op.next, offset, dx, opts);
1897                    if (ret >= 0) return ret;
1898                    op = op.getChild();
1899                }
1900                break;
1901
1902            case Op.UNION:
1903                for (int i = 0; i < op.size(); i ++) {
1904                    int ret = this. matchString (con, op.elementAt(i), offset, dx, opts);
1905                    if (DEBUG) {
1906                        System.err.println("UNION: "+i+", ret="+ret);
1907                    }
1908                    if (ret >= 0) return ret;
1909                }
1910                return -1;
1911
1912            case Op.CAPTURE:
1913                int refno = op.getData();
1914                if (con.match != null && refno > 0) {
1915                    int save = con.match.getBeginning(refno);
1916                    con.match.setBeginning(refno, offset);
1917                    int ret = this. matchString (con, op.next, offset, dx, opts);
1918                    if (ret < 0) con.match.setBeginning(refno, save);
1919                    return ret;
1920                } else if (con.match != null && refno < 0) {
1921                    int index = -refno;
1922                    int save = con.match.getEnd(index);
1923                    con.match.setEnd(index, offset);
1924                    int ret = this. matchString (con, op.next, offset, dx, opts);
1925                    if (ret < 0) con.match.setEnd(index, save);
1926                    return ret;
1927                }
1928                op = op.next;
1929                break;
1930
1931            case Op.LOOKAHEAD:
1932                if (0 > this. matchString (con, op.getChild(), offset, 1, opts)) return -1;
1933                op = op.next;
1934                break;
1935            case Op.NEGATIVELOOKAHEAD:
1936                if (0 <= this. matchString (con, op.getChild(), offset, 1, opts)) return -1;
1937                op = op.next;
1938                break;
1939            case Op.LOOKBEHIND:
1940                if (0 > this. matchString (con, op.getChild(), offset, -1, opts)) return -1;
1941                op = op.next;
1942                break;
1943            case Op.NEGATIVELOOKBEHIND:
1944                if (0 <= this. matchString (con, op.getChild(), offset, -1, opts)) return -1;
1945                op = op.next;
1946                break;
1947
1948            case Op.INDEPENDENT:
1949                {
1950                    int ret = this. matchString (con, op.getChild(), offset, dx, opts);
1951                    if (ret < 0) return ret;
1952                    offset = ret;
1953                    op = op.next;
1954                }
1955                break;
1956
1957            case Op.MODIFIER:
1958                {
1959                    int localopts = opts;
1960                    localopts |= op.getData();
1961                    localopts &= ~op.getData2();
1962                    //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16));
1963
int ret = this. matchString (con, op.getChild(), offset, dx, localopts);
1964                    if (ret < 0) return ret;
1965                    offset = ret;
1966                    op = op.next;
1967                }
1968                break;
1969
1970            case Op.CONDITION:
1971                {
1972                    Op.ConditionOp cop = (Op.ConditionOp)op;
1973                    boolean matchp = false;
1974                    if (cop.refNumber > 0) {
1975                        if (cop.refNumber >= this.nofparen)
1976                            throw new RuntimeException JavaDoc("Internal Error: Reference number must be more than zero: "+cop.refNumber);
1977                        matchp = con.match.getBeginning(cop.refNumber) >= 0
1978                                 && con.match.getEnd(cop.refNumber) >= 0;
1979                    } else {
1980                        matchp = 0 <= this. matchString (con, cop.condition, offset, dx, opts);
1981                    }
1982
1983                    if (matchp) {
1984                        op = cop.yes;
1985                    } else if (cop.no != null) {
1986                        op = cop.no;
1987                    } else {
1988                        op = cop.next;
1989                    }
1990                }
1991                break;
1992
1993            default:
1994                throw new RuntimeException JavaDoc("Unknown operation type: "+op.type);
1995            } // switch (op.type)
1996
} // while
1997
}
1998
1999    private static final int getPreviousWordType(String JavaDoc target, int begin, int end,
2000                                                 int offset, int opts) {
2001        int ret = getWordType(target, begin, end, --offset, opts);
2002        while (ret == WT_IGNORE)
2003            ret = getWordType(target, begin, end, --offset, opts);
2004        return ret;
2005    }
2006
2007    private static final int getWordType(String JavaDoc target, int begin, int end,
2008                                         int offset, int opts) {
2009        if (offset < begin || offset >= end) return WT_OTHER;
2010        return getWordType0( target .charAt( offset ) , opts);
2011    }
2012
2013
2014    private static final boolean regionMatches(String JavaDoc text, int offset, int limit,
2015                                               String JavaDoc part, int partlen) {
2016        if (limit-offset < partlen) return false;
2017        return text.regionMatches(offset, part, 0, partlen);
2018    }
2019
2020    private static final boolean regionMatches(String JavaDoc text, int offset, int limit,
2021                                               int offset2, int partlen) {
2022        if (limit-offset < partlen) return false;
2023        return text.regionMatches(offset, text, offset2, partlen);
2024    }
2025
2026    private static final boolean regionMatchesIgnoreCase(String JavaDoc text, int offset, int limit,
2027                                                         String JavaDoc part, int partlen) {
2028        return text.regionMatches(true, offset, part, 0, partlen);
2029    }
2030
2031    private static final boolean regionMatchesIgnoreCase(String JavaDoc text, int offset, int limit,
2032                                                         int offset2, int partlen) {
2033        if (limit-offset < partlen) return false;
2034        return text.regionMatches(true, offset, text, offset2, partlen);
2035    }
2036
2037
2038
2039
2040
2041
2042
2043    /**
2044     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
2045     *
2046     * @return true if the target is matched to this regular expression.
2047     */

2048    public boolean matches(CharacterIterator JavaDoc target) {
2049        return this.matches(target, (Match)null);
2050    }
2051
2052
2053    /**
2054     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
2055     *
2056     * @param match A Match instance for storing matching result.
2057     * @return true if the target is matched to this regular expression.
2058     */

2059    public boolean matches(CharacterIterator JavaDoc target, Match match) {
2060        int start = target.getBeginIndex();
2061        int end = target.getEndIndex();
2062
2063
2064
2065        synchronized (this) {
2066            if (this.operations == null)
2067                this.prepare();
2068            if (this.context == null)
2069                this.context = new Context();
2070        }
2071        Context con = null;
2072        synchronized (this.context) {
2073            con = this.context.inuse ? new Context() : this.context;
2074            con.reset(target, start, end, this.numberOfClosures);
2075        }
2076        if (match != null) {
2077            match.setNumberOfGroups(this.nofparen);
2078            match.setSource(target);
2079        } else if (this.hasBackReferences) {
2080            match = new Match();
2081            match.setNumberOfGroups(this.nofparen);
2082            // Need not to call setSource() because
2083
// a caller can not access this match instance.
2084
}
2085        con.match = match;
2086
2087        if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
2088            int matchEnd = this. matchCharacterIterator (con, this.operations, con.start, 1, this.options);
2089            //System.err.println("DEBUG: matchEnd="+matchEnd);
2090
if (matchEnd == con.limit) {
2091                if (con.match != null) {
2092                    con.match.setBeginning(0, con.start);
2093                    con.match.setEnd(0, matchEnd);
2094                }
2095                con.inuse = false;
2096                return true;
2097            }
2098            return false;
2099        }
2100
2101        /*
2102         * The pattern has only fixed string.
2103         * The engine uses Boyer-Moore.
2104         */

2105        if (this.fixedStringOnly) {
2106            //System.err.println("DEBUG: fixed-only: "+this.fixedString);
2107
int o = this.fixedStringTable.matches(target, con.start, con.limit);
2108            if (o >= 0) {
2109                if (con.match != null) {
2110                    con.match.setBeginning(0, o);
2111                    con.match.setEnd(0, o+this.fixedString.length());
2112                }
2113                con.inuse = false;
2114                return true;
2115            }
2116            con.inuse = false;
2117            return false;
2118        }
2119
2120        /*
2121         * The pattern contains a fixed string.
2122         * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
2123         * If not, it return with false.
2124         */

2125        if (this.fixedString != null) {
2126            int o = this.fixedStringTable.matches(target, con.start, con.limit);
2127            if (o < 0) {
2128                //System.err.println("Non-match in fixed-string search.");
2129
con.inuse = false;
2130                return false;
2131            }
2132        }
2133
2134        int limit = con.limit-this.minlength;
2135        int matchStart;
2136        int matchEnd = -1;
2137
2138        /*
2139         * Checks whether the expression starts with ".*".
2140         */

2141        if (this.operations != null
2142            && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
2143            if (isSet(this.options, SINGLE_LINE)) {
2144                matchStart = con.start;
2145                matchEnd = this. matchCharacterIterator (con, this.operations, con.start, 1, this.options);
2146            } else {
2147                boolean previousIsEOL = true;
2148                for (matchStart = con.start; matchStart <= limit; matchStart ++) {
2149                    int ch = target .setIndex( matchStart ) ;
2150                    if (isEOLChar(ch)) {
2151                        previousIsEOL = true;
2152                    } else {
2153                        if (previousIsEOL) {
2154                            if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations,
2155                                                                               matchStart, 1, this.options)))
2156                                break;
2157                        }
2158                        previousIsEOL = false;
2159                    }
2160                }
2161            }
2162        }
2163
2164        /*
2165         * Optimization against the first character.
2166         */

2167        else if (this.firstChar != null) {
2168            //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
2169
RangeToken range = this.firstChar;
2170            if (RegularExpression.isSet(this.options, IGNORE_CASE)) {
2171                range = this.firstChar.getCaseInsensitiveToken();
2172                for (matchStart = con.start; matchStart <= limit; matchStart ++) {
2173                    int ch = target .setIndex( matchStart ) ;
2174                    if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
2175                        ch = REUtil.composeFromSurrogates(ch, target .setIndex( matchStart+1 ) );
2176                        if (!range.match(ch)) continue;
2177                    } else {
2178                        if (!range.match(ch)) {
2179                            char ch1 = Character.toUpperCase((char)ch);
2180                            if (!range.match(ch1))
2181                                if (!range.match(Character.toLowerCase(ch1)))
2182                                    continue;
2183                        }
2184                    }
2185                    if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations,
2186                                                                       matchStart, 1, this.options)))
2187                        break;
2188                }
2189            } else {
2190                for (matchStart = con.start; matchStart <= limit; matchStart ++) {
2191                    int ch = target .setIndex( matchStart ) ;
2192                    if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
2193                        ch = REUtil.composeFromSurrogates(ch, target .setIndex( matchStart+1 ) );
2194                    if (!range.match(ch)) continue;
2195                    if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations,
2196                                                                       matchStart, 1, this.options)))
2197                        break;
2198                }
2199            }
2200        }
2201
2202        /*
2203         * Straightforward matching.
2204         */

2205        else {
2206            for (matchStart = con.start; matchStart <= limit; matchStart ++) {
2207                if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations, matchStart, 1, this.options)))
2208                    break;
2209            }
2210        }
2211
2212        if (matchEnd >= 0) {
2213            if (con.match != null) {
2214                con.match.setBeginning(0, matchStart);
2215                con.match.setEnd(0, matchEnd);
2216            }
2217            con.inuse = false;
2218            return true;
2219        } else {
2220            con.inuse = false;
2221            return false;
2222        }
2223    }
2224
2225    /**
2226     * @return -1 when not match; offset of the end of matched string when match.
2227     */

2228    private int matchCharacterIterator (Context con, Op op, int offset, int dx, int opts) {
2229
2230
2231        CharacterIterator JavaDoc target = con.ciTarget;
2232
2233
2234
2235
2236
2237
2238        while (true) {
2239            if (op == null)
2240                return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
2241            if (offset > con.limit || offset < con.start)
2242                return -1;
2243            switch (op.type) {
2244            case Op.CHAR:
2245                if (isSet(opts, IGNORE_CASE)) {
2246                    int ch = op.getData();
2247                    if (dx > 0) {
2248                        if (offset >= con.limit || !matchIgnoreCase(ch, target .setIndex( offset ) ))
2249                            return -1;
2250                        offset ++;
2251                    } else {
2252                        int o1 = offset-1;
2253                        if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch, target .setIndex( o1 ) ))
2254                            return -1;
2255                        offset = o1;
2256                    }
2257                } else {
2258                    int ch = op.getData();
2259                    if (dx > 0) {
2260                        if (offset >= con.limit || ch != target .setIndex( offset ) )
2261                            return -1;
2262                        offset ++;
2263                    } else {
2264                        int o1 = offset-1;
2265                        if (o1 >= con.limit || o1 < 0 || ch != target .setIndex( o1 ) )
2266                            return -1;
2267                        offset = o1;
2268                    }
2269                }
2270                op = op.next;
2271                break;
2272
2273            case Op.DOT:
2274                if (dx > 0) {
2275                    if (offset >= con.limit)
2276                        return -1;
2277                    int ch = target .setIndex( offset ) ;
2278                    if (isSet(opts, SINGLE_LINE)) {
2279                        if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
2280                            offset ++;
2281                    } else {
2282                        if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
2283                            ch = REUtil.composeFromSurrogates(ch, target .setIndex( ++offset ) );
2284                        if (isEOLChar(ch))
2285                            return -1;
2286                    }
2287                    offset ++;
2288                } else {
2289                    int o1 = offset-1;
2290                    if (o1 >= con.limit || o1 < 0)
2291                        return -1;
2292                    int ch = target .setIndex( o1 ) ;
2293                    if (isSet(opts, SINGLE_LINE)) {
2294                        if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
2295                            o1 --;
2296                    } else {
2297                        if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
2298                            ch = REUtil.composeFromSurrogates( target .setIndex( --o1 ) , ch);
2299                        if (!isEOLChar(ch))
2300                            return -1;
2301                    }
2302                    offset = o1;
2303                }
2304                op = op.next;
2305                break;
2306
2307            case Op.RANGE:
2308            case Op.NRANGE:
2309                if (dx > 0) {
2310                    if (offset >= con.limit)
2311                        return -1;
2312                    int ch = target .setIndex( offset ) ;
2313                    if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
2314                        ch = REUtil.composeFromSurrogates(ch, target .setIndex( ++offset ) );
2315                    RangeToken tok = op.getToken();
2316                    if (isSet(opts, IGNORE_CASE)) {
2317                        tok = tok.getCaseInsensitiveToken();
2318                        if (!tok.match(ch)) {
2319                            if (ch >= 0x10000) return -1;
2320                            char uch;
2321                            if (!tok.match(uch = Character.toUpperCase((char)ch))
2322                                && !tok.match(Character.toLowerCase(uch)))
2323                                return -1;
2324                        }
2325                    } else {
2326                        if (!tok.match(ch)) return -1;
2327                    }
2328                    offset ++;
2329                } else {
2330                    int o1 = offset-1;
2331                    if (o1 >= con.limit || o1 < 0)
2332                        return -1;
2333                    int ch = target .setIndex( o1 ) ;
2334                    if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
2335                        ch = REUtil.composeFromSurrogates( target .setIndex( --o1 ) , ch);
2336                    RangeToken tok = op.getToken();
2337                    if (isSet(opts, IGNORE_CASE)) {
2338                        tok = tok.getCaseInsensitiveToken();
2339                        if (!tok.match(ch)) {
2340                            if (ch >= 0x10000) return -1;
2341                            char uch;
2342                            if (!tok.match(uch = Character.toUpperCase((char)ch))
2343                                && !tok.match(Character.toLowerCase(uch)))
2344                                return -1;
2345                        }
2346                    } else {
2347                        if (!tok.match(ch)) return -1;
2348                    }
2349                    offset = o1;
2350                }
2351                op = op.next;
2352                break;
2353
2354            case Op.ANCHOR:
2355                boolean go = false;
2356                switch (op.getData()) {
2357                case '^':
2358                    if (isSet(opts, MULTIPLE_LINES)) {
2359                        if (!(offset == con.start
2360                              || offset > con.start && isEOLChar( target .setIndex( offset-1 ) )))
2361                            return -1;
2362                    } else {
2363                        if (offset != con.start)
2364                            return -1;
2365                    }
2366                    break;
2367
2368                case '@': // Internal use only.
2369
// The @ always matches line beginnings.
2370
if (!(offset == con.start
2371                          || offset > con.start && isEOLChar( target .setIndex( offset-1 ) )))
2372                        return -1;
2373                    break;
2374
2375                case '$':
2376                    if (isSet(opts, MULTIPLE_LINES)) {
2377                        if (!(offset == con.limit
2378                              || offset < con.limit && isEOLChar( target .setIndex( offset ) )))
2379                            return -1;
2380                    } else {
2381                        if (!(offset == con.limit
2382                              || offset+1 == con.limit && isEOLChar( target .setIndex( offset ) )
2383                              || offset+2 == con.limit && target .setIndex( offset ) == CARRIAGE_RETURN
2384                              && target .setIndex( offset+1 ) == LINE_FEED))
2385                            return -1;
2386                    }
2387                    break;
2388
2389                case 'A':
2390                    if (offset != con.start) return -1;
2391                    break;
2392
2393                case 'Z':
2394                    if (!(offset == con.limit
2395                          || offset+1 == con.limit && isEOLChar( target .setIndex( offset ) )
2396                          || offset+2 == con.limit && target .setIndex( offset ) == CARRIAGE_RETURN
2397                          && target .setIndex( offset+1 ) == LINE_FEED))
2398                        return -1;
2399                    break;
2400
2401                case 'z':
2402                    if (offset != con.limit) return -1;
2403                    break;
2404
2405                case 'b':
2406                    if (con.length == 0) return -1;
2407                    {
2408                        int after = getWordType(target, con.start, con.limit, offset, opts);
2409                        if (after == WT_IGNORE) return -1;
2410                        int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
2411                        if (after == before) return -1;
2412                    }
2413                    break;
2414
2415                case 'B':
2416                    if (con.length == 0)
2417                        go = true;
2418                    else {
2419                        int after = getWordType(target, con.start, con.limit, offset, opts);
2420                        go = after == WT_IGNORE
2421                             || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
2422                    }
2423                    if (!go) return -1;
2424                    break;
2425
2426                case '<':
2427                    if (con.length == 0 || offset == con.limit) return -1;
2428                    if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
2429                        || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
2430                        return -1;
2431                    break;
2432
2433                case '>':
2434                    if (con.length == 0 || offset == con.start) return -1;
2435                    if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
2436                        || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
2437                        return -1;
2438                    break;
2439                } // switch anchor type
2440
op = op.next;
2441                break;
2442
2443            case Op.BACKREFERENCE:
2444                {
2445                    int refno = op.getData();
2446                    if (refno <= 0 || refno >= this.nofparen)
2447                        throw new RuntimeException JavaDoc("Internal Error: Reference number must be more than zero: "+refno);
2448                    if (con.match.getBeginning(refno) < 0
2449                        || con.match.getEnd(refno) < 0)
2450                        return -1; // ********
2451
int o2 = con.match.getBeginning(refno);
2452                    int literallen = con.match.getEnd(refno)-o2;
2453                    if (!isSet(opts, IGNORE_CASE)) {
2454                        if (dx > 0) {
2455                            if (!regionMatches(target, offset, con.limit, o2, literallen))
2456                                return -1;
2457                            offset += literallen;
2458                        } else {
2459                            if (!regionMatches(target, offset-literallen, con.limit, o2, literallen))
2460                                return -1;
2461                            offset -= literallen;
2462                        }
2463                    } else {
2464                        if (dx > 0) {
2465                            if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen))
2466                                return -1;
2467                            offset += literallen;
2468                        } else {
2469                            if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
2470                                                         o2, literallen))
2471                                return -1;
2472                            offset -= literallen;
2473                        }
2474                    }
2475                }
2476                op = op.next;
2477                break;
2478            case Op.STRING:
2479                {
2480                    String JavaDoc literal = op.getString();
2481                    int literallen = literal.length();
2482                    if (!isSet(opts, IGNORE_CASE)) {
2483                        if (dx > 0) {
2484                            if (!regionMatches(target, offset, con.limit, literal, literallen))
2485                                return -1;
2486                            offset += literallen;
2487                        } else {
2488                            if (!regionMatches(target, offset-literallen, con.limit, literal, literallen))
2489                                return -1;
2490                            offset -= literallen;
2491                        }
2492                    } else {
2493                        if (dx > 0) {
2494                            if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen))
2495                                return -1;
2496                            offset += literallen;
2497                        } else {
2498                            if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
2499                                                         literal, literallen))
2500                                return -1;
2501                            offset -= literallen;
2502                        }
2503                    }
2504                }
2505                op = op.next;
2506                break;
2507
2508            case Op.CLOSURE:
2509                {
2510                    /*
2511                     * Saves current position to avoid
2512                     * zero-width repeats.
2513                     */

2514                    int id = op.getData();
2515                    if (id >= 0) {
2516                        int previousOffset = con.offsets[id];
2517                        if (previousOffset < 0 || previousOffset != offset) {
2518                            con.offsets[id] = offset;
2519                        } else {
2520                            con.offsets[id] = -1;
2521                            op = op.next;
2522                            break;
2523                        }
2524                    }
2525                    
2526                    int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts);
2527                    if (id >= 0) con.offsets[id] = -1;
2528                    if (ret >= 0) return ret;
2529                    op = op.next;
2530                }
2531                break;
2532
2533            case Op.QUESTION:
2534                {
2535                    int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts);
2536                    if (ret >= 0) return ret;
2537                    op = op.next;
2538                }
2539                break;
2540
2541            case Op.NONGREEDYCLOSURE:
2542            case Op.NONGREEDYQUESTION:
2543                {
2544                    int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts);
2545                    if (ret >= 0) return ret;
2546                    op = op.getChild();
2547                }
2548                break;
2549
2550            case Op.UNION:
2551                for (int i = 0; i < op.size(); i ++) {
2552                    int ret = this. matchCharacterIterator (con, op.elementAt(i), offset, dx, opts);
2553                    if (DEBUG) {
2554                        System.err.println("UNION: "+i+", ret="+ret);
2555                    }
2556                    if (ret >= 0) return ret;
2557                }
2558                return -1;
2559
2560            case Op.CAPTURE:
2561                int refno = op.getData();
2562                if (con.match != null && refno > 0) {
2563                    int save = con.match.getBeginning(refno);
2564                    con.match.setBeginning(refno, offset);
2565                    int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts);
2566                    if (ret < 0) con.match.setBeginning(refno, save);
2567                    return ret;
2568                } else if (con.match != null && refno < 0) {
2569                    int index = -refno;
2570                    int save = con.match.getEnd(index);
2571                    con.match.setEnd(index, offset);
2572                    int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts);
2573                    if (ret < 0) con.match.setEnd(index, save);
2574                    return ret;
2575                }
2576                op = op.next;
2577                break;
2578
2579            case Op.LOOKAHEAD:
2580                if (0 > this. matchCharacterIterator (con, op.getChild(), offset, 1, opts)) return -1;
2581                op = op.next;
2582                break;
2583            case Op.NEGATIVELOOKAHEAD:
2584                if (0 <= this. matchCharacterIterator (con, op.getChild(), offset, 1, opts)) return -1;
2585                op = op.next;
2586                break;
2587            case Op.LOOKBEHIND:
2588                if (0 > this. matchCharacterIterator (con, op.getChild(), offset, -1, opts)) return -1;
2589                op = op.next;
2590                break;
2591            case Op.NEGATIVELOOKBEHIND:
2592                if (0 <= this. matchCharacterIterator (con, op.getChild(), offset, -1, opts)) return -1;
2593                op = op.next;
2594                break;
2595
2596            case Op.INDEPENDENT:
2597                {
2598                    int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts);
2599                    if (ret < 0) return ret;
2600                    offset = ret;
2601                    op = op.next;
2602                }
2603                break;
2604
2605            case Op.MODIFIER:
2606                {
2607                    int localopts = opts;
2608                    localopts |= op.getData();
2609                    localopts &= ~op.getData2();
2610                    //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16));
2611
int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, localopts);
2612                    if (ret < 0) return ret;
2613                    offset = ret;
2614                    op = op.next;
2615                }
2616                break;
2617
2618            case Op.CONDITION:
2619                {
2620                    Op.ConditionOp cop = (Op.ConditionOp)op;
2621                    boolean matchp = false;
2622                    if (cop.refNumber > 0) {
2623                        if (cop.refNumber >= this.nofparen)
2624                            throw new RuntimeException JavaDoc("Internal Error: Reference number must be more than zero: "+cop.refNumber);
2625                        matchp = con.match.getBeginning(cop.refNumber) >= 0
2626                                 && con.match.getEnd(cop.refNumber) >= 0;
2627                    } else {
2628                        matchp = 0 <= this. matchCharacterIterator (con, cop.condition, offset, dx, opts);
2629                    }
2630
2631                    if (matchp) {
2632                        op = cop.yes;
2633                    } else if (cop.no != null) {
2634                        op = cop.no;
2635                    } else {
2636                        op = cop.next;
2637                    }
2638                }
2639                break;
2640
2641            default:
2642                throw new RuntimeException JavaDoc("Unknown operation type: "+op.type);
2643            } // switch (op.type)
2644
} // while
2645
}
2646
2647    private static final int getPreviousWordType(CharacterIterator JavaDoc target, int begin, int end,
2648                                                 int offset, int opts) {
2649        int ret = getWordType(target, begin, end, --offset, opts);
2650        while (ret == WT_IGNORE)
2651            ret = getWordType(target, begin, end, --offset, opts);
2652        return ret;
2653    }
2654
2655    private static final int getWordType(CharacterIterator JavaDoc target, int begin, int end,
2656                                         int offset, int opts) {
2657        if (offset < begin || offset >= end) return WT_OTHER;
2658        return getWordType0( target .setIndex( offset ) , opts);
2659    }
2660
2661
2662
2663    private static final boolean regionMatches(CharacterIterator JavaDoc target, int offset, int limit,
2664                                               String JavaDoc part, int partlen) {
2665        if (offset < 0) return false;
2666        if (limit-offset < partlen)
2667            return false;
2668        int i = 0;
2669        while (partlen-- > 0) {
2670            if ( target .setIndex( offset++ ) != part.charAt(i++))
2671                return false;
2672        }
2673        return true;
2674    }
2675
2676    private static final boolean regionMatches(CharacterIterator JavaDoc target, int offset, int limit,
2677                                               int offset2, int partlen) {
2678        if (offset < 0) return false;
2679        if (limit-offset < partlen)
2680            return false;
2681        int i = offset2;
2682        while (partlen-- > 0) {
2683            if ( target .setIndex( offset++ ) != target .setIndex( i++ ) )
2684                return false;
2685        }
2686        return true;
2687    }
2688
2689    /**
2690     * @see java.lang.String#regionMatches
2691     */

2692    private static final boolean regionMatchesIgnoreCase(CharacterIterator JavaDoc target, int offset, int limit,
2693                                                         String JavaDoc part, int partlen) {
2694        if (offset < 0) return false;
2695        if (limit-offset < partlen)
2696            return false;
2697        int i = 0;
2698        while (partlen-- > 0) {
2699            char ch1 = target .setIndex( offset++ ) ;
2700            char ch2 = part.charAt(i++);
2701            if (ch1 == ch2)
2702                continue;
2703            char uch1 = Character.toUpperCase(ch1);
2704            char uch2 = Character.toUpperCase(ch2);
2705            if (uch1 == uch2)
2706                continue;
2707            if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
2708                return false;
2709        }
2710        return true;
2711    }
2712
2713    private static final boolean regionMatchesIgnoreCase(CharacterIterator JavaDoc target, int offset, int limit,
2714                                                         int offset2, int partlen) {
2715        if (offset < 0) return false;
2716        if (limit-offset < partlen)
2717            return false;
2718        int i = offset2;
2719        while (partlen-- > 0) {
2720            char ch1 = target .setIndex( offset++ ) ;
2721            char ch2 = target .setIndex( i++ ) ;
2722            if (ch1 == ch2)
2723                continue;
2724            char uch1 = Character.toUpperCase(ch1);
2725            char uch2 = Character.toUpperCase(ch2);
2726            if (uch1 == uch2)
2727                continue;
2728            if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
2729                return false;
2730        }
2731        return true;
2732    }
2733
2734
2735
2736
2737    // ================================================================
2738

2739    /**
2740     * A regular expression.
2741     * @serial
2742     */

2743    String JavaDoc regex;
2744    /**
     * Option flags for this expression; individual flags are tested with isSet().
2745     * @serial
2746     */

2747    int options;
2748
2749    /**
2750     * The number of parentheses in the regular expression.
2751     * @serial
2752     */

2753    int nofparen;
2754    /**
2755     * Internal representation of the regular expression.
2756     * @serial
2757     */

2758    Token tokentree;
2759
    // When true, the CharacterIterator matches() overload allocates a private
    // Match even if the caller passed none. Where this flag is set is not
    // visible in this part of the file.
2760    boolean hasBackReferences = false;
2761
    // Assigned by prepare() from tokentree.getMinLength(); matches() subtracts it
    // from the limit to find the last start position worth trying.
2762    transient int minlength;
    // Head of the operation chain executed by the matcher. matches() calls
    // prepare() while this is null (presumably compile() assigns it - confirm).
2763    transient Op operations = null;
    // Passed to Context.reset() as the size of the per-closure offsets table.
2764    transient int numberOfClosures;
    // Context created lazily by matches() and reused when it is not marked in use.
2765    transient Context context = null;
    // Assigned by prepare() when analyzeFirstCharacter() reports FC_TERMINAL;
    // matches() uses it to skip start positions whose character is not in the range.
2766    transient RangeToken firstChar = null;
2767
    // Literal string searched for with fixedStringTable before or instead of
    // running the operation chain.
2768    transient String JavaDoc fixedString = null;
    // prepare() copies the expression's options here when the pattern is only a
    // fixed string.
2769    transient int fixedStringOptions;
    // Search table built by prepare() for fixedString.
2770    transient BMPattern fixedStringTable = null;
    // Set by prepare() when the whole pattern is a single STRING or CHAR operation.
2771    transient boolean fixedStringOnly = false;
2772
2773
2774    static final class Context {
2775        CharacterIterator JavaDoc ciTarget;
2776        String JavaDoc strTarget;
2777        char[] charTarget;
2778        int start;
2779        int limit;
2780        int length;
2781        Match match;
2782        boolean inuse = false;
2783        int[] offsets;
2784
2785        Context() {
2786        }
2787
2788        private void resetCommon(int nofclosures) {
2789            this.length = this.limit-this.start;
2790            this.inuse = true;
2791            this.match = null;
2792            if (this.offsets == null || this.offsets.length != nofclosures)
2793                this.offsets = new int[nofclosures];
2794            for (int i = 0; i < nofclosures; i ++) this.offsets[i] = -1;
2795        }
2796        void reset(CharacterIterator JavaDoc target, int start, int limit, int nofclosures) {
2797            this.ciTarget = target;
2798            this.start = start;
2799            this.limit = limit;
2800            this.resetCommon(nofclosures);
2801        }
2802        void reset(String JavaDoc target, int start, int limit, int nofclosures) {
2803            this.strTarget = target;
2804            this.start = start;
2805            this.limit = limit;
2806            this.resetCommon(nofclosures);
2807        }
2808        void reset(char[] target, int start, int limit, int nofclosures) {
2809            this.charTarget = target;
2810            this.start = start;
2811            this.limit = limit;
2812            this.resetCommon(nofclosures);
2813        }
2814    }
2815
2816    /**
2817     * Prepares for matching. This method is called just before starting matching.
2818     */

2819    void prepare() {
2820        if (Op.COUNT) Op.nofinstances = 0;
2821        this.compile(this.tokentree);
2822        /*
2823        if (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { // .*
2824            Op anchor = Op.createAnchor(isSet(this.options, SINGLE_LINE) ? 'A' : '@');
2825            anchor.next = this.operations;
2826            this.operations = anchor;
2827        }
2828        */

2829        if (Op.COUNT) System.err.println("DEBUG: The number of operations: "+Op.nofinstances);
2830
2831        this.minlength = this.tokentree.getMinLength();
2832
2833        this.firstChar = null;
2834        if (!isSet(this.options, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION)
2835            && !isSet(this.options, XMLSCHEMA_MODE)) {
2836            RangeToken firstChar = Token.createRange();
2837            int fresult = this.tokentree.analyzeFirstCharacter(firstChar, this.options);
2838            if (fresult == Token.FC_TERMINAL) {
2839                firstChar.compactRanges();
2840                this.firstChar = firstChar;
2841                if (DEBUG)
2842                    System.err.println("DEBUG: Use the first character optimization: "+firstChar);
2843            }
2844        }
2845
2846        if (this.operations != null
2847            && (this.operations.type == Op.STRING || this.operations.type == Op.CHAR)
2848            && this.operations.next == null) {
2849            if (DEBUG)
2850                System.err.print(" *** Only fixed string! *** ");
2851            this.fixedStringOnly = true;
2852            if (this.operations.type == Op.STRING)
2853                this.fixedString = this.operations.getString();
2854            else if (this.operations.getData() >= 0x10000) { // Op.CHAR
2855
this.fixedString = REUtil.decomposeToSurrogates(this.operations.getData());
2856            } else {
2857                char[] ac = new char[1];
2858                ac[0] = (char)this.operations.getData();
2859                this.fixedString = new String JavaDoc(ac);
2860            }
2861            this.fixedStringOptions = this.options;
2862            this.fixedStringTable = new BMPattern(this.fixedString, 256,
2863                                                  isSet(this.fixedStringOptions, IGNORE_CASE));
2864        } else if (!isSet(this.options, PROHIBIT_FIXED_STRING_OPTIMIZATION)
2865                   && !isSet(this.options, XMLSCHEMA_MODE)) {
2866            Token.FixedStringContainer container = new Token.FixedStringContainer();
2867            this.tokentree.findFixedString(container, this.options);
2868            this.fixedString = container.token == null ? null : container.token.getString();
2869            this.fixedStringOptions = container.options;
2870            if (this.fixedString != null && this.fixedString.length() < 2)
2871                this.fixedString = null;
2872            // This pattern has a fixed string of which length is more than one.
2873
if (this.fixedString != null) {
2874                this.fixedStringTable = new BMPattern(this.fixedString, 256,
2875                                                      isSet(this.fixedStringOptions, IGNORE_CASE));
2876                if (DEBUG) {
2877                    System.err.println("DEBUG: The longest fixed string: "+this.fixedString.length()
2878                                       +"/" //+this.fixedString
2879
+"/"+REUtil.createOptionString(this.fixedStringOptions));
2880                    System.err.print("String: ");
2881                    REUtil.dumpString(this.fixedString);
2882                }
2883            }
2884        }
2885    }
2886
/**
 * An option. NOTE: this option is currently disabled; the MARK_PARENS
 * declaration below is deliberately kept inside this comment.
 * If you specify this option, <span class="REGEX"><kbd>(</kbd><var>X</var><kbd>)</kbd></span>
 * captures matched text, and <span class="REGEX"><kbd>(?:</kbd><var>X</var><kbd>)</kbd></span>
 * does not capture.
 *
 * @see #RegularExpression(java.lang.String,int)
 * @see #setPattern(java.lang.String,int)
static final int MARK_PARENS = 1<<0;
 */

2897
2898    /**
2899     * "i"
2900     */

2901    static final int IGNORE_CASE = 1<<1;
2902
2903    /**
2904     * "s"
2905     */

2906    static final int SINGLE_LINE = 1<<2;
2907
2908    /**
2909     * "m"
2910     */

2911    static final int MULTIPLE_LINES = 1<<3;
2912
2913    /**
2914     * "x"
2915     */

2916    static final int EXTENDED_COMMENT = 1<<4;
2917
2918    /**
2919     * This option redefines <span class="REGEX"><kbd>\d \D \w \W \s \S</kbd></span>.
2920     *
2921     * @see #RegularExpression(java.lang.String,int)
2922     * @see #setPattern(java.lang.String,int)
2923     * @see #UNICODE_WORD_BOUNDARY
2924     */

2925    static final int USE_UNICODE_CATEGORY = 1<<5; // "u"
2926

2927    /**
2928     * An option.
2929     * This enables to process locale-independent word boundary for <span class="REGEX"><kbd>\b \B \&lt; \></kbd></span>.
2930     * <p>By default, the engine considers a position between a word character
2931     * (<span class="REGEX"><Kbd>\w</kbd></span>) and a non word character
2932     * is a word boundary.
2933     * <p>By this option, the engine checks word boundaries with the method of
2934     * 'Unicode Regular Expression Guidelines' Revision 4.
2935     *
2936     * @see #RegularExpression(java.lang.String,int)
2937     * @see #setPattern(java.lang.String,int)
2938     */

2939    static final int UNICODE_WORD_BOUNDARY = 1<<6; // "w"
2940

2941    /**
2942     * "H"
2943     */

2944    static final int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 1<<7;
2945    /**
2946     * "F"
2947     */

2948    static final int PROHIBIT_FIXED_STRING_OPTIMIZATION = 1<<8;
2949    /**
2950     * "X". XML Schema mode.
2951     */

2952    static final int XMLSCHEMA_MODE = 1<<9;
2953    /**
2954     * ",".
2955     */

2956    static final int SPECIAL_COMMA = 1<<10;
2957
2958
2959    private static final boolean isSet(int options, int flag) {
2960        return (options & flag) == flag;
2961    }
2962
2963    /**
2964     * Creates a new RegularExpression instance.
2965     *
2966     * @param regex A regular expression
2967     * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
2968     */

2969    public RegularExpression(String JavaDoc regex) throws ParseException {
2970        this.setPattern(regex, null);
2971    }
2972
/**
 * Creates a new RegularExpression instance from a pattern and an option string.
 *
 * @param regex A regular expression
 * @param options A String consisted of "i" "m" "s" "u" "w" "," "X"
 * @exception ParseException <VAR>regex</VAR> is not conforming to the syntax.
 */
public RegularExpression(String regex, String options) throws ParseException {
    // Delegates to the (String, String) overload, which converts the option string.
    this.setPattern(regex, options);
}
2983
/**
 * Creates an instance from an already parsed token tree; no parsing happens here.
 *
 * @param regex the pattern text, stored as given
 * @param tok the token tree for the pattern
 * @param parens stored as the group count reported by getNumberOfGroups()
 * @param hasBackReferences stored as the back-reference flag
 * @param options the option bit set
 */
RegularExpression(String regex, Token tok, int parens, boolean hasBackReferences, int options) {
    this.regex = regex;
    this.options = options;
    this.tokentree = tok;
    this.hasBackReferences = hasBackReferences;
    this.nofparen = parens;
}
2991
2992    /**
2993     *
2994     */

2995    public void setPattern(String JavaDoc newPattern) throws ParseException {
2996        this.setPattern(newPattern, this.options);
2997    }
2998
2999    private void setPattern(String JavaDoc newPattern, int options) throws ParseException {
3000        this.regex = newPattern;
3001        this.options = options;
3002        RegexParser rp = RegularExpression.isSet(this.options, RegularExpression.XMLSCHEMA_MODE)
3003                         ? new ParserForXMLSchema() : new RegexParser();
3004        this.tokentree = rp.parse(this.regex, this.options);
3005        this.nofparen = rp.parennumber;
3006        this.hasBackReferences = rp.hasBackReferences;
3007
3008        this.operations = null;
3009        this.context = null;
3010    }
3011    /**
3012     *
3013     */

3014    public void setPattern(String JavaDoc newPattern, String JavaDoc options) throws ParseException {
3015        this.setPattern(newPattern, REUtil.parseOptions(options));
3016    }
3017
3018    /**
3019     *
3020     */

3021    public String JavaDoc getPattern() {
3022        return this.regex;
3023    }
3024
3025    /**
3026     * Represents this instence in String.
3027     */

3028    public String JavaDoc toString() {
3029        return this.tokentree.toString(this.options);
3030    }
3031
3032    /**
3033     * Returns a option string.
3034     * The order of letters in it may be different from a string specified
3035     * in a constructor or <code>setPattern()</code>.
3036     *
3037     * @see #RegularExpression(java.lang.String,java.lang.String)
3038     * @see #setPattern(java.lang.String,java.lang.String)
3039     */

3040    public String JavaDoc getOptions() {
3041        return REUtil.createOptionString(this.options);
3042    }
3043
3044    /**
3045     * Return true if patterns are the same and the options are equivalent.
3046     */

3047    public boolean equals(Object JavaDoc obj) {
3048        if (obj == null) return false;
3049        if (!(obj instanceof RegularExpression))
3050            return false;
3051        RegularExpression r = (RegularExpression)obj;
3052        return this.regex.equals(r.regex) && this.options == r.options;
3053    }
3054
3055    boolean equals(String JavaDoc pattern, int options) {
3056        return this.regex.equals(pattern) && this.options == options;
3057    }
3058
3059    /**
3060     *
3061     */

3062    public int hashCode() {
3063        return (this.regex+"/"+this.getOptions()).hashCode();
3064    }
3065
3066    /**
3067     * Return the number of regular expression groups.
3068     * This method returns 1 when the regular expression has no capturing-parenthesis.
3069     *
3070     */

3071    public int getNumberOfGroups() {
3072        return this.nofparen;
3073    }
3074
3075    // ================================================================
3076

3077    private static final int WT_IGNORE = 0;
3078    private static final int WT_LETTER = 1;
3079    private static final int WT_OTHER = 2;
3080    private static final int getWordType0(char ch, int opts) {
3081        if (!isSet(opts, UNICODE_WORD_BOUNDARY)) {
3082            if (isSet(opts, USE_UNICODE_CATEGORY)) {
3083                return (Token.getRange("IsWord", true).match(ch)) ? WT_LETTER : WT_OTHER;
3084            }
3085            return isWordChar(ch) ? WT_LETTER : WT_OTHER;
3086        }
3087
3088        switch (Character.getType(ch)) {
3089        case Character.UPPERCASE_LETTER: // L
3090
case Character.LOWERCASE_LETTER: // L
3091
case Character.TITLECASE_LETTER: // L
3092
case Character.MODIFIER_LETTER: // L
3093
case Character.OTHER_LETTER: // L
3094
case Character.LETTER_NUMBER: // N
3095
case Character.DECIMAL_DIGIT_NUMBER: // N
3096
case Character.OTHER_NUMBER: // N
3097
case Character.COMBINING_SPACING_MARK: // Mc
3098
return WT_LETTER;
3099
3100        case Character.FORMAT: // Cf
3101
case Character.NON_SPACING_MARK: // Mn
3102
case Character.ENCLOSING_MARK: // Mc
3103
return WT_IGNORE;
3104
3105        case Character.CONTROL: // Cc
3106
switch (ch) {
3107            case '\t':
3108            case '\n':
3109            case '\u000B':
3110            case '\f':
3111            case '\r':
3112                return WT_OTHER;
3113            default:
3114                return WT_IGNORE;
3115            }
3116
3117        default:
3118            return WT_OTHER;
3119        }
3120    }
3121
3122    // ================================================================
3123

3124    static final int LINE_FEED = 0x000A;
3125    static final int CARRIAGE_RETURN = 0x000D;
3126    static final int LINE_SEPARATOR = 0x2028;
3127    static final int PARAGRAPH_SEPARATOR = 0x2029;
3128
3129    private static final boolean isEOLChar(int ch) {
3130        return ch == LINE_FEED || ch == CARRIAGE_RETURN || ch == LINE_SEPARATOR
3131        || ch == PARAGRAPH_SEPARATOR;
3132    }
3133
3134    private static final boolean isWordChar(int ch) { // Legacy word characters
3135
if (ch == '_') return true;
3136        if (ch < '0') return false;
3137        if (ch > 'z') return false;
3138        if (ch <= '9') return true;
3139        if (ch < 'A') return false;
3140        if (ch <= 'Z') return true;
3141        if (ch < 'a') return false;
3142        return true;
3143    }
3144
3145    private static final boolean matchIgnoreCase(int chardata, int ch) {
3146        if (chardata == ch) return true;
3147        if (chardata > 0xffff || ch > 0xffff) return false;
3148        char uch1 = Character.toUpperCase((char)chardata);
3149        char uch2 = Character.toUpperCase((char)ch);
3150        if (uch1 == uch2) return true;
3151        return Character.toLowerCase(uch1) == Character.toLowerCase(uch2);
3152    }
3153}
3154
Popular Tags