KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > sf > saxon > type > RegexTranslator


1 package net.sf.saxon.type;
2
import net.sf.saxon.om.FastStringBuffer;
import net.sf.saxon.om.XMLChar;

import java.math.BigDecimal;
import java.util.*;
8
/**
 * This class translates XML Schema regex syntax into JDK 1.4 regex syntax.
 * Author: James Clark
 * Modified by Michael Kay (a) to integrate the code into Saxon, and (b) to support XPath additions
 * to the XML Schema regex syntax.
 */

15 public class RegexTranslator {
16
17
    /**
     * Translates XML Schema regexes into <code>java.util.regex</code> regexes.
     *
     * @see java.util.regex.Pattern
     * @see <a href="http://www.w3.org/TR/xmlschema-2/#regexs">XML Schema Part 2</a>
     */

24
25     private final CharSequence JavaDoc regExp;
26     private boolean isXPath;
27     private int pos = 0;
28     private final int length;
29     private char curChar;
30     private boolean eos = false;
31     private final FastStringBuffer result = new FastStringBuffer(32);
32
33     private static final String JavaDoc categories = "LMNPZSC";
34     private static final CharClass[] categoryCharClasses = new CharClass[categories.length()];
35     private static final String JavaDoc subCategories = "LuLlLtLmLoMnMcMeNdNlNoPcPdPsPePiPfPoZsZlZpSmScSkSoCcCfCoCn";
36     private static final CharClass[] subCategoryCharClasses = new CharClass[subCategories.length() / 2];
37
38     private static final int NONBMP_MIN = 0x10000;
39     private static final int NONBMP_MAX = 0x10FFFF;
40     private static final char SURROGATE2_MIN = '\uDC00';
41     private static final char SURROGATE2_MAX = '\uDFFF';
42
43     //static final Localizer localizer = new Localizer(RegexTranslator.class);
44

45     private static final String JavaDoc[] blockNames = {
46         "BasicLatin",
47         "Latin-1Supplement",
48         "LatinExtended-A",
49         "LatinExtended-B",
50         "IPAExtensions",
51         "SpacingModifierLetters",
52         "CombiningDiacriticalMarks",
53         "Greek",
54         "Cyrillic",
55         "Armenian",
56         "Hebrew",
57         "Arabic",
58         "Syriac",
59         "Thaana",
60         "Devanagari",
61         "Bengali",
62         "Gurmukhi",
63         "Gujarati",
64         "Oriya",
65         "Tamil",
66         "Telugu",
67         "Kannada",
68         "Malayalam",
69         "Sinhala",
70         "Thai",
71         "Lao",
72         "Tibetan",
73         "Myanmar",
74         "Georgian",
75         "HangulJamo",
76         "Ethiopic",
77         "Cherokee",
78         "UnifiedCanadianAboriginalSyllabics",
79         "Ogham",
80         "Runic",
81         "Khmer",
82         "Mongolian",
83         "LatinExtendedAdditional",
84         "GreekExtended",
85         "GeneralPunctuation",
86         "SuperscriptsandSubscripts",
87         "CurrencySymbols",
88         "CombiningMarksforSymbols",
89         "LetterlikeSymbols",
90         "NumberForms",
91         "Arrows",
92         "MathematicalOperators",
93         "MiscellaneousTechnical",
94         "ControlPictures",
95         "OpticalCharacterRecognition",
96         "EnclosedAlphanumerics",
97         "BoxDrawing",
98         "BlockElements",
99         "GeometricShapes",
100         "MiscellaneousSymbols",
101         "Dingbats",
102         "BraillePatterns",
103         "CJKRadicalsSupplement",
104         "KangxiRadicals",
105         "IdeographicDescriptionCharacters",
106         "CJKSymbolsandPunctuation",
107         "Hiragana",
108         "Katakana",
109         "Bopomofo",
110         "HangulCompatibilityJamo",
111         "Kanbun",
112         "BopomofoExtended",
113         "EnclosedCJKLettersandMonths",
114         "CJKCompatibility",
115         "CJKUnifiedIdeographsExtensionA",
116         "CJKUnifiedIdeographs",
117         "YiSyllables",
118         "YiRadicals",
119         "HangulSyllables",
120         // surrogates excluded because there are never any *characters* with codes in surrogate range
121
// "PrivateUse", excluded because 3.1 adds non-BMP ranges
122
"CJKCompatibilityIdeographs",
123         "AlphabeticPresentationForms",
124         "ArabicPresentationForms-A",
125         "CombiningHalfMarks",
126         "CJKCompatibilityForms",
127         "SmallFormVariants",
128         "ArabicPresentationForms-B",
129         "Specials",
130         "HalfwidthandFullwidthForms",
131         "Specials"
132     };
133
134
135     /**
136      * Names of blocks including ranges outside the BMP.
137      */

138     private static final String JavaDoc[] specialBlockNames = {
139         "OldItalic",
140         "Gothic",
141         "Deseret",
142         "ByzantineMusicalSymbols",
143         "MusicalSymbols",
144         "MathematicalAlphanumericSymbols",
145         "CJKUnifiedIdeographsExtensionB",
146         "CJKCompatibilityIdeographsSupplement",
147         "Tags",
148         "PrivateUse",
149         "HighSurrogates",
150         "HighPrivateUseSurrogates",
151         "LowSurrogates",
152     };
153
154 // This file was automatically generated by CategoriesGen
155

156       static final String JavaDoc CATEGORY_NAMES = "NoLoMnCfLlNlPoLuMcNdSoSmCo";
157
158       static final int[][] CATEGORY_RANGES = {
159         {
160           // No
161
0x10107, 0x10133,
162           0x10320, 0x10323
163         },
164         {
165           // Lo
166
0x10000, 0x1000b,
167           0x1000d, 0x10026,
168           0x10028, 0x1003a,
169           0x1003c, 0x1003d,
170           0x1003f, 0x1004d,
171           0x10050, 0x1005d,
172           0x10080, 0x100fa,
173           0x10300, 0x1031e,
174           0x10330, 0x10349,
175           0x10380, 0x1039d,
176           0x10450, 0x1049d,
177           0x10800, 0x10805,
178           0x10808, 0x10808,
179           0x1080a, 0x10835,
180           0x10837, 0x10838,
181           0x1083c, 0x1083c,
182           0x1083f, 0x1083f,
183           0x20000, 0x2a6d6,
184           0x2f800, 0x2fa1d
185         },
186         {
187           // Mn
188
0x1d167, 0x1d169,
189           0x1d17b, 0x1d182,
190           0x1d185, 0x1d18b,
191           0x1d1aa, 0x1d1ad,
192           0xe0100, 0xe01ef
193         },
194         {
195           // Cf
196
0x1d173, 0x1d17a,
197           0xe0001, 0xe0001,
198           0xe0020, 0xe007f
199         },
200         {
201           // Ll
202
0x10428, 0x1044f,
203           0x1d41a, 0x1d433,
204           0x1d44e, 0x1d454,
205           0x1d456, 0x1d467,
206           0x1d482, 0x1d49b,
207           0x1d4b6, 0x1d4b9,
208           0x1d4bb, 0x1d4bb,
209           0x1d4bd, 0x1d4c3,
210           0x1d4c5, 0x1d4cf,
211           0x1d4ea, 0x1d503,
212           0x1d51e, 0x1d537,
213           0x1d552, 0x1d56b,
214           0x1d586, 0x1d59f,
215           0x1d5ba, 0x1d5d3,
216           0x1d5ee, 0x1d607,
217           0x1d622, 0x1d63b,
218           0x1d656, 0x1d66f,
219           0x1d68a, 0x1d6a3,
220           0x1d6c2, 0x1d6da,
221           0x1d6dc, 0x1d6e1,
222           0x1d6fc, 0x1d714,
223           0x1d716, 0x1d71b,
224           0x1d736, 0x1d74e,
225           0x1d750, 0x1d755,
226           0x1d770, 0x1d788,
227           0x1d78a, 0x1d78f,
228           0x1d7aa, 0x1d7c2,
229           0x1d7c4, 0x1d7c9
230         },
231         {
232           // Nl
233
0x1034a, 0x1034a
234         },
235         {
236           // Po
237
0x10100, 0x10101,
238           0x1039f, 0x1039f
239         },
240         {
241           // Lu
242
0x10400, 0x10427,
243           0x1d400, 0x1d419,
244           0x1d434, 0x1d44d,
245           0x1d468, 0x1d481,
246           0x1d49c, 0x1d49c,
247           0x1d49e, 0x1d49f,
248           0x1d4a2, 0x1d4a2,
249           0x1d4a5, 0x1d4a6,
250           0x1d4a9, 0x1d4ac,
251           0x1d4ae, 0x1d4b5,
252           0x1d4d0, 0x1d4e9,
253           0x1d504, 0x1d505,
254           0x1d507, 0x1d50a,
255           0x1d50d, 0x1d514,
256           0x1d516, 0x1d51c,
257           0x1d538, 0x1d539,
258           0x1d53b, 0x1d53e,
259           0x1d540, 0x1d544,
260           0x1d546, 0x1d546,
261           0x1d54a, 0x1d550,
262           0x1d56c, 0x1d585,
263           0x1d5a0, 0x1d5b9,
264           0x1d5d4, 0x1d5ed,
265           0x1d608, 0x1d621,
266           0x1d63c, 0x1d655,
267           0x1d670, 0x1d689,
268           0x1d6a8, 0x1d6c0,
269           0x1d6e2, 0x1d6fa,
270           0x1d71c, 0x1d734,
271           0x1d756, 0x1d76e,
272           0x1d790, 0x1d7a8
273         },
274         {
275           // Mc
276
0x1d165, 0x1d166,
277           0x1d16d, 0x1d172
278         },
279         {
280           // Nd
281
0x104a0, 0x104a9,
282           0x1d7ce, 0x1d7ff
283         },
284         {
285           // So
286
0x10102, 0x10102,
287           0x10137, 0x1013f,
288           0x1d000, 0x1d0f5,
289           0x1d100, 0x1d126,
290           0x1d12a, 0x1d164,
291           0x1d16a, 0x1d16c,
292           0x1d183, 0x1d184,
293           0x1d18c, 0x1d1a9,
294           0x1d1ae, 0x1d1dd,
295           0x1d300, 0x1d356
296         },
297         {
298           // Sm
299
0x1d6c1, 0x1d6c1,
300           0x1d6db, 0x1d6db,
301           0x1d6fb, 0x1d6fb,
302           0x1d715, 0x1d715,
303           0x1d735, 0x1d735,
304           0x1d74f, 0x1d74f,
305           0x1d76f, 0x1d76f,
306           0x1d789, 0x1d789,
307           0x1d7a9, 0x1d7a9,
308           0x1d7c3, 0x1d7c3
309         },
310         {
311           // Co
312
0xf0000, 0xffffd,
313           0x100000, 0x10fffd
314         }
315       };
316
317    // end of generated code
318

319     /**
320      * CharClass for each block name in specialBlockNames.
321      */

322     private static final CharClass[] specialBlockCharClasses = {
323         new CharRange(0x10300, 0x1032F),
324         new CharRange(0x10330, 0x1034F),
325         new CharRange(0x10400, 0x1044F),
326         new CharRange(0x1D000, 0x1D0FF),
327         new CharRange(0x1D100, 0x1D1FF),
328         new CharRange(0x1D400, 0x1D7FF),
329         new CharRange(0x20000, 0x2A6D6),
330         new CharRange(0x2F800, 0x2FA1F),
331         new CharRange(0xE0000, 0xE007F),
332         new Union(new CharClass[]{
333             new CharRange(0xE000, 0xF8FF),
334             new CharRange(0xF0000, 0xFFFFD),
335             new CharRange(0x100000, 0x10FFFD)
336         }),
337         Empty.getInstance(),
338         Empty.getInstance(),
339         Empty.getInstance()
340     };
341
342     private static final CharClass DOT = new Complement(new Union(new CharClass[]{new SingleChar('\n'), new SingleChar('\r')}));
343
344     private static final CharClass ESC_d = new Property("Nd");
345
346     private static final CharClass ESC_D = new Complement(ESC_d);
347
348     private static final CharClass ESC_W = new Union(new CharClass[]{new Property("P"), new Property("Z"), new Property("C")});
349
350     private static final CharClass ESC_w = new Complement(ESC_W);
351
352     private static final CharClass ESC_s = new Union(new CharClass[]{
353         new SingleChar(' '),
354         new SingleChar('\n'),
355         new SingleChar('\r'),
356         new SingleChar('\t')
357     });
358
359 // This file was automatically generated by NamingExceptionsGen
360
// class NamingExceptions {
361
static final String JavaDoc NMSTRT_INCLUDES =
362             "\u003A\u005F\u02BB\u02BC\u02BD\u02BE\u02BF\u02C0\u02C1\u0559" +
363             "\u06E5\u06E6\u212E";
364     static final String JavaDoc NMSTRT_EXCLUDE_RANGES =
365             "\u00AA\u00BA\u0132\u0133\u013F\u0140\u0149\u0149\u017F\u017F" +
366             "\u01C4\u01CC\u01F1\u01F3\u01F6\u01F9\u0218\u0233\u02A9\u02AD" +
367             "\u03D7\u03D7\u03DB\u03DB\u03DD\u03DD\u03DF\u03DF\u03E1\u03E1" +
368             "\u0400\u0400\u040D\u040D\u0450\u0450\u045D\u045D\u048C\u048F" +
369             "\u04EC\u04ED\u0587\u0587\u06B8\u06B9\u06BF\u06BF\u06CF\u06CF" +
370             "\u06FA\u07A5\u0950\u0950\u0AD0\u0AD0\u0D85\u0DC6\u0E2F\u0E2F" +
371             "\u0EAF\u0EAF\u0EDC\u0F00\u0F6A\u1055\u1101\u1101\u1104\u1104" +
372             "\u1108\u1108\u110A\u110A\u110D\u110D\u1113\u113B\u113D\u113D" +
373             "\u113F\u113F\u1141\u114B\u114D\u114D\u114F\u114F\u1151\u1153" +
374             "\u1156\u1158\u1162\u1162\u1164\u1164\u1166\u1166\u1168\u1168" +
375             "\u116A\u116C\u116F\u1171\u1174\u1174\u1176\u119D\u119F\u11A2" +
376             "\u11A9\u11AA\u11AC\u11AD\u11B0\u11B6\u11B9\u11B9\u11BB\u11BB" +
377             "\u11C3\u11EA\u11EC\u11EF\u11F1\u11F8\u1200\u18A8\u207F\u2124" +
378             "\u2128\u2128\u212C\u212D\u212F\u217F\u2183\u3006\u3038\u303A" +
379             "\u3131\u4DB5\uA000\uA48C\uF900\uFFDC";
380     static final String JavaDoc NMSTRT_CATEGORIES = "LlLuLoLtNl";
381     static final String JavaDoc NMCHAR_INCLUDES =
382             "\u002D\u002E\u003A\u005F\u00B7\u0387\u212E";
383     static final String JavaDoc NMCHAR_EXCLUDE_RANGES =
384             "\u00AA\u00B5\u00BA\u00BA\u0132\u0133\u013F\u0140\u0149\u0149" +
385             "\u017F\u017F\u01C4\u01CC\u01F1\u01F3\u01F6\u01F9\u0218\u0233" +
386             "\u02A9\u02B8\u02E0\u02EE\u0346\u034E\u0362\u037A\u03D7\u03D7" +
387             "\u03DB\u03DB\u03DD\u03DD\u03DF\u03DF\u03E1\u03E1\u0400\u0400" +
388             "\u040D\u040D\u0450\u0450\u045D\u045D\u0488\u048F\u04EC\u04ED" +
389             "\u0587\u0587\u0653\u0655\u06B8\u06B9\u06BF\u06BF\u06CF\u06CF" +
390             "\u06FA\u07B0\u0950\u0950\u0AD0\u0AD0\u0D82\u0DF3\u0E2F\u0E2F" +
391             "\u0EAF\u0EAF\u0EDC\u0F00\u0F6A\u0F6A\u0F96\u0F96\u0FAE\u0FB0" +
392             "\u0FB8\u0FB8\u0FBA\u1059\u1101\u1101\u1104\u1104\u1108\u1108" +
393             "\u110A\u110A\u110D\u110D\u1113\u113B\u113D\u113D\u113F\u113F" +
394             "\u1141\u114B\u114D\u114D\u114F\u114F\u1151\u1153\u1156\u1158" +
395             "\u1162\u1162\u1164\u1164\u1166\u1166\u1168\u1168\u116A\u116C" +
396             "\u116F\u1171\u1174\u1174\u1176\u119D\u119F\u11A2\u11A9\u11AA" +
397             "\u11AC\u11AD\u11B0\u11B6\u11B9\u11B9\u11BB\u11BB\u11C3\u11EA" +
398             "\u11EC\u11EF\u11F1\u11F8\u1200\u18A9\u207F\u207F\u20DD\u20E0" +
399             "\u20E2\u2124\u2128\u2128\u212C\u212D\u212F\u217F\u2183\u2183" +
400             "\u3006\u3006\u3038\u303A\u3131\u4DB5\uA000\uA48C\uF900\uFFDC";
401     static final String JavaDoc NMCHAR_CATEGORIES = "LlLuLoLtNlMcMeMnLmNd";
402 // end of generated code
403

404     private static final CharClass ESC_S = new Complement(ESC_s);
405
406     private static final CharClass ESC_i = makeCharClass(NMSTRT_CATEGORIES,
407                                                          NMSTRT_INCLUDES,
408                                                          NMSTRT_EXCLUDE_RANGES);
409
410     private static final CharClass ESC_I = new Complement(ESC_i);
411
412     private static final CharClass ESC_c = makeCharClass(NMCHAR_CATEGORIES,
413                                                          NMCHAR_INCLUDES,
414                                                          NMCHAR_EXCLUDE_RANGES);
415
416     private static final CharClass ESC_C = new Complement(ESC_c);
417
418     private static final char EOS = '\0';
419
/**
 * Creates a translator for the given expression and loads its first character
 * into {@code curChar}.
 *
 * @param regExp the regular expression to be translated
 */
private RegexTranslator(CharSequence regExp) {
    this.regExp = regExp;
    this.length = regExp.length();
    advance();
}
425
426     /**
427      * Translates a regular expression in the syntax of XML Schemas Part 2 into a regular
428      * expression in the syntax of <code>java.util.regex.Pattern</code>. The translation
429      * assumes that the string to be matched against the regex uses surrogate pairs correctly.
430      * If the string comes from XML content, a conforming XML parser will automatically
431      * check this; if the string comes from elsewhere, it may be necessary to check
432      * surrogate usage before matching.
433      *
434      * @param regexp a String containing a regular expression in the syntax of XML Schemas Part 2
435      * @param xpath a boolean indicating whether the XPath 2.0 F+O extensions to the schema
436      * regex syntax are permitted
437      * @return a String containing a regular expression in the syntax of java.util.regex.Pattern
438      * @throws RegexSyntaxException if <code>regexp</code> is not a regular expression in the
439      * syntax of XML Schemas Part 2
440      * @see java.util.regex.Pattern
441      * @see <a HREF="http://www.w3.org/TR/xmlschema-2/#regexs">XML Schema Part 2</a>
442      */

443     public static String JavaDoc translate(CharSequence JavaDoc regexp, boolean xpath) throws RegexSyntaxException {
444         RegexTranslator tr = new RegexTranslator(regexp);
445         tr.isXPath = xpath;
446         tr.translateTop();
447         return tr.result.toString();
448     }
449
450     private void advance() {
451         if (pos < length)
452             curChar = regExp.charAt(pos++);
453         else {
454             pos++;
455             curChar = EOS;
456             eos = true;
457         }
458     }
459
460     private void translateTop() throws RegexSyntaxException {
461         translateRegExp();
462         if (!eos)
463             throw makeException("expected end of string");
464     }
465
466     private void translateRegExp() throws RegexSyntaxException {
467         translateBranch();
468         while (curChar == '|') {
469             copyCurChar();
470             translateBranch();
471         }
472     }
473
474     private void translateBranch() throws RegexSyntaxException {
475         while (translateAtom())
476             translateQuantifier();
477     }
478
479     private void translateQuantifier() throws RegexSyntaxException {
480         switch (curChar) {
481         case '*':
482         case '?':
483         case '+':
484             copyCurChar();
485             break;
486         case '{':
487             copyCurChar();
488             translateQuantity();
489             expect('}');
490             copyCurChar();
491             break;
492         default:
493             return;
494         }
495         if (curChar=='?' && isXPath) {
496             copyCurChar();
497         }
498     }
499
500     private void translateQuantity() throws RegexSyntaxException {
501         String JavaDoc lower = parseQuantExact().toString();
502         int lowerValue = -1;
503         try {
504             lowerValue = Integer.parseInt(lower);
505             result.append(lower);
506         } catch (NumberFormatException JavaDoc e) {
507             // JDK 1.4 cannot handle ranges bigger than this
508
result.append(""+Integer.MAX_VALUE);
509         }
510         if (curChar == ',') {
511             copyCurChar();
512             if (curChar != '}') {
513                 String JavaDoc upper = parseQuantExact().toString();
514                 try {
515                     int upperValue = Integer.parseInt(upper);
516                     result.append(upper);
517                     if (lowerValue < 0 || upperValue < lowerValue)
518                         throw makeException("invalid range in quantifier");
519                 } catch (NumberFormatException JavaDoc e) {
520                     result.append(""+Integer.MAX_VALUE);
521                     if (lowerValue < 0 && new BigDecimal JavaDoc(lower).compareTo(new BigDecimal JavaDoc(upper)) > 0)
522                         throw makeException("invalid range in quantifier");
523                 }
524             }
525         }
526     }
527
528     private CharSequence JavaDoc parseQuantExact() throws RegexSyntaxException {
529         FastStringBuffer buf = new FastStringBuffer(10);
530         do {
531             if ("0123456789".indexOf(curChar) < 0)
532                 throw makeException("expected digit in quantifier");
533             buf.append(curChar);
534             advance();
535         } while (curChar != ',' && curChar != '}');
536         return buf;
537     }
538
539     private void copyCurChar() {
540         result.append(curChar);
541         advance();
542     }
543
544     static final int NONE = -1;
545     static final int SOME = 0;
546     static final int ALL = 1;
547
548     static final String JavaDoc SURROGATES1_CLASS = "[\uD800-\uDBFF]";
549     static final String JavaDoc SURROGATES2_CLASS = "[\uDC00-\uDFFF]";
550     static final String JavaDoc NOT_ALLOWED_CLASS = "[\u0000&&[^\u0000]]";
551
552     static final class Range implements Comparable JavaDoc {
553         private final int min;
554         private final int max;
555
556         Range(int min, int max) {
557             this.min = min;
558             this.max = max;
559         }
560
561         int getMin() {
562             return min;
563         }
564
565         int getMax() {
566             return max;
567         }
568
569         public int compareTo(Object JavaDoc o) {
570             Range other = (Range)o;
571             if (this.min < other.min)
572                 return -1;
573             if (this.min > other.min)
574                 return 1;
575             if (this.max > other.max)
576                 return -1;
577             if (this.max < other.max)
578                 return 1;
579             return 0;
580         }
581     }
582
583     static abstract class CharClass {
584
585         private final int containsBmp;
586         // if it contains ALL and containsBmp != NONE, then the generated class for containsBmp must
587
// contain all the high surrogates
588
private final int containsNonBmp;
589
590         protected CharClass(int containsBmp, int containsNonBmp) {
591             this.containsBmp = containsBmp;
592             this.containsNonBmp = containsNonBmp;
593         }
594
595         int getContainsBmp() {
596             return containsBmp;
597         }
598
599         int getContainsNonBmp() {
600             return containsNonBmp;
601         }
602
603         final void output(FastStringBuffer buf) {
604             switch (containsNonBmp) {
605             case NONE:
606                 if (containsBmp == NONE)
607                     buf.append(NOT_ALLOWED_CLASS);
608                 else
609                     outputBmp(buf);
610                 break;
611             case ALL:
612                 buf.append("(?:");
613                 if (containsBmp == NONE) {
614                     buf.append(SURROGATES1_CLASS);
615                     buf.append(SURROGATES2_CLASS);
616                 } else {
617                     outputBmp(buf);
618                     buf.append(SURROGATES2_CLASS);
619                     buf.append('?');
620                 }
621                 buf.append(')');
622                 break;
623             case SOME:
624                 buf.append("(?:");
625                 boolean needSep = false;
626                 if (containsBmp != NONE) {
627                     needSep = true;
628                     outputBmp(buf);
629                 }
630                 List ranges = new ArrayList(10);
631                 addNonBmpRanges(ranges);
632                 sortRangeList(ranges);
633                 String JavaDoc hi = highSurrogateRanges(ranges);
634                 if (hi.length() > 0) {
635                     if (needSep)
636                         buf.append('|');
637                     else
638                         needSep = true;
639                     buf.append('[');
640                     for (int i = 0, len = hi.length(); i < len; i += 2) {
641                         char min = hi.charAt(i);
642                         char max = hi.charAt(i + 1);
643                         if (min == max)
644                             buf.append(min);
645                         else {
646                             buf.append(min);
647                             buf.append('-');
648                             buf.append(max);
649                         }
650                     }
651                     buf.append(']');
652                     buf.append(SURROGATES2_CLASS);
653                 }
654                 String JavaDoc lo = lowSurrogateRanges(ranges);
655                 for (int i = 0, len = lo.length(); i < len; i += 3) {
656                     if (needSep)
657                         buf.append('|');
658                     else
659                         needSep = true;
660                     buf.append(lo.charAt(i));
661                     char min = lo.charAt(i + 1);
662                     char max = lo.charAt(i + 2);
663                     if (min == max && (i + 3 >= len || lo.charAt(i + 3) != lo.charAt(i)))
664                         buf.append(min);
665                     else {
666                         buf.append('[');
667                         for (; ;) {
668                             if (min == max)
669                                 buf.append(min);
670                             else {
671                                 buf.append(min);
672                                 buf.append('-');
673                                 buf.append(max);
674                             }
675                             if (i + 3 >= len || lo.charAt(i + 3) != lo.charAt(i))
676                                 break;
677                             i += 3;
678                             min = lo.charAt(i + 1);
679                             max = lo.charAt(i + 2);
680                         }
681                         buf.append(']');
682                     }
683                 }
684                 if (!needSep)
685                     buf.append(NOT_ALLOWED_CLASS);
686                 buf.append(')');
687                 break;
688             }
689         }
690
691         static String JavaDoc highSurrogateRanges(List ranges) {
692             FastStringBuffer highRanges = new FastStringBuffer(ranges.size() * 2);
693             for (int i = 0, len = ranges.size(); i < len; i++) {
694                 Range r = (Range)ranges.get(i);
695                 char min1 = XMLChar.highSurrogate(r.getMin());
696                 char min2 = XMLChar.lowSurrogate(r.getMin());
697                 char max1 = XMLChar.highSurrogate(r.getMax());
698                 char max2 = XMLChar.lowSurrogate(r.getMax());
699                 if (min2 != SURROGATE2_MIN)
700                     min1++;
701                 if (max2 != SURROGATE2_MAX)
702                     max1--;
703                 if (max1 >= min1) {
704                     highRanges.append(min1);
705                     highRanges.append(max1);
706                 }
707             }
708             return highRanges.toString();
709         }
710
711         static String JavaDoc lowSurrogateRanges(List ranges) {
712             FastStringBuffer lowRanges = new FastStringBuffer(ranges.size() * 2);
713             for (int i = 0, len = ranges.size(); i < len; i++) {
714                 Range r = (Range)ranges.get(i);
715                 char min1 = XMLChar.highSurrogate(r.getMin());
716                 char min2 = XMLChar.lowSurrogate(r.getMin());
717                 char max1 = XMLChar.highSurrogate(r.getMax());
718                 char max2 = XMLChar.lowSurrogate(r.getMax());
719                 if (min1 == max1) {
720                     if (min2 != SURROGATE2_MIN || max2 != SURROGATE2_MAX) {
721                         lowRanges.append(min1);
722                         lowRanges.append(min2);
723                         lowRanges.append(max2);
724                     }
725                 } else {
726                     if (min2 != SURROGATE2_MIN) {
727                         lowRanges.append(min1);
728                         lowRanges.append(min2);
729                         lowRanges.append(SURROGATE2_MAX);
730                     }
731                     if (max2 != SURROGATE2_MAX) {
732                         lowRanges.append(max1);
733                         lowRanges.append(SURROGATE2_MIN);
734                         lowRanges.append(max2);
735                     }
736                 }
737             }
738             return lowRanges.toString();
739         }
740
741         abstract void outputBmp(FastStringBuffer buf);
742
743         abstract void outputComplementBmp(FastStringBuffer buf);
744
745         int getSingleChar() {
746             return -1;
747         }
748
749         void addNonBmpRanges(List ranges) {
750         }
751
752
753         static void sortRangeList(List ranges) {
754             Collections.sort(ranges);
755             int toIndex = 0;
756             int fromIndex = 0;
757             int len = ranges.size();
758             while (fromIndex < len) {
759                 Range r = (Range)ranges.get(fromIndex);
760                 int min = r.getMin();
761                 int max = r.getMax();
762                 while (++fromIndex < len) {
763                     Range r2 = (Range)ranges.get(fromIndex);
764                     if (r2.getMin() > max + 1)
765                         break;
766                     if (r2.getMax() > max)
767                         max = r2.getMax();
768                 }
769                 if (max != r.getMax())
770                     r = new Range(min, max);
771                 ranges.set(toIndex++, r);
772             }
773             while (len > toIndex)
774                 ranges.remove(--len);
775         }
776
777     }
778
779     static abstract class SimpleCharClass extends CharClass {
780         SimpleCharClass(int containsBmp, int containsNonBmp) {
781             super(containsBmp, containsNonBmp);
782         }
783
784         void outputBmp(FastStringBuffer buf) {
785             buf.append('[');
786             inClassOutputBmp(buf);
787             buf.append(']');
788         }
789
790         // must not call if containsBmp == ALL
791
void outputComplementBmp(FastStringBuffer buf) {
792             if (getContainsBmp() == NONE)
793                 buf.append("[\u0000-\uFFFF]");
794             else {
795                 buf.append("[^");
796                 inClassOutputBmp(buf);
797                 buf.append(']');
798             }
799         }
800
801         abstract void inClassOutputBmp(FastStringBuffer buf);
802     }
803
804     static class SingleChar extends SimpleCharClass {
805         private final char c;
806
807         SingleChar(char c) {
808             super(SOME, NONE);
809             this.c = c;
810         }
811
812         int getSingleChar() {
813             return c;
814         }
815
816         void outputBmp(FastStringBuffer buf) {
817             inClassOutputBmp(buf);
818         }
819
820         void inClassOutputBmp(FastStringBuffer buf) {
821             if (isJavaMetaChar(c)) {
822                 buf.append('\\');
823                 buf.append(c);
824             } else {
825                 switch (c) {
826                     case '\r':
827                         buf.append("\\r");
828                         break;
829                     case '\n':
830                         buf.append("\\n");
831                         break;
832                     case '\t':
833                         buf.append("\\t");
834                         break;
835                     case ' ':
836                         buf.append("\\x20");
837                         break;
838                     default:
839                         buf.append(c);
840                 }
841             }
842             return;
843         }
844
845     }
846
847     static class WideSingleChar extends SimpleCharClass {
848         private final int c;
849
850         WideSingleChar(int c) {
851             super(NONE, SOME);
852             this.c = c;
853         }
854
855         void inClassOutputBmp(FastStringBuffer buf) {
856             throw new RuntimeException JavaDoc("BMP output botch");
857         }
858
859         int getSingleChar() {
860             return c;
861         }
862
863         void addNonBmpRanges(List ranges) {
864             ranges.add(new Range(c, c));
865         }
866     }
867
868     static class Empty extends SimpleCharClass {
869         private static final Empty instance = new Empty();
870
871         private Empty() {
872             super(NONE, NONE);
873         }
874
875         static Empty getInstance() {
876             return instance;
877         }
878
879         void inClassOutputBmp(FastStringBuffer buf) {
880             throw new RuntimeException JavaDoc("BMP output botch");
881         }
882
883     }
884
885     static class CharRange extends SimpleCharClass {
886         private final int lower;
887         private final int upper;
888
889         CharRange(int lower, int upper) {
890             super(lower < NONBMP_MIN ? SOME : NONE,
891                   // don't use ALL here, because that requires that the BMP class contains high surrogates
892
upper >= NONBMP_MIN ? SOME : NONE);
893             this.lower = lower;
894             this.upper = upper;
895         }
896
897         void inClassOutputBmp(FastStringBuffer buf) {
898             if (lower >= NONBMP_MIN)
899                 throw new RuntimeException JavaDoc("BMP output botch");
900             if (isJavaMetaChar((char)lower))
901                 buf.append('\\');
902             buf.append((char)lower);
903             buf.append('-');
904             if (upper < NONBMP_MIN) {
905                 if (isJavaMetaChar((char)upper))
906                     buf.append('\\');
907                 buf.append((char)upper);
908             } else
909                 buf.append('\uFFFF');
910         }
911
912         void addNonBmpRanges(List ranges) {
913             if (upper >= NONBMP_MIN)
914                 ranges.add(new Range(lower < NONBMP_MIN ? NONBMP_MIN : lower, upper));
915         }
916     }
917
918     static class Property extends SimpleCharClass {
919         private final String JavaDoc name;
920
921         Property(String JavaDoc name) {
922             super(SOME, NONE);
923             this.name = name;
924         }
925
926         void outputBmp(FastStringBuffer buf) {
927             inClassOutputBmp(buf);
928         }
929
930         void inClassOutputBmp(FastStringBuffer buf) {
931             buf.append("\\p{");
932             buf.append(name);
933             buf.append('}');
934         }
935
936         void outputComplementBmp(FastStringBuffer buf) {
937             buf.append("\\P{");
938             buf.append(name);
939             buf.append('}');
940         }
941     }
942
943     static class Subtraction extends CharClass {
944         private final CharClass cc1;
945         private final CharClass cc2;
946
947         Subtraction(CharClass cc1, CharClass cc2) {
948             // min corresponds to intersection
949
// complement corresponds to negation
950
super(Math.min(cc1.getContainsBmp(), -cc2.getContainsBmp()),
951                   Math.min(cc1.getContainsNonBmp(), -cc2.getContainsNonBmp()));
952             this.cc1 = cc1;
953             this.cc2 = cc2;
954         }
955
956         void outputBmp(FastStringBuffer buf) {
957             buf.append('[');
958             cc1.outputBmp(buf);
959             buf.append("&&");
960             cc2.outputComplementBmp(buf);
961             buf.append(']');
962         }
963
964         void outputComplementBmp(FastStringBuffer buf) {
965             buf.append('[');
966             cc1.outputComplementBmp(buf);
967             cc2.outputBmp(buf);
968             buf.append(']');
969         }
970
971         void addNonBmpRanges(List ranges) {
972             List posList = new Vector();
973             cc1.addNonBmpRanges(posList);
974             List negList = new Vector();
975             cc2.addNonBmpRanges(negList);
976             sortRangeList(posList);
977             sortRangeList(negList);
978             Iterator negIter = negList.iterator();
979             Range negRange;
980             if (negIter.hasNext())
981                 negRange = (Range)negIter.next();
982             else
983                 negRange = null;
984             for (int i = 0, len = posList.size(); i < len; i++) {
985                 Range posRange = (Range)posList.get(i);
986                 while (negRange != null && negRange.getMax() < posRange.getMin()) {
987                     if (negIter.hasNext())
988                         negRange = (Range)negIter.next();
989                     else
990                         negRange = null;
991                 }
992                 // if negRange != null, negRange.max >= posRange.min
993
int min = posRange.getMin();
994                 while (negRange != null && negRange.getMin() <= posRange.getMax()) {
995                     if (min < negRange.getMin()) {
996                         ranges.add(new Range(min, negRange.getMin() - 1));
997                     }
998                     min = negRange.getMax() + 1;
999                     if (min > posRange.getMax())
1000                        break;
1001                    if (negIter.hasNext())
1002                        negRange = (Range)negIter.next();
1003                    else
1004                        negRange = null;
1005                }
1006                if (min <= posRange.getMax())
1007                    ranges.add(new Range(min, posRange.getMax()));
1008            }
1009        }
1010    }
1011
1012    static class Union extends CharClass {
1013        private final List members;
1014
1015        Union(CharClass[] v) {
1016            this(toList(v));
1017        }
1018
1019        private static List toList(CharClass[] v) {
1020            List members = new Vector();
1021            for (int i = 0; i < v.length; i++)
1022                members.add(v[i]);
1023            return members;
1024        }
1025
1026        Union(List members) {
1027            super(computeContainsBmp(members), computeContainsNonBmp(members));
1028            this.members = members;
1029        }
1030
1031        void outputBmp(FastStringBuffer buf) {
1032            buf.append('[');
1033            for (int i = 0, len = members.size(); i < len; i++) {
1034                CharClass cc = (CharClass)members.get(i);
1035                if (cc.getContainsBmp() != NONE) {
1036                    if (cc instanceof SimpleCharClass)
1037                        ((SimpleCharClass)cc).inClassOutputBmp(buf);
1038                    else
1039                        cc.outputBmp(buf);
1040                }
1041            }
1042            buf.append(']');
1043        }
1044
1045        void outputComplementBmp(FastStringBuffer buf) {
1046            boolean first = true;
1047            int len = members.size();
1048            for (int i = 0; i < len; i++) {
1049                CharClass cc = (CharClass)members.get(i);
1050                if (cc.getContainsBmp() != NONE && cc instanceof SimpleCharClass) {
1051                    if (first) {
1052                        buf.append("[^");
1053                        first = false;
1054                    }
1055                    ((SimpleCharClass)cc).inClassOutputBmp(buf);
1056                }
1057            }
1058            for (int i = 0; i < len; i++) {
1059                CharClass cc = (CharClass)members.get(i);
1060                if (cc.getContainsBmp() != NONE && !(cc instanceof SimpleCharClass)) {
1061                    if (first) {
1062                        buf.append('[');
1063                        first = false;
1064                    } else
1065                        buf.append("&&");
1066                    // can't have any members that are ALL, because that would make this ALL, which violates
1067
// the precondition for outputComplementBmp
1068
cc.outputComplementBmp(buf);
1069                }
1070            }
1071            if (first == true)
1072            // all members are NONE, so this is NONE, so complement is everything
1073
buf.append("[\u0000-\uFFFF]");
1074            else
1075                buf.append(']');
1076        }
1077
1078        void addNonBmpRanges(List ranges) {
1079            for (int i = 0, len = members.size(); i < len; i++)
1080                ((CharClass)members.get(i)).addNonBmpRanges(ranges);
1081        }
1082
1083        private static int computeContainsBmp(List members) {
1084            int ret = NONE;
1085            for (int i = 0, len = members.size(); i < len; i++)
1086                ret = Math.max(ret, ((CharClass)members.get(i)).getContainsBmp());
1087            return ret;
1088        }
1089
1090        private static int computeContainsNonBmp(List members) {
1091            int ret = NONE;
1092            for (int i = 0, len = members.size(); i < len; i++)
1093                ret = Math.max(ret, ((CharClass)members.get(i)).getContainsNonBmp());
1094            return ret;
1095        }
1096    }
1097
1098    static class BackReference extends CharClass {
1099       private final int i;
1100
1101        BackReference(int i) {
1102            super(SOME, NONE);
1103            this.i = i;
1104        }
1105
1106        void outputBmp(FastStringBuffer buf) {
1107            inClassOutputBmp(buf);
1108        }
1109
1110        void outputComplementBmp(FastStringBuffer buf) {
1111            inClassOutputBmp(buf);
1112        }
1113
1114        void inClassOutputBmp(FastStringBuffer buf) {
1115            buf.append("\\" + i);
1116        }
1117    }
1118
1119    /**
1120     * Thrown when an syntactically incorrect regular expression is detected.
1121     */

1122    public static class RegexSyntaxException extends Exception JavaDoc {
1123        private final int position;
1124
1125        /**
1126         * Represents an unknown position within a string containing a regular expression.
1127         */

1128        public static final int UNKNOWN_POSITION = -1;
1129
1130        public RegexSyntaxException(String JavaDoc detail) {
1131            this(detail, UNKNOWN_POSITION);
1132        }
1133
1134        public RegexSyntaxException(String JavaDoc detail, int position) {
1135            super(detail);
1136            this.position = position;
1137        }
1138
1139        /**
1140         * Returns the index into the regular expression where the error was detected
1141         * or <code>UNKNOWN_POSITION</code> if this is unknown.
1142         *
1143         * @return the index into the regular expression where the error was detected,
1144         * or <code>UNKNOWNN_POSITION</code> if this is unknown
1145         */

1146        public int getPosition() {
1147            return position;
1148        }
1149    }
1150
1151// public static class Localizer {
1152
// private final Class cls;
1153
// private ResourceBundle bundle;
1154
//
1155
// public Localizer(Class cls) {
1156
// this.cls = cls;
1157
// }
1158
//
1159
// public String message(String key) {
1160
// return MessageFormat.format(getBundle().getString(key), new Object[]{});
1161
// }
1162
//
1163
// public String message(String key, Object arg) {
1164
// return MessageFormat.format(getBundle().getString(key),
1165
// new Object[]{arg});
1166
// }
1167
//
1168
// public String message(String key, Object arg1, Object arg2) {
1169
// return MessageFormat.format(getBundle().getString(key),
1170
// new Object[]{arg1, arg2});
1171
// }
1172
//
1173
// public String message(String key, Object[] args) {
1174
// return MessageFormat.format(getBundle().getString(key), args);
1175
// }
1176
//
1177
// private ResourceBundle getBundle() {
1178
// if (bundle == null) {
1179
// String s = cls.getName();
1180
// int i = s.lastIndexOf('.');
1181
// if (i > 0)
1182
// s = s.substring(0, i + 1);
1183
// else
1184
// s = "";
1185
// bundle = ResourceBundle.getBundle(s + "resources.Messages");
1186
// }
1187
// return bundle;
1188
// }
1189
// }
1190

1191    static class Complement extends CharClass {
1192        private final CharClass cc;
1193
1194        Complement(CharClass cc) {
1195            super(-cc.getContainsBmp(), -cc.getContainsNonBmp());
1196            this.cc = cc;
1197        }
1198
1199        void outputBmp(FastStringBuffer buf) {
1200            cc.outputComplementBmp(buf);
1201        }
1202
1203        void outputComplementBmp(FastStringBuffer buf) {
1204            cc.outputBmp(buf);
1205        }
1206
1207        void addNonBmpRanges(List ranges) {
1208            List tem = new Vector();
1209            cc.addNonBmpRanges(tem);
1210            sortRangeList(tem);
1211            int c = NONBMP_MIN;
1212            for (int i = 0, len = tem.size(); i < len; i++) {
1213                Range r = (Range)tem.get(i);
1214                if (r.getMin() > c)
1215                    ranges.add(new Range(c, r.getMin() - 1));
1216                c = r.getMax() + 1;
1217            }
1218            if (c != NONBMP_MAX + 1)
1219                ranges.add(new Range(c, NONBMP_MAX));
1220        }
1221    }
1222
1223    private boolean translateAtom() throws RegexSyntaxException {
1224        switch (curChar) {
1225        case EOS:
1226            if (!eos)
1227                break;
1228            // fall through
1229
case '?':
1230        case '*':
1231        case '+':
1232        case ')':
1233        case '{':
1234        case '}':
1235        case '|':
1236        case ']':
1237            return false;
1238        case '(':
1239            copyCurChar();
1240            translateRegExp();
1241            expect(')');
1242            copyCurChar();
1243            return true;
1244        case '\\':
1245            advance();
1246            parseEsc().output(result);
1247            return true;
1248        case '[':
1249            advance();
1250            parseCharClassExpr().output(result);
1251            return true;
1252        case '.':
1253            if (isXPath) {
1254                // Note: "." matches a surrogate pair under JDK 1.5, but not under JDK 1.4
1255
// We'll live with this problem until 1.4 goes away...
1256
break;
1257            } else {
1258                DOT.output(result);
1259                advance();
1260                return true;
1261            }
1262        case '$':
1263        case '^':
1264            if (isXPath) {
1265                copyCurChar();
1266                return true;
1267            }
1268            result.append('\\');
1269            break;
1270        }
1271        copyCurChar();
1272        return true;
1273    }
1274
1275
1276    static private CharClass makeCharClass(String JavaDoc categories, String JavaDoc includes, String JavaDoc excludeRanges) {
1277        List includeList = new Vector();
1278        for (int i = 0, len = categories.length(); i < len; i += 2)
1279            includeList.add(new Property(categories.substring(i, i + 2)));
1280        for (int i = 0, len = includes.length(); i < len; i++) {
1281            int j = i + 1;
1282            for (; j < len && includes.charAt(j) - includes.charAt(i) == j - i; j++)
1283                ;
1284            --j;
1285            if (i == j - 1)
1286                --j;
1287            if (i == j)
1288                includeList.add(new SingleChar(includes.charAt(i)));
1289            else
1290                includeList.add(new CharRange(includes.charAt(i), includes.charAt(j)));
1291            i = j;
1292        }
1293        List excludeList = new Vector();
1294        for (int i = 0, len = excludeRanges.length(); i < len; i += 2) {
1295            char min = excludeRanges.charAt(i);
1296            char max = excludeRanges.charAt(i + 1);
1297            if (min == max)
1298                excludeList.add(new SingleChar(min));
1299            else if (min == max - 1) {
1300                excludeList.add(new SingleChar(min));
1301                excludeList.add(new SingleChar(max));
1302            } else
1303                excludeList.add(new CharRange(min, max));
1304        }
1305        return new Subtraction(new Union(includeList), new Union(excludeList));
1306    }
1307
1308    private CharClass parseEsc() throws RegexSyntaxException {
1309        switch (curChar) {
1310        case 'n':
1311            advance();
1312            return new SingleChar('\n');
1313        case 'r':
1314            advance();
1315            return new SingleChar('\r');
1316        case 't':
1317            advance();
1318            return new SingleChar('\t');
1319        case '\\':
1320        case '|':
1321        case '.':
1322        case '-':
1323        case '^':
1324        case '?':
1325        case '*':
1326        case '+':
1327        case '(':
1328        case ')':
1329        case '{':
1330        case '}':
1331        case '[':
1332        case ']':
1333            break;
1334        case 's':
1335            advance();
1336            return ESC_s;
1337        case 'S':
1338            advance();
1339            return ESC_S;
1340        case 'i':
1341            advance();
1342            return ESC_i;
1343        case 'I':
1344            advance();
1345            return ESC_I;
1346        case 'c':
1347            advance();
1348            return ESC_c;
1349        case 'C':
1350            advance();
1351            return ESC_C;
1352        case 'd':
1353            advance();
1354            return ESC_d;
1355        case 'D':
1356            advance();
1357            return ESC_D;
1358        case 'w':
1359            advance();
1360            return ESC_w;
1361        case 'W':
1362            advance();
1363            return ESC_W;
1364        case 'p':
1365            advance();
1366            return parseProp();
1367        case 'P':
1368            advance();
1369            return new Complement(parseProp());
1370        case '0': case '1': case '2': case '3': case '4':
1371        case '5': case '6': case '7': case '8': case '9':
1372            if (isXPath) {
1373                char c = curChar;
1374                advance();
1375                return new BackReference(c - '0');
1376            } else {
1377                throw makeException("digit not allowed after \\");
1378            }
1379        case '$':
1380            if (isXPath) {
1381                break;
1382            }
1383            // otherwise fall through
1384
default:
1385            throw makeException("invalid escape sequence");
1386        }
1387        CharClass tem = new SingleChar(curChar);
1388        advance();
1389        return tem;
1390    }
1391
1392    private CharClass parseProp() throws RegexSyntaxException {
1393        expect('{');
1394        int start = pos;
1395        for (; ;) {
1396            advance();
1397            if (curChar == '}')
1398                break;
1399            if (!isAsciiAlnum(curChar) && curChar != '-')
1400                expect('}');
1401        }
1402        String JavaDoc propertyName = regExp.subSequence(start, pos - 1).toString();
1403        advance();
1404        switch (propertyName.length()) {
1405        case 0:
1406            throw makeException("empty property name");
1407        case 2:
1408            int sci = subCategories.indexOf(propertyName);
1409            if (sci < 0 || sci % 2 == 1)
1410                throw makeException("unknown category");
1411            return getSubCategoryCharClass(sci / 2);
1412        case 1:
1413            int ci = categories.indexOf(propertyName.charAt(0));
1414            if (ci < 0)
1415                throw makeException("unknown category", propertyName);
1416            return getCategoryCharClass(ci);
1417        default:
1418            if (!propertyName.startsWith("Is"))
1419                break;
1420            String JavaDoc blockName = propertyName.substring(2);
1421            for (int i = 0; i < specialBlockNames.length; i++)
1422                if (blockName.equals(specialBlockNames[i]))
1423                    return specialBlockCharClasses[i];
1424            if (!isBlock(blockName))
1425                throw makeException("invalid block name", blockName);
1426            return new Property("In" + blockName);
1427        }
1428        throw makeException("invalid property name", propertyName);
1429    }
1430
1431    private static boolean isBlock(String JavaDoc name) {
1432        for (int i = 0; i < blockNames.length; i++)
1433            if (name.equals(blockNames[i]))
1434                return true;
1435        return false;
1436    }
1437
1438    private static boolean isAsciiAlnum(char c) {
1439        if ('a' <= c && c <= 'z')
1440            return true;
1441        if ('A' <= c && c <= 'Z')
1442            return true;
1443        if ('0' <= c && c <= '9')
1444            return true;
1445        return false;
1446    }
1447
1448    private void expect(char c) throws RegexSyntaxException {
1449        if (curChar != c)
1450            throw makeException("expected", new String JavaDoc(new char[]{c}));
1451    }
1452
1453    private CharClass parseCharClassExpr() throws RegexSyntaxException {
1454        boolean compl;
1455        if (curChar == '^') {
1456            advance();
1457            compl = true;
1458        } else
1459            compl = false;
1460        List members = new ArrayList(10);
1461        boolean first = true;
1462        do {
1463            CharClass lower = parseCharClassEscOrXmlChar(first);
1464            first = false;
1465            members.add(lower);
1466            if (curChar == '-') {
1467                advance();
1468                if (curChar == ']') { // MHK: [+-] is reallowed by Schema Oct 2004 2nd edition
1469
break;
1470                }
1471                if (curChar == '[') {
1472                    break;
1473                }
1474                CharClass upper = parseCharClassEscOrXmlChar(first);
1475                if (lower.getSingleChar() < 0 || upper.getSingleChar() < 0)
1476                    throw makeException("multi_range");
1477                if (lower.getSingleChar() > upper.getSingleChar())
1478                    throw makeException("invalid range (start > end)");
1479                members.set(members.size() - 1,
1480                            new CharRange(lower.getSingleChar(), upper.getSingleChar()));
1481                if (curChar == '-') {
1482                    advance();
1483                    expect('[');
1484                    break;
1485                }
1486            }
1487        } while (curChar != ']');
1488        CharClass result;
1489        if (members.size() == 1)
1490            result = (CharClass)members.get(0);
1491        else
1492            result = new Union(members);
1493        if (compl)
1494            result = new Complement(result);
1495        if (curChar == '[') {
1496            advance();
1497            result = new Subtraction(result, parseCharClassExpr());
1498            expect(']');
1499        }
1500        advance();
1501        return result;
1502    }
1503
1504    private CharClass parseCharClassEscOrXmlChar(boolean first) throws RegexSyntaxException {
1505        switch (curChar) {
1506        case EOS:
1507            if (eos)
1508                expect(']');
1509            break;
1510        case '\\':
1511            advance();
1512            return parseEsc();
1513        case '[':
1514        case ']':
1515            throw makeException("character must be escaped", new String JavaDoc(new char[]{curChar}));
1516        case '-':
1517            if (!first) {
1518                throw makeException("character must be escaped", new String JavaDoc(new char[]{curChar}));
1519            }
1520            break;
1521        }
1522        CharClass tem;
1523        if (XMLChar.isSurrogate(curChar)) {
1524            if (!XMLChar.isHighSurrogate(curChar))
1525                throw makeException("invalid surrogate pair");
1526            char c1 = curChar;
1527            advance();
1528            if (!XMLChar.isLowSurrogate(curChar))
1529                throw makeException("invalid surrogate pair");
1530            tem = new WideSingleChar(XMLChar.supplemental(c1, curChar));
1531        } else
1532            tem = new SingleChar(curChar);
1533        advance();
1534        return tem;
1535    }
1536
1537    private RegexSyntaxException makeException(String JavaDoc key) {
1538        return new RegexSyntaxException("Error at character " + (pos - 1) +
1539                                        " in regular expression: " + key);
1540    }
1541
1542    private RegexSyntaxException makeException(String JavaDoc key, String JavaDoc arg) {
1543        return new RegexSyntaxException("Error at character " + (pos - 1) +
1544                                        " in regular expression: " + key +
1545                                        " (" + arg + ')');
1546    }
1547
1548    private static boolean isJavaMetaChar(char c) {
1549        switch (c) {
1550        case '\\':
1551        case '^':
1552        case '?':
1553        case '*':
1554        case '+':
1555        case '(':
1556        case ')':
1557        case '{':
1558        case '}':
1559        case '|':
1560        case '[':
1561        case ']':
1562        case '-':
1563        case '&':
1564        case '$':
1565        case '.':
1566            return true;
1567        }
1568        return false;
1569    }
1570
1571    private static synchronized CharClass getCategoryCharClass(int ci) {
1572        if (categoryCharClasses[ci] == null)
1573            categoryCharClasses[ci] = computeCategoryCharClass(categories.charAt(ci));
1574        return categoryCharClasses[ci];
1575    }
1576
1577    private static synchronized CharClass getSubCategoryCharClass(int sci) {
1578        if (subCategoryCharClasses[sci] == null)
1579            subCategoryCharClasses[sci] = computeSubCategoryCharClass(subCategories.substring(sci * 2, (sci + 1) * 2));
1580        return subCategoryCharClasses[sci];
1581    }
1582
1583    private static final char UNICODE_3_1_ADD_Lu = '\u03F4'; // added in Unicode 3.1; explicitly appended to the Lu and L classes built below
1584
private static final char UNICODE_3_1_ADD_Ll = '\u03F5'; // added in Unicode 3.1; explicitly appended to the Ll and L classes built below
1585
// 3 characters changed from No to Nl between Unicode 3.0 and 3.1; the range below is added to Nl and subtracted from No
1586
private static final char UNICODE_3_1_CHANGE_No_to_Nl_MIN = '\u16EE';
1587    private static final char UNICODE_3_1_CHANGE_No_to_Nl_MAX = '\u16F0';
1588    private static final String JavaDoc CATEGORY_Pi = "\u00AB\u2018\u201B\u201C\u201F\u2039"; // Java doesn't know about category Pi, so its members are listed explicitly
1589
private static final String JavaDoc CATEGORY_Pf = "\u00BB\u2019\u201D\u203A"; // Java doesn't know about category Pf, so its members are listed explicitly
1590

1591    private static CharClass computeCategoryCharClass(char code) {
1592        List classes = new Vector();
1593        classes.add(new Property(new String JavaDoc(new char[]{code})));
1594        for (int ci = CATEGORY_NAMES.indexOf(code); ci >= 0; ci = CATEGORY_NAMES.indexOf(code, ci + 1)) {
1595            int[] addRanges = CATEGORY_RANGES[ci / 2];
1596            for (int i = 0; i < addRanges.length; i += 2)
1597                classes.add(new CharRange(addRanges[i], addRanges[i + 1]));
1598        }
1599        if (code == 'P')
1600            classes.add(makeCharClass(CATEGORY_Pi + CATEGORY_Pf));
1601        if (code == 'L') {
1602            classes.add(new SingleChar(UNICODE_3_1_ADD_Ll));
1603            classes.add(new SingleChar(UNICODE_3_1_ADD_Lu));
1604        }
1605        if (code == 'C') {
1606            // JDK 1.4 leaves Cn out of C?
1607
classes.add(new Subtraction(new Property("Cn"),
1608                                        new Union(new CharClass[]{new SingleChar(UNICODE_3_1_ADD_Lu),
1609                                                                  new SingleChar(UNICODE_3_1_ADD_Ll)})));
1610            List assignedRanges = new Vector();
1611            for (int i = 0; i < CATEGORY_RANGES.length; i++)
1612                for (int j = 0; j < CATEGORY_RANGES[i].length; j += 2)
1613                    assignedRanges.add(new CharRange(CATEGORY_RANGES[i][j],
1614                                                     CATEGORY_RANGES[i][j + 1]));
1615            classes.add(new Subtraction(new CharRange(NONBMP_MIN, NONBMP_MAX),
1616                                        new Union(assignedRanges)));
1617        }
1618        if (classes.size() == 1)
1619            return (CharClass)classes.get(0);
1620        return new Union(classes);
1621    }
1622
1623    private static CharClass computeSubCategoryCharClass(String JavaDoc name) {
1624        CharClass base = new Property(name);
1625        int sci = CATEGORY_NAMES.indexOf(name);
1626        if (sci < 0) {
1627            if (name.equals("Cn")) {
1628                // Unassigned
1629
List assignedRanges = new Vector();
1630                assignedRanges.add(new SingleChar(UNICODE_3_1_ADD_Lu));
1631                assignedRanges.add(new SingleChar(UNICODE_3_1_ADD_Ll));
1632                for (int i = 0; i < CATEGORY_RANGES.length; i++)
1633                    for (int j = 0; j < CATEGORY_RANGES[i].length; j += 2)
1634                        assignedRanges.add(new CharRange(CATEGORY_RANGES[i][j],
1635                                                         CATEGORY_RANGES[i][j + 1]));
1636                return new Subtraction(new Union(new CharClass[]{base, new CharRange(NONBMP_MIN, NONBMP_MAX)}),
1637                                       new Union(assignedRanges));
1638            }
1639            if (name.equals("Pi"))
1640                return makeCharClass(CATEGORY_Pi);
1641            if (name.equals("Pf"))
1642                return makeCharClass(CATEGORY_Pf);
1643            return base;
1644        }
1645        List classes = new Vector();
1646        classes.add(base);
1647        int[] addRanges = CATEGORY_RANGES[sci / 2];
1648        for (int i = 0; i < addRanges.length; i += 2)
1649            classes.add(new CharRange(addRanges[i], addRanges[i + 1]));
1650        if (name.equals("Lu"))
1651            classes.add(new SingleChar(UNICODE_3_1_ADD_Lu));
1652        else if (name.equals("Ll"))
1653            classes.add(new SingleChar(UNICODE_3_1_ADD_Ll));
1654        else if (name.equals("Nl"))
1655            classes.add(new CharRange(UNICODE_3_1_CHANGE_No_to_Nl_MIN, UNICODE_3_1_CHANGE_No_to_Nl_MAX));
1656        else if (name.equals("No"))
1657            return new Subtraction(new Union(classes),
1658                                   new CharRange(UNICODE_3_1_CHANGE_No_to_Nl_MIN,
1659                                                 UNICODE_3_1_CHANGE_No_to_Nl_MAX));
1660        return new Union(classes);
1661    }
1662
1663    private static CharClass makeCharClass(String JavaDoc members) {
1664        List list = new Vector();
1665        for (int i = 0, len = members.length(); i < len; i++)
1666            list.add(new SingleChar(members.charAt(i)));
1667        return new Union(list);
1668    }
1669
1670    public static void main(String JavaDoc[] args) throws RegexSyntaxException {
1671        String JavaDoc s = translate(args[0], args[1].equals("xpath"));
1672        for (int i = 0, len = s.length(); i < len; i++) {
1673            char c = s.charAt(i);
1674            if (c >= 0x20 && c <= 0x7e)
1675                System.err.print(c);
1676            else {
1677                System.err.print("\\u");
1678                for (int shift = 12; shift >= 0; shift -= 4)
1679                    System.err.print("0123456789ABCDEF".charAt((c >> shift) & 0xF));
1680            }
1681        }
1682        System.err.println();
1683    }
1684
1685
1686//}
1687

1688
1689}
1690
1691//
1692
// The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License");
1693
// you may not use this file except in compliance with the License. You may obtain a copy of the
1694
// License at http://www.mozilla.org/MPL/
1695
//
1696
// Software distributed under the License is distributed on an "AS IS" basis,
1697
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
1698
// See the License for the specific language governing rights and limitations under the License.
1699
//
1700
// The Original Code is: all this file except changes marked.
1701
//
1702
// The Initial Developer of the Original Code is James Clark
1703
//
1704
// Portions created by (your name) are Copyright (C) (your legal entity). All Rights Reserved.
1705
//
1706
// Contributor(s): Michael Kay
1707
//
1708

1709
Popular Tags