KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > thaiopensource > datatype > xsd > regex > jdk1_4 > Translator


1 package com.thaiopensource.datatype.xsd.regex.jdk1_4;
2
import com.thaiopensource.datatype.xsd.regex.RegexSyntaxException;
import com.thaiopensource.util.Localizer;
import com.thaiopensource.util.Utf16;

import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Vector;
12
13 /**
14  * Translates XML Schema regexes into <code>java.util.regex</code> regexes.
15  *
16  * @see java.util.regex.Pattern
17  * @see <a HREF="http://www.w3.org/TR/xmlschema-2/#regexs">XML Schema Part 2</a>
18  */

19 public class Translator {
20   private final String JavaDoc regExp;
21   private int pos = 0;
22   private final int length;
23   private char curChar;
24   private boolean eos = false;
25   private final StringBuffer JavaDoc result = new StringBuffer JavaDoc();
26
27   static private final String JavaDoc categories = "LMNPZSC";
28   static private final CharClass[] categoryCharClasses = new CharClass[categories.length()];
29   static private final String JavaDoc subCategories = "LuLlLtLmLoMnMcMeNdNlNoPcPdPsPePiPfPoZsZlZpSmScSkSoCcCfCoCn";
30   static private final CharClass[] subCategoryCharClasses = new CharClass[subCategories.length() / 2];
31
32   static private final int NONBMP_MIN = 0x10000;
33   static private final int NONBMP_MAX = 0x10FFFF;
34   static private final char SURROGATE2_MIN = '\uDC00';
35   static private final char SURROGATE2_MAX = '\uDFFF';
36
37   static final Localizer localizer = new Localizer(Translator.class);
38
39   static private final String JavaDoc[] blockNames = {
40     "BasicLatin",
41     "Latin-1Supplement",
42     "LatinExtended-A",
43     "LatinExtended-B",
44     "IPAExtensions",
45     "SpacingModifierLetters",
46     "CombiningDiacriticalMarks",
47     "Greek",
48     "Cyrillic",
49     "Armenian",
50     "Hebrew",
51     "Arabic",
52     "Syriac",
53     "Thaana",
54     "Devanagari",
55     "Bengali",
56     "Gurmukhi",
57     "Gujarati",
58     "Oriya",
59     "Tamil",
60     "Telugu",
61     "Kannada",
62     "Malayalam",
63     "Sinhala",
64     "Thai",
65     "Lao",
66     "Tibetan",
67     "Myanmar",
68     "Georgian",
69     "HangulJamo",
70     "Ethiopic",
71     "Cherokee",
72     "UnifiedCanadianAboriginalSyllabics",
73     "Ogham",
74     "Runic",
75     "Khmer",
76     "Mongolian",
77     "LatinExtendedAdditional",
78     "GreekExtended",
79     "GeneralPunctuation",
80     "SuperscriptsandSubscripts",
81     "CurrencySymbols",
82     "CombiningMarksforSymbols",
83     "LetterlikeSymbols",
84     "NumberForms",
85     "Arrows",
86     "MathematicalOperators",
87     "MiscellaneousTechnical",
88     "ControlPictures",
89     "OpticalCharacterRecognition",
90     "EnclosedAlphanumerics",
91     "BoxDrawing",
92     "BlockElements",
93     "GeometricShapes",
94     "MiscellaneousSymbols",
95     "Dingbats",
96     "BraillePatterns",
97     "CJKRadicalsSupplement",
98     "KangxiRadicals",
99     "IdeographicDescriptionCharacters",
100     "CJKSymbolsandPunctuation",
101     "Hiragana",
102     "Katakana",
103     "Bopomofo",
104     "HangulCompatibilityJamo",
105     "Kanbun",
106     "BopomofoExtended",
107     "EnclosedCJKLettersandMonths",
108     "CJKCompatibility",
109     "CJKUnifiedIdeographsExtensionA",
110     "CJKUnifiedIdeographs",
111     "YiSyllables",
112     "YiRadicals",
113     "HangulSyllables",
114     // surrogates excluded because there are never any *characters* with codes in surrogate range
115
// "PrivateUse", excluded because 3.1 adds non-BMP ranges
116
"CJKCompatibilityIdeographs",
117     "AlphabeticPresentationForms",
118     "ArabicPresentationForms-A",
119     "CombiningHalfMarks",
120     "CJKCompatibilityForms",
121     "SmallFormVariants",
122     "ArabicPresentationForms-B",
123     "Specials",
124     "HalfwidthandFullwidthForms",
125     "Specials"
126   };
127
128
129   /**
130    * Names of blocks including ranges outside the BMP.
131    */

132   static private final String JavaDoc[] specialBlockNames = {
133     "OldItalic",
134     "Gothic",
135     "Deseret",
136     "ByzantineMusicalSymbols",
137     "MusicalSymbols",
138     "MathematicalAlphanumericSymbols",
139     "CJKUnifiedIdeographsExtensionB",
140     "CJKCompatibilityIdeographsSupplement",
141     "Tags",
142     "PrivateUse",
143     "HighSurrogates",
144     "HighPrivateUseSurrogates",
145     "LowSurrogates",
146   };
147
148   /**
149    * CharClass for each block name in specialBlockNames.
150    */

151   static private final CharClass[] specialBlockCharClasses = {
152     new CharRange(0x10300, 0x1032F),
153     new CharRange(0x10330, 0x1034F),
154     new CharRange(0x10400, 0x1044F),
155     new CharRange(0x1D000, 0x1D0FF),
156     new CharRange(0x1D100, 0x1D1FF),
157     new CharRange(0x1D400, 0x1D7FF),
158     new CharRange(0x20000, 0x2A6D6),
159     new CharRange(0x2F800, 0x2FA1F),
160     new CharRange(0xE0000, 0xE007F),
161     new Union(new CharClass[] {
162       new CharRange(0xE000, 0xF8FF),
163       new CharRange(0xF0000, 0xFFFFD),
164       new CharRange(0x100000, 0x10FFFD)
165     }),
166     Empty.getInstance(),
167     Empty.getInstance(),
168     Empty.getInstance()
169   };
170
171   static private final CharClass DOT = new Complement(new Union(new CharClass[] { new SingleChar('\n'), new SingleChar('\r') }));
172
173   static private final CharClass ESC_d = new Property("Nd");
174
175   static private final CharClass ESC_D = new Complement(ESC_d);
176
177   static private final CharClass ESC_W = new Union(new CharClass[] {new Property("P"), new Property("Z"), new Property("C")});
178
179   static private final CharClass ESC_w = new Complement(ESC_W);
180
181   static private final CharClass ESC_s = new Union(new CharClass[] {
182     new SingleChar(' '),
183     new SingleChar('\n'),
184     new SingleChar('\r'),
185     new SingleChar('\t')
186   });
187
188   static private final CharClass ESC_S = new Complement(ESC_s);
189
190   static private final CharClass ESC_i = makeCharClass(NamingExceptions.NMSTRT_CATEGORIES,
191                                                        NamingExceptions.NMSTRT_INCLUDES,
192                                                        NamingExceptions.NMSTRT_EXCLUDE_RANGES);
193
194   static private final CharClass ESC_I = new Complement(ESC_i);
195
196   static private final CharClass ESC_c = makeCharClass(NamingExceptions.NMCHAR_CATEGORIES,
197                                                        NamingExceptions.NMCHAR_INCLUDES,
198                                                        NamingExceptions.NMCHAR_EXCLUDE_RANGES);
199
200   static private final CharClass ESC_C = new Complement(ESC_c);
201
202   static private final char EOS = '\0';
203
/**
 * Creates a translator for one expression and loads its first character
 * into <code>curChar</code>.
 *
 * @param regExp the regular expression to translate
 */
private Translator(String regExp) {
  this.regExp = regExp;
  this.length = regExp.length();
  advance();
}
209
210   /**
211    * Translates a regular expression in the syntax of XML Schemas Part 2 into a regular
212    * expression in the syntax of <code>java.util.regex.Pattern</code>. The translation
213    * assumes that the string to be matched against the regex uses surrogate pairs correctly.
214    * If the string comes from XML content, a conforming XML parser will automatically
215    * check this; if the string comes from elsewhere, it may be necessary to check
216    * surrogate usage before matching.
217    *
218    * @param regexp a String containing a regular expression in the syntax of XML Schemas Part 2
219    * @return a String containing a regular expression in the syntax of java.util.regex.Pattern
220    * @throws RegexSyntaxException if <code>regexp</code> is not a regular expression in the
221    * syntax of XML Schemas Part 2
222    * @see java.util.regex.Pattern
223    * @see <a HREF="http://www.w3.org/TR/xmlschema-2/#regexs">XML Schema Part 2</a>
224    */

225   static public String JavaDoc translate(String JavaDoc regexp) throws RegexSyntaxException {
226     Translator tr = new Translator(regexp);
227     tr.translateTop();
228     return tr.result.toString();
229   }
230
231   private void advance() {
232     if (pos < length)
233       curChar = regExp.charAt(pos++);
234     else {
235       pos++;
236       curChar = EOS;
237       eos = true;
238     }
239   }
240
241   private void translateTop() throws RegexSyntaxException {
242     translateRegExp();
243     if (!eos)
244       throw makeException("expected_eos");
245   }
246
247   private void translateRegExp() throws RegexSyntaxException {
248     translateBranch();
249     while (curChar == '|') {
250       copyCurChar();
251       translateBranch();
252     }
253   }
254
255   private void translateBranch() throws RegexSyntaxException {
256     while (translateAtom())
257       translateQuantifier();
258   }
259
260   private void translateQuantifier() throws RegexSyntaxException {
261     switch (curChar) {
262     case '*':
263     case '?':
264     case '+':
265       copyCurChar();
266       return;
267     case '{':
268       copyCurChar();
269       translateQuantity();
270       expect('}');
271       copyCurChar();
272     }
273   }
274
275   private void translateQuantity() throws RegexSyntaxException {
276     String JavaDoc lower = parseQuantExact();
277     int lowerValue = -1;
278     try {
279       lowerValue = Integer.parseInt(lower);
280       result.append(lower);
281     }
282     catch (NumberFormatException JavaDoc e) {
283       // JDK 1.4 cannot handle ranges bigger than this
284
result.append(Integer.MAX_VALUE);
285     }
286     if (curChar == ',') {
287       copyCurChar();
288       if (curChar != '}') {
289         String JavaDoc upper = parseQuantExact();
290         try {
291           int upperValue = Integer.parseInt(upper);
292           result.append(upper);
293           if (lowerValue < 0 || upperValue < lowerValue)
294             throw makeException("invalid_quantity_range");
295         }
296         catch (NumberFormatException JavaDoc e) {
297           result.append(Integer.MAX_VALUE);
298           if (lowerValue < 0 && new BigDecimal JavaDoc(lower).compareTo(new BigDecimal JavaDoc(upper)) > 0)
299             throw makeException("invalid_quantity_range");
300         }
301       }
302     }
303   }
304
305   private String JavaDoc parseQuantExact() throws RegexSyntaxException {
306     StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
307     do {
308       if ("0123456789".indexOf(curChar) < 0)
309         throw makeException("expected_digit");
310       buf.append(curChar);
311       advance();
312     } while (curChar != ',' && curChar != '}');
313     return buf.toString();
314   }
315
316   private void copyCurChar() {
317     result.append(curChar);
318     advance();
319   }
320
321   static final int NONE = -1;
322   static final int SOME = 0;
323   static final int ALL = 1;
324
325   static final String JavaDoc SURROGATES1_CLASS = "[\uD800-\uDBFF]";
326   static final String JavaDoc SURROGATES2_CLASS = "[\uDC00-\uDFFF]";
327   static final String JavaDoc NOT_ALLOWED_CLASS = "[\u0000&&[^\u0000]]";
328
329   static final class Range implements Comparable JavaDoc {
330     private final int min;
331     private final int max;
332
333     Range(int min, int max) {
334       this.min = min;
335       this.max = max;
336     }
337
338     int getMin() {
339       return min;
340     }
341
342     int getMax() {
343       return max;
344     }
345
346     public int compareTo(Object JavaDoc o) {
347       Range other = (Range)o;
348       if (this.min < other.min)
349         return -1;
350       if (this.min > other.min)
351         return 1;
352       if (this.max > other.max)
353         return -1;
354       if (this.max < other.max)
355         return 1;
356       return 0;
357     }
358   }
359
360   static abstract class CharClass {
361
362     private final int containsBmp;
363     // if it contains ALL and containsBmp != NONE, then the generated class for containsBmp must
364
// contain all the high surrogates
365
private final int containsNonBmp;
366
367     protected CharClass(int containsBmp, int containsNonBmp) {
368       this.containsBmp = containsBmp;
369       this.containsNonBmp = containsNonBmp;
370     }
371
372     int getContainsBmp() {
373       return containsBmp;
374     }
375
376     int getContainsNonBmp() {
377       return containsNonBmp;
378     }
379
380     final void output(StringBuffer JavaDoc buf) {
381       switch (containsNonBmp) {
382       case NONE:
383         if (containsBmp == NONE)
384           buf.append(NOT_ALLOWED_CLASS);
385         else
386           outputBmp(buf);
387         break;
388       case ALL:
389         buf.append('(');
390         if (containsBmp == NONE) {
391           buf.append(SURROGATES1_CLASS);
392           buf.append(SURROGATES2_CLASS);
393         }
394         else {
395           outputBmp(buf);
396           buf.append(SURROGATES2_CLASS);
397           buf.append('?');
398         }
399         buf.append(')');
400         break;
401       case SOME:
402         buf.append('(');
403         boolean needSep = false;
404         if (containsBmp != NONE) {
405           needSep = true;
406           outputBmp(buf);
407         }
408         List JavaDoc ranges = new Vector JavaDoc();
409         addNonBmpRanges(ranges);
410         sortRangeList(ranges);
411         String JavaDoc hi = highSurrogateRanges(ranges);
412         if (hi.length() > 0) {
413           if (needSep)
414             buf.append('|');
415           else
416             needSep = true;
417           buf.append('[');
418           for (int i = 0, len = hi.length(); i < len; i += 2) {
419             char min = hi.charAt(i);
420             char max = hi.charAt(i + 1);
421             if (min == max)
422               buf.append(min);
423             else {
424               buf.append(min);
425               buf.append('-');
426               buf.append(max);
427             }
428           }
429           buf.append(']');
430           buf.append(SURROGATES2_CLASS);
431         }
432         String JavaDoc lo = lowSurrogateRanges(ranges);
433         for (int i = 0, len = lo.length(); i < len; i += 3) {
434           if (needSep)
435             buf.append('|');
436           else
437             needSep = true;
438           buf.append(lo.charAt(i));
439           char min = lo.charAt(i + 1);
440           char max = lo.charAt(i + 2);
441           if (min == max && (i + 3 >= len || lo.charAt(i + 3) != lo.charAt(i)))
442             buf.append(min);
443           else {
444             buf.append('[');
445             for (;;) {
446               if (min == max)
447                 buf.append(min);
448               else {
449                 buf.append(min);
450                 buf.append('-');
451                 buf.append(max);
452               }
453               if (i + 3 >= len || lo.charAt(i + 3) != lo.charAt(i))
454                 break;
455               i += 3;
456               min = lo.charAt(i + 1);
457               max = lo.charAt(i + 2);
458             }
459             buf.append(']');
460           }
461         }
462         if (!needSep)
463           buf.append(NOT_ALLOWED_CLASS);
464         buf.append(')');
465         break;
466       }
467     }
468
469     static String JavaDoc highSurrogateRanges(List JavaDoc ranges) {
470       StringBuffer JavaDoc highRanges = new StringBuffer JavaDoc();
471       for (int i = 0, len = ranges.size(); i < len; i++) {
472         Range r = (Range)ranges.get(i);
473         char min1 = Utf16.surrogate1(r.getMin());
474         char min2 = Utf16.surrogate2(r.getMin());
475         char max1 = Utf16.surrogate1(r.getMax());
476         char max2 = Utf16.surrogate2(r.getMax());
477         if (min2 != SURROGATE2_MIN)
478           min1++;
479         if (max2 != SURROGATE2_MAX)
480           max1--;
481         if (max1 >= min1) {
482           highRanges.append(min1);
483           highRanges.append(max1);
484         }
485       }
486       return highRanges.toString();
487     }
488
489     static String JavaDoc lowSurrogateRanges(List JavaDoc ranges) {
490       StringBuffer JavaDoc lowRanges = new StringBuffer JavaDoc();
491       for (int i = 0, len = ranges.size(); i < len; i++) {
492         Range r = (Range)ranges.get(i);
493         char min1 = Utf16.surrogate1(r.getMin());
494         char min2 = Utf16.surrogate2(r.getMin());
495         char max1 = Utf16.surrogate1(r.getMax());
496         char max2 = Utf16.surrogate2(r.getMax());
497         if (min1 == max1) {
498           if (min2 != SURROGATE2_MIN || max2 != SURROGATE2_MAX) {
499             lowRanges.append(min1);
500             lowRanges.append(min2);
501             lowRanges.append(max2);
502           }
503         }
504         else {
505           if (min2 != SURROGATE2_MIN) {
506             lowRanges.append(min1);
507             lowRanges.append(min2);
508             lowRanges.append(SURROGATE2_MAX);
509           }
510           if (max2 != SURROGATE2_MAX) {
511             lowRanges.append(max1);
512             lowRanges.append(SURROGATE2_MIN);
513             lowRanges.append(max2);
514           }
515         }
516       }
517       return lowRanges.toString();
518     }
519
520     abstract void outputBmp(StringBuffer JavaDoc buf);
521     abstract void outputComplementBmp(StringBuffer JavaDoc buf);
522
523     int singleChar() {
524       return -1;
525     }
526
527     void addNonBmpRanges(List JavaDoc ranges) {
528     }
529
530
531     static void sortRangeList(List JavaDoc ranges) {
532       Collections.sort(ranges);
533       int toIndex = 0;
534       int fromIndex = 0;
535       int len = ranges.size();
536       while (fromIndex < len) {
537         Range r = (Range)ranges.get(fromIndex);
538         int min = r.getMin();
539         int max = r.getMax();
540         while (++fromIndex < len) {
541           Range r2 = (Range)ranges.get(fromIndex);
542           if (r2.getMin() > max + 1)
543             break;
544           if (r2.getMax() > max)
545             max = r2.getMax();
546         }
547         if (max != r.getMax())
548           r = new Range(min, max);
549         ranges.set(toIndex++, r);
550       }
551       while (len > toIndex)
552         ranges.remove(--len);
553     }
554
555   }
556
557   static abstract class SimpleCharClass extends CharClass {
558     SimpleCharClass(int containsBmp, int containsNonBmp) {
559       super(containsBmp, containsNonBmp);
560     }
561
562     void outputBmp(StringBuffer JavaDoc buf) {
563       buf.append('[');
564       inClassOutputBmp(buf);
565       buf.append(']');
566     }
567
568     // must not call if containsBmp == ALL
569
void outputComplementBmp(StringBuffer JavaDoc buf) {
570       if (getContainsBmp() == NONE)
571         buf.append("[\u0000-\uFFFF]");
572       else {
573         buf.append("[^");
574         inClassOutputBmp(buf);
575         buf.append(']');
576       }
577     }
578     abstract void inClassOutputBmp(StringBuffer JavaDoc buf);
579   }
580
581   static class SingleChar extends SimpleCharClass {
582     private final char c;
583     SingleChar(char c) {
584       super(SOME, NONE);
585       this.c = c;
586     }
587
588     int singleChar() {
589       return c;
590     }
591
592     void outputBmp(StringBuffer JavaDoc buf) {
593       inClassOutputBmp(buf);
594     }
595
596     void inClassOutputBmp(StringBuffer JavaDoc buf) {
597       if (isJavaMetaChar(c))
598         buf.append('\\');
599       buf.append(c);
600     }
601
602   }
603
604   static class WideSingleChar extends SimpleCharClass {
605     private final int c;
606
607     WideSingleChar(int c) {
608       super(NONE, SOME);
609       this.c = c;
610     }
611
612     void inClassOutputBmp(StringBuffer JavaDoc buf) {
613       throw new RuntimeException JavaDoc("BMP output botch");
614     }
615
616     int singleChar() {
617       return c;
618     }
619
620     void addNonBmpRanges(List JavaDoc ranges) {
621       ranges.add(new Range(c, c));
622     }
623   }
624
625   static class Empty extends SimpleCharClass {
626     static private final Empty instance = new Empty();
627     private Empty() {
628       super(NONE, NONE);
629     }
630
631     static Empty getInstance() {
632       return instance;
633     }
634
635     void inClassOutputBmp(StringBuffer JavaDoc buf) {
636       throw new RuntimeException JavaDoc("BMP output botch");
637     }
638
639   }
640
641   static class CharRange extends SimpleCharClass {
642     private final int lower;
643     private final int upper;
644
645     CharRange(int lower, int upper) {
646       super(lower < NONBMP_MIN ? SOME : NONE,
647             // don't use ALL here, because that requires that the BMP class contains high surrogates
648
upper >= NONBMP_MIN ? SOME : NONE);
649       this.lower = lower;
650       this.upper = upper;
651     }
652
653     void inClassOutputBmp(StringBuffer JavaDoc buf) {
654       if (lower >= NONBMP_MIN)
655         throw new RuntimeException JavaDoc("BMP output botch");
656       if (isJavaMetaChar((char)lower))
657         buf.append('\\');
658       buf.append((char)lower);
659       buf.append('-');
660       if (upper < NONBMP_MIN) {
661         if (isJavaMetaChar((char)upper))
662           buf.append('\\');
663         buf.append((char)upper);
664       }
665       else
666         buf.append('\uFFFF');
667     }
668
669     void addNonBmpRanges(List JavaDoc ranges) {
670       if (upper >= NONBMP_MIN)
671         ranges.add(new Range(lower < NONBMP_MIN ? NONBMP_MIN : lower, upper));
672     }
673   }
674
675   static class Property extends SimpleCharClass {
676     private final String JavaDoc name;
677
678     Property(String JavaDoc name) {
679       super(SOME, NONE);
680       this.name = name;
681     }
682
683     void outputBmp(StringBuffer JavaDoc buf) {
684       inClassOutputBmp(buf);
685     }
686
687     void inClassOutputBmp(StringBuffer JavaDoc buf) {
688       buf.append("\\p{");
689       buf.append(name);
690       buf.append('}');
691     }
692
693     void outputComplementBmp(StringBuffer JavaDoc buf) {
694       buf.append("\\P{");
695       buf.append(name);
696       buf.append('}');
697     }
698   }
699
700   static class Subtraction extends CharClass {
701     private final CharClass cc1;
702     private final CharClass cc2;
703     Subtraction(CharClass cc1, CharClass cc2) {
704       // min corresponds to intersection
705
// complement corresponds to negation
706
super(Math.min(cc1.getContainsBmp(), -cc2.getContainsBmp()),
707             Math.min(cc1.getContainsNonBmp(), -cc2.getContainsNonBmp()));
708       this.cc1 = cc1;
709       this.cc2 = cc2;
710     }
711
712     void outputBmp(StringBuffer JavaDoc buf) {
713       buf.append('[');
714       cc1.outputBmp(buf);
715       buf.append("&&");
716       cc2.outputComplementBmp(buf);
717       buf.append(']');
718     }
719
720     void outputComplementBmp(StringBuffer JavaDoc buf) {
721       buf.append('[');
722       cc1.outputComplementBmp(buf);
723       cc2.outputBmp(buf);
724       buf.append(']');
725     }
726
727     void addNonBmpRanges(List JavaDoc ranges) {
728       List JavaDoc posList = new Vector JavaDoc();
729       cc1.addNonBmpRanges(posList);
730       List JavaDoc negList = new Vector JavaDoc();
731       cc2.addNonBmpRanges(negList);
732       sortRangeList(posList);
733       sortRangeList(negList);
734       Iterator JavaDoc negIter = negList.iterator();
735       Range negRange;
736       if (negIter.hasNext())
737         negRange = (Range)negIter.next();
738       else
739         negRange = null;
740       for (int i = 0, len = posList.size(); i < len; i++) {
741         Range posRange = (Range)posList.get(i);
742         while (negRange != null && negRange.getMax() < posRange.getMin()) {
743           if (negIter.hasNext())
744             negRange = (Range)negIter.next();
745           else
746             negRange = null;
747         }
748         // if negRange != null, negRange.max >= posRange.min
749
int min = posRange.getMin();
750         while (negRange != null && negRange.getMin() <= posRange.getMax()) {
751           if (min < negRange.getMin()) {
752             ranges.add(new Range(min, negRange.getMin() - 1));
753           }
754           min = negRange.getMax() + 1;
755           if (min > posRange.getMax())
756             break;
757           if (negIter.hasNext())
758             negRange = (Range)negIter.next();
759           else
760             negRange = null;
761         }
762         if (min <= posRange.getMax())
763           ranges.add(new Range(min, posRange.getMax()));
764       }
765     }
766   }
767
768   static class Union extends CharClass {
769     private final List JavaDoc members;
770
771     Union(CharClass[] v) {
772       this(toList(v));
773     }
774
775     static private List JavaDoc toList(CharClass[] v) {
776       List JavaDoc members = new Vector JavaDoc();
777       for (int i = 0; i < v.length; i++)
778         members.add(v[i]);
779       return members;
780     }
781
782     Union(List JavaDoc members) {
783       super(computeContainsBmp(members), computeContainsNonBmp(members));
784       this.members = members;
785     }
786
787     void outputBmp(StringBuffer JavaDoc buf) {
788       buf.append('[');
789       for (int i = 0, len = members.size(); i < len; i++) {
790         CharClass cc = (CharClass)members.get(i);
791         if (cc.getContainsBmp() != NONE) {
792           if (cc instanceof SimpleCharClass)
793             ((SimpleCharClass)cc).inClassOutputBmp(buf);
794           else
795             cc.outputBmp(buf);
796         }
797       }
798       buf.append(']');
799     }
800
801     void outputComplementBmp(StringBuffer JavaDoc buf) {
802       boolean first = true;
803       int len = members.size();
804       for (int i = 0; i < len; i++) {
805         CharClass cc = (CharClass)members.get(i);
806         if (cc.getContainsBmp() != NONE && cc instanceof SimpleCharClass) {
807           if (first) {
808             buf.append("[^");
809             first = false;
810           }
811           ((SimpleCharClass)cc).inClassOutputBmp(buf);
812         }
813       }
814       for (int i = 0; i < len; i++) {
815         CharClass cc = (CharClass)members.get(i);
816         if (cc.getContainsBmp() != NONE && !(cc instanceof SimpleCharClass)) {
817           if (first) {
818             buf.append('[');
819             first = false;
820           }
821           else
822             buf.append("&&");
823           // can't have any members that are ALL, because that would make this ALL, which violates
824
// the precondition for outputComplementBmp
825
cc.outputComplementBmp(buf);
826         }
827       }
828       if (first == true)
829         // all members are NONE, so this is NONE, so complement is everything
830
buf.append("[\u0000-\uFFFF]");
831       else
832         buf.append(']');
833     }
834
835     void addNonBmpRanges(List JavaDoc ranges) {
836       for (int i = 0, len = members.size(); i < len; i++)
837         ((CharClass)members.get(i)).addNonBmpRanges(ranges);
838     }
839
840     private static int computeContainsBmp(List JavaDoc members) {
841       int ret = NONE;
842       for (int i = 0, len = members.size(); i < len; i++)
843         ret = Math.max(ret, ((CharClass)members.get(i)).getContainsBmp());
844       return ret;
845     }
846
847     private static int computeContainsNonBmp(List JavaDoc members) {
848       int ret = NONE;
849       for (int i = 0, len = members.size(); i < len; i++)
850         ret = Math.max(ret, ((CharClass)members.get(i)).getContainsNonBmp());
851       return ret;
852     }
853   }
854
855   static class Complement extends CharClass {
856     private final CharClass cc;
857     Complement(CharClass cc) {
858       super(-cc.getContainsBmp(), -cc.getContainsNonBmp());
859       this.cc = cc;
860     }
861
862     void outputBmp(StringBuffer JavaDoc buf) {
863       cc.outputComplementBmp(buf);
864     }
865
866     void outputComplementBmp(StringBuffer JavaDoc buf) {
867       cc.outputBmp(buf);
868     }
869
870     void addNonBmpRanges(List JavaDoc ranges) {
871       List JavaDoc tem = new Vector JavaDoc();
872       cc.addNonBmpRanges(tem);
873       sortRangeList(tem);
874       int c = NONBMP_MIN;
875       for (int i = 0, len = tem.size(); i < len; i++) {
876         Range r = (Range)tem.get(i);
877         if (r.getMin() > c)
878           ranges.add(new Range(c, r.getMin() - 1));
879         c = r.getMax() + 1;
880       }
881       if (c != NONBMP_MAX + 1)
882         ranges.add(new Range(c, NONBMP_MAX));
883     }
884   }
885
886   private boolean translateAtom() throws RegexSyntaxException {
887     switch (curChar) {
888     case EOS:
889       if (!eos)
890         break;
891       // fall through
892
case '?':
893     case '*':
894     case '+':
895     case ')':
896     case '{':
897     case '}':
898     case '|':
899     case ']':
900       return false;
901     case '(':
902       copyCurChar();
903       translateRegExp();
904       expect(')');
905       copyCurChar();
906       return true;
907     case '\\':
908       advance();
909       parseEsc().output(result);
910       return true;
911     case '[':
912       advance();
913       parseCharClassExpr().output(result);
914       return true;
915     case '.':
916       DOT.output(result);
917       advance();
918       return true;
919     case '$':
920     case '^':
921       result.append('\\');
922       break;
923     }
924     copyCurChar();
925     return true;
926   }
927
928
929   static private CharClass makeCharClass(String JavaDoc categories, String JavaDoc includes, String JavaDoc excludeRanges) {
930     List JavaDoc includeList = new Vector JavaDoc();
931     for (int i = 0, len = categories.length(); i < len; i += 2)
932       includeList.add(new Property(categories.substring(i, i + 2)));
933     for (int i = 0, len = includes.length(); i < len; i++) {
934       int j = i + 1;
935       for (; j < len && includes.charAt(j) - includes.charAt(i) == j - i; j++)
936         ;
937       --j;
938       if (i == j - 1)
939         --j;
940       if (i == j)
941         includeList.add(new SingleChar(includes.charAt(i)));
942       else
943         includeList.add(new CharRange(includes.charAt(i), includes.charAt(j)));
944       i = j;
945     }
946     List JavaDoc excludeList = new Vector JavaDoc();
947     for (int i = 0, len = excludeRanges.length(); i < len; i += 2) {
948       char min = excludeRanges.charAt(i);
949       char max = excludeRanges.charAt(i + 1);
950       if (min == max)
951         excludeList.add(new SingleChar(min));
952       else if (min == max - 1) {
953         excludeList.add(new SingleChar(min));
954         excludeList.add(new SingleChar(max));
955       }
956       else
957         excludeList.add(new CharRange(min, max));
958     }
959     return new Subtraction(new Union(includeList), new Union(excludeList));
960   }
961
962   private CharClass parseEsc() throws RegexSyntaxException {
963     switch (curChar) {
964     case 'n':
965       advance();
966       return new SingleChar('\n');
967     case 'r':
968       advance();
969       return new SingleChar('\r');
970     case 't':
971       advance();
972       return new SingleChar('\t');
973     case '\\':
974     case '|':
975     case '.':
976     case '-':
977     case '^':
978     case '?':
979     case '*':
980     case '+':
981     case '(':
982     case ')':
983     case '{':
984     case '}':
985     case '[':
986     case ']':
987       break;
988     case 's':
989       advance();
990       return ESC_s;
991     case 'S':
992       advance();
993       return ESC_S;
994     case 'i':
995       advance();
996       return ESC_i;
997     case 'I':
998       advance();
999       return ESC_I;
1000    case 'c':
1001      advance();
1002      return ESC_c;
1003    case 'C':
1004      advance();
1005      return ESC_C;
1006    case 'd':
1007      advance();
1008      return ESC_d;
1009    case 'D':
1010      advance();
1011      return ESC_D;
1012    case 'w':
1013      advance();
1014      return ESC_w;
1015    case 'W':
1016      advance();
1017      return ESC_W;
1018    case 'p':
1019      advance();
1020      return parseProp();
1021    case 'P':
1022      advance();
1023      return new Complement(parseProp());
1024    default:
1025      throw makeException("bad_escape");
1026    }
1027    CharClass tem = new SingleChar(curChar);
1028    advance();
1029    return tem;
1030  }
1031
1032  private CharClass parseProp() throws RegexSyntaxException {
1033    expect('{');
1034    int start = pos;
1035    for (;;) {
1036      advance();
1037      if (curChar == '}')
1038        break;
1039      if (!isAsciiAlnum(curChar) && curChar != '-')
1040        expect('}');
1041    }
1042    String JavaDoc propertyName = regExp.substring(start, pos - 1);
1043    advance();
1044    switch (propertyName.length()) {
1045    case 0:
1046      throw makeException("empty_property_name");
1047    case 2:
1048      int sci = subCategories.indexOf(propertyName);
1049      if (sci < 0 || sci % 2 == 1)
1050        throw makeException("bad_category");
1051      return getSubCategoryCharClass(sci / 2);
1052    case 1:
1053      int ci = categories.indexOf(propertyName.charAt(0));
1054      if (ci < 0)
1055        throw makeException("bad_category", propertyName);
1056      return getCategoryCharClass(ci);
1057    default:
1058      if (!propertyName.startsWith("Is"))
1059        break;
1060      String JavaDoc blockName = propertyName.substring(2);
1061      for (int i = 0; i < specialBlockNames.length; i++)
1062        if (blockName.equals(specialBlockNames[i]))
1063          return specialBlockCharClasses[i];
1064      if (!isBlock(blockName))
1065        throw makeException("bad_block_name", blockName);
1066      return new Property( "In" + blockName);
1067    }
1068    throw makeException("bad_property_name", propertyName);
1069  }
1070
1071  static private boolean isBlock(String JavaDoc name) {
1072    for (int i = 0; i < blockNames.length; i++)
1073      if (name.equals(blockNames[i]))
1074        return true;
1075    return false;
1076  }
1077
1078  static private boolean isAsciiAlnum(char c) {
1079    if ('a' <= c && c <= 'z')
1080      return true;
1081    if ('A' <= c && c <= 'Z')
1082      return true;
1083    if ('0' <= c && c <= '9')
1084      return true;
1085    return false;
1086  }
1087
1088  private void expect(char c) throws RegexSyntaxException {
1089    if (curChar != c)
1090      throw makeException("expected", new String JavaDoc(new char[]{c}));
1091  }
1092
1093  private CharClass parseCharClassExpr() throws RegexSyntaxException {
1094    boolean compl;
1095    if (curChar == '^') {
1096      advance();
1097      compl = true;
1098    }
1099    else
1100      compl = false;
1101    List JavaDoc members = new Vector JavaDoc();
1102    do {
1103      CharClass lower = parseCharClassEscOrXmlChar();
1104      members.add(lower);
1105      if (curChar == '-') {
1106        advance();
1107        if (curChar == '[')
1108          break;
1109        CharClass upper = parseCharClassEscOrXmlChar();
1110        if (lower.singleChar() < 0 || upper.singleChar() < 0)
1111          throw makeException("multi_range");
1112        if (lower.singleChar() > upper.singleChar())
1113          throw makeException("invalid_range");
1114        members.set(members.size() - 1,
1115                    new CharRange(lower.singleChar(), upper.singleChar()));
1116        if (curChar == '-') {
1117          advance();
1118          expect('[');
1119          break;
1120        }
1121      }
1122    } while (curChar != ']');
1123    CharClass result;
1124    if (members.size() == 1)
1125      result = (CharClass)members.get(0);
1126    else
1127      result = new Union(members);
1128    if (compl)
1129      result = new Complement(result);
1130    if (curChar == '[') {
1131      advance();
1132      result = new Subtraction(result, parseCharClassExpr());
1133      expect(']');
1134    }
1135    advance();
1136    return result;
1137  }
1138
1139  private CharClass parseCharClassEscOrXmlChar() throws RegexSyntaxException {
1140    switch (curChar) {
1141    case EOS:
1142      if (eos)
1143        expect(']');
1144      break;
1145    case '\\':
1146      advance();
1147      return parseEsc();
1148    case '[':
1149    case ']':
1150    case '-':
1151      throw makeException("should_quote", new String JavaDoc(new char[]{curChar}));
1152    }
1153    CharClass tem;
1154    if (Utf16.isSurrogate(curChar)) {
1155      if (!Utf16.isSurrogate1(curChar))
1156        throw makeException("invalid_surrogate");
1157      char c1 = curChar;
1158      advance();
1159      if (!Utf16.isSurrogate2(curChar))
1160        throw makeException("invalid_surrogate");
1161      tem = new WideSingleChar(Utf16.scalarValue(c1, curChar));
1162    }
1163    else
1164      tem = new SingleChar(curChar);
1165    advance();
1166    return tem;
1167  }
1168
1169  private RegexSyntaxException makeException(String JavaDoc key) {
1170    return new RegexSyntaxException(localizer.message(key), pos - 1);
1171  }
1172
1173  private RegexSyntaxException makeException(String JavaDoc key, String JavaDoc arg) {
1174    return new RegexSyntaxException(localizer.message(key, arg), pos - 1);
1175  }
1176
1177  static private boolean isJavaMetaChar(char c) {
1178    switch (c) {
1179    case '\\':
1180    case '^':
1181    case '?':
1182    case '*':
1183    case '+':
1184    case '(':
1185    case ')':
1186    case '{':
1187    case '}':
1188    case '|':
1189    case '[':
1190    case ']':
1191    case '-':
1192    case '&':
1193    case '$':
1194    case '.':
1195      return true;
1196    }
1197    return false;
1198  }
1199
1200  static private synchronized CharClass getCategoryCharClass(int ci) {
1201    if (categoryCharClasses[ci] == null)
1202      categoryCharClasses[ci] = computeCategoryCharClass(categories.charAt(ci));
1203    return categoryCharClasses[ci];
1204  }
1205
1206  static private synchronized CharClass getSubCategoryCharClass(int sci) {
1207    if (subCategoryCharClasses[sci] == null)
1208      subCategoryCharClasses[sci] = computeSubCategoryCharClass(subCategories.substring(sci * 2, (sci + 1) * 2));
1209    return subCategoryCharClasses[sci];
1210  }
1211
1212  static private final char UNICODE_3_1_ADD_Lu = '\u03F4'; // added in 3.1
1213
static private final char UNICODE_3_1_ADD_Ll = '\u03F5'; // added in 3.1
1214
// 3 characters changed from No to Nl between 3.0 and 3.1
1215
static private final char UNICODE_3_1_CHANGE_No_to_Nl_MIN = '\u16EE';
1216  static private final char UNICODE_3_1_CHANGE_No_to_Nl_MAX = '\u16F0';
1217  static private final String JavaDoc CATEGORY_Pi = "\u00AB\u2018\u201B\u201C\u201F\u2039"; // Java doesn't know about category Pi
1218
static private final String JavaDoc CATEGORY_Pf = "\u00BB\u2019\u201D\u203A"; // Java doesn't know about category Pf
1219

1220  static private CharClass computeCategoryCharClass(char code) {
1221    List JavaDoc classes = new Vector JavaDoc();
1222    classes.add(new Property(new String JavaDoc(new char[] { code })));
1223    for (int ci = Categories.CATEGORY_NAMES.indexOf(code); ci >= 0; ci = Categories.CATEGORY_NAMES.indexOf(code, ci + 1)) {
1224      int[] addRanges = Categories.CATEGORY_RANGES[ci/2];
1225      for (int i = 0; i < addRanges.length; i += 2)
1226        classes.add(new CharRange(addRanges[i], addRanges[i + 1]));
1227    }
1228    if (code == 'P')
1229      classes.add(makeCharClass(CATEGORY_Pi + CATEGORY_Pf));
1230    if (code == 'L') {
1231      classes.add(new SingleChar(UNICODE_3_1_ADD_Ll));
1232      classes.add(new SingleChar(UNICODE_3_1_ADD_Lu));
1233    }
1234    if (code == 'C') {
1235      // JDK 1.4 leaves Cn out of C?
1236
classes.add(new Subtraction(new Property("Cn"),
1237                                  new Union(new CharClass[] { new SingleChar(UNICODE_3_1_ADD_Lu),
1238                                                              new SingleChar(UNICODE_3_1_ADD_Ll) })));
1239      List JavaDoc assignedRanges = new Vector JavaDoc();
1240      for (int i = 0; i < Categories.CATEGORY_RANGES.length; i++)
1241        for (int j = 0; j < Categories.CATEGORY_RANGES[i].length; j += 2)
1242          assignedRanges.add(new CharRange(Categories.CATEGORY_RANGES[i][j],
1243                                           Categories.CATEGORY_RANGES[i][j + 1]));
1244      classes.add(new Subtraction(new CharRange(NONBMP_MIN, NONBMP_MAX),
1245                                  new Union(assignedRanges)));
1246    }
1247    if (classes.size() == 1)
1248      return (CharClass)classes.get(0);
1249    return new Union(classes);
1250  }
1251
1252  static private CharClass computeSubCategoryCharClass(String JavaDoc name) {
1253    CharClass base = new Property(name);
1254    int sci = Categories.CATEGORY_NAMES.indexOf(name);
1255    if (sci < 0) {
1256      if (name.equals("Cn")) {
1257        // Unassigned
1258
List JavaDoc assignedRanges = new Vector JavaDoc();
1259        assignedRanges.add(new SingleChar(UNICODE_3_1_ADD_Lu));
1260        assignedRanges.add(new SingleChar(UNICODE_3_1_ADD_Ll));
1261        for (int i = 0; i < Categories.CATEGORY_RANGES.length; i++)
1262          for (int j = 0; j < Categories.CATEGORY_RANGES[i].length; j += 2)
1263            assignedRanges.add(new CharRange(Categories.CATEGORY_RANGES[i][j],
1264                                             Categories.CATEGORY_RANGES[i][j + 1]));
1265        return new Subtraction(new Union(new CharClass[] { base, new CharRange(NONBMP_MIN, NONBMP_MAX) }),
1266                               new Union(assignedRanges));
1267      }
1268      if (name.equals("Pi"))
1269        return makeCharClass(CATEGORY_Pi);
1270      if (name.equals("Pf"))
1271        return makeCharClass(CATEGORY_Pf);
1272      return base;
1273    }
1274    List JavaDoc classes = new Vector JavaDoc();
1275    classes.add(base);
1276    int[] addRanges = Categories.CATEGORY_RANGES[sci/2];
1277    for (int i = 0; i < addRanges.length; i += 2)
1278      classes.add(new CharRange(addRanges[i], addRanges[i + 1]));
1279    if (name.equals("Lu"))
1280      classes.add(new SingleChar(UNICODE_3_1_ADD_Lu));
1281    else if (name.equals("Ll"))
1282      classes.add(new SingleChar(UNICODE_3_1_ADD_Ll));
1283    else if (name.equals("Nl"))
1284      classes.add(new CharRange(UNICODE_3_1_CHANGE_No_to_Nl_MIN, UNICODE_3_1_CHANGE_No_to_Nl_MAX));
1285    else if (name.equals("No"))
1286      return new Subtraction(new Union(classes),
1287                             new CharRange(UNICODE_3_1_CHANGE_No_to_Nl_MIN,
1288                                           UNICODE_3_1_CHANGE_No_to_Nl_MAX));
1289    return new Union(classes);
1290  }
1291
1292  private static CharClass makeCharClass(String JavaDoc members) {
1293    List JavaDoc list = new Vector JavaDoc();
1294    for (int i = 0, len = members.length(); i < len; i++)
1295      list.add(new SingleChar(members.charAt(i)));
1296    return new Union(list);
1297  }
1298
1299  public static void main(String JavaDoc[] args) throws RegexSyntaxException {
1300    String JavaDoc s = translate(args[0]);
1301    for (int i = 0, len = s.length(); i < len; i++) {
1302      char c = s.charAt(i);
1303      if (c >= 0x20 && c <= 0x7e)
1304        System.err.print(c);
1305      else {
1306        System.err.print("\\u");
1307        for (int shift = 12; shift >= 0; shift -= 4)
1308          System.err.print("0123456789ABCDEF".charAt((c >> shift) & 0xF));
1309      }
1310    }
1311    System.err.println();
1312  }
1313}
1314
Popular Tags