KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > gnu > regexp > RESyntax


/*
 * gnu/regexp/RESyntax.java
 * Copyright (C) 1998-2001 Wes Biggs
 *
 * This library is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

19
20 package gnu.regexp;
21 import java.io.Serializable JavaDoc;
22 import java.util.BitSet JavaDoc;
23
24 /**
25  * An RESyntax specifies the way a regular expression will be compiled.
26  * This class provides a number of predefined useful constants for
27  * emulating popular regular expression syntaxes. Additionally the
28  * user may construct his or her own syntax, using any combination of the
29  * syntax bit constants. The syntax is an optional argument to any of the
30  * matching methods on class RE.
31  *
32  * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
33  */

34
35 public final class RESyntax implements Serializable JavaDoc {
36     static final String JavaDoc DEFAULT_LINE_SEPARATOR = System.getProperty("line.separator");
37
38     private static final String JavaDoc SYNTAX_IS_FINAL = RE.getLocalizedMessage("syntax.final");
39
40     private BitSet JavaDoc bits;
41
42     // true for the constant defined syntaxes
43
private boolean isFinal = false;
44
45     private String JavaDoc lineSeparator = DEFAULT_LINE_SEPARATOR;
46
47   // Values for constants are bit indexes
48

49   /**
50    * Syntax bit. Backslash is an escape character in lists.
51    */

52   public static final int RE_BACKSLASH_ESCAPE_IN_LISTS = 0;
53
54   /**
55    * Syntax bit. Use \? instead of ? and \+ instead of +.
56    */

57   public static final int RE_BK_PLUS_QM = 1;
58
59   /**
60    * Syntax bit. POSIX character classes ([:...:]) in lists are allowed.
61    */

62   public static final int RE_CHAR_CLASSES = 2;
63
64   /**
65    * Syntax bit. ^ and $ are special everywhere.
66    * <B>Not implemented.</B>
67    */

68   public static final int RE_CONTEXT_INDEP_ANCHORS = 3;
69
70   /**
71    * Syntax bit. Repetition operators are only special in valid positions.
72    * <B>Not implemented.</B>
73    */

74   public static final int RE_CONTEXT_INDEP_OPS = 4;
75
76   /**
77    * Syntax bit. Repetition and alternation operators are invalid
78    * at start and end of pattern and other places.
79    * <B>Not implemented</B>.
80    */

81   public static final int RE_CONTEXT_INVALID_OPS = 5;
82
83   /**
84    * Syntax bit. Match-any-character operator (.) matches a newline.
85    */

86   public static final int RE_DOT_NEWLINE = 6;
87
88   /**
89    * Syntax bit. Match-any-character operator (.) does not match a null.
90    */

91   public static final int RE_DOT_NOT_NULL = 7;
92
93   /**
94    * Syntax bit. Intervals ({x}, {x,}, {x,y}) are allowed.
95    */

96   public static final int RE_INTERVALS = 8;
97
98   /**
99    * Syntax bit. No alternation (|), match one-or-more (+), or
100    * match zero-or-one (?) operators.
101    */

102   public static final int RE_LIMITED_OPS = 9;
103
104   /**
105    * Syntax bit. Newline is an alternation operator.
106    */

107   public static final int RE_NEWLINE_ALT = 10; // impl.
108

109   /**
110    * Syntax bit. Intervals use { } instead of \{ \}
111    */

112   public static final int RE_NO_BK_BRACES = 11;
113
114   /**
115    * Syntax bit. Grouping uses ( ) instead of \( \).
116    */

117   public static final int RE_NO_BK_PARENS = 12;
118
119   /**
120    * Syntax bit. Backreferences not allowed.
121    */

122   public static final int RE_NO_BK_REFS = 13;
123
124   /**
125    * Syntax bit. Alternation uses | instead of \|
126    */

127   public static final int RE_NO_BK_VBAR = 14;
128
129   /**
130    * Syntax bit. <B>Not implemented</B>.
131    */

132   public static final int RE_NO_EMPTY_RANGES = 15;
133
134   /**
135    * Syntax bit. An unmatched right parenthesis (')' or '\)', depending
136    * on RE_NO_BK_PARENS) will throw an exception when compiling.
137    */

138   public static final int RE_UNMATCHED_RIGHT_PAREN_ORD = 16;
139
140   /**
141    * Syntax bit. <B>Not implemented.</B>
142    */

143   public static final int RE_HAT_LISTS_NOT_NEWLINE = 17;
144
145   /**
146    * Syntax bit. Stingy matching is allowed (+?, *?, ??, {x,y}?).
147    */

148   public static final int RE_STINGY_OPS = 18;
149
150   /**
151    * Syntax bit. Allow character class escapes (\d, \D, \s, \S, \w, \W).
152    */

153   public static final int RE_CHAR_CLASS_ESCAPES = 19;
154
155   /**
156    * Syntax bit. Allow use of (?:xxx) grouping (subexpression is not saved).
157    */

158   public static final int RE_PURE_GROUPING = 20;
159
160   /**
161    * Syntax bit. Allow use of (?=xxx) and (?!xxx) apply the subexpression
162    * to the text following the current position without consuming that text.
163    */

164   public static final int RE_LOOKAHEAD = 21;
165
166   /**
167    * Syntax bit. Allow beginning- and end-of-string anchors (\A, \Z).
168    */

169   public static final int RE_STRING_ANCHORS = 22;
170
171   /**
172    * Syntax bit. Allow embedded comments, (?#comment), as in Perl5.
173    */

174   public static final int RE_COMMENTS = 23;
175
176   /**
177    * Syntax bit. Allow character class escapes within lists, as in Perl5.
178    */

179   public static final int RE_CHAR_CLASS_ESC_IN_LISTS = 24;
180
181   private static final int BIT_TOTAL = 25;
182
183   /**
184    * Predefined syntax.
185    * Emulates regular expression support in the awk utility.
186    */

187   public static final RESyntax RE_SYNTAX_AWK;
188
189   /**
190    * Predefined syntax.
191    * Emulates regular expression support in the ed utility.
192    */

193   public static final RESyntax RE_SYNTAX_ED;
194
195   /**
196    * Predefined syntax.
197    * Emulates regular expression support in the egrep utility.
198    */

199   public static final RESyntax RE_SYNTAX_EGREP;
200
201   /**
202    * Predefined syntax.
203    * Emulates regular expression support in the GNU Emacs editor.
204    */

205   public static final RESyntax RE_SYNTAX_EMACS;
206
207   /**
208    * Predefined syntax.
209    * Emulates regular expression support in the grep utility.
210    */

211   public static final RESyntax RE_SYNTAX_GREP;
212
213   /**
214    * Predefined syntax.
215    * Emulates regular expression support in the POSIX awk specification.
216    */

217   public static final RESyntax RE_SYNTAX_POSIX_AWK;
218
219   /**
220    * Predefined syntax.
221    * Emulates POSIX basic regular expression support.
222    */

223   public static final RESyntax RE_SYNTAX_POSIX_BASIC;
224
225   /**
226    * Predefined syntax.
227    * Emulates regular expression support in the POSIX egrep specification.
228    */

229   public static final RESyntax RE_SYNTAX_POSIX_EGREP;
230
231   /**
232    * Predefined syntax.
233    * Emulates POSIX extended regular expression support.
234    */

235   public static final RESyntax RE_SYNTAX_POSIX_EXTENDED;
236
237   /**
238    * Predefined syntax.
239    * Emulates POSIX basic minimal regular expressions.
240    */

241   public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_BASIC;
242
243   /**
244    * Predefined syntax.
245    * Emulates POSIX extended minimal regular expressions.
246    */

247   public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_EXTENDED;
248
249   /**
250    * Predefined syntax.
251    * Emulates regular expression support in the sed utility.
252    */

253   public static final RESyntax RE_SYNTAX_SED;
254
255   /**
256    * Predefined syntax.
257    * Emulates regular expression support in Larry Wall's perl, version 4,
258    */

259   public static final RESyntax RE_SYNTAX_PERL4;
260
261   /**
262    * Predefined syntax.
263    * Emulates regular expression support in Larry Wall's perl, version 4,
264    * using single line mode (/s modifier).
265    */

266   public static final RESyntax RE_SYNTAX_PERL4_S; // single line mode (/s)
267

268   /**
269    * Predefined syntax.
270    * Emulates regular expression support in Larry Wall's perl, version 5.
271    */

272   public static final RESyntax RE_SYNTAX_PERL5;
273
274   /**
275    * Predefined syntax.
276    * Emulates regular expression support in Larry Wall's perl, version 5,
277    * using single line mode (/s modifier).
278    */

279   public static final RESyntax RE_SYNTAX_PERL5_S;
280   
281   static {
282       // Define syntaxes
283

284       RE_SYNTAX_EMACS = new RESyntax().makeFinal();
285       
286       RESyntax RE_SYNTAX_POSIX_COMMON = new RESyntax()
287       .set(RE_CHAR_CLASSES)
288       .set(RE_DOT_NEWLINE)
289       .set(RE_DOT_NOT_NULL)
290       .set(RE_INTERVALS)
291       .set(RE_NO_EMPTY_RANGES)
292       .makeFinal();
293       
294       RE_SYNTAX_POSIX_BASIC = new RESyntax(RE_SYNTAX_POSIX_COMMON)
295       .set(RE_BK_PLUS_QM)
296       .makeFinal();
297       
298       RE_SYNTAX_POSIX_EXTENDED = new RESyntax(RE_SYNTAX_POSIX_COMMON)
299       .set(RE_CONTEXT_INDEP_ANCHORS)
300       .set(RE_CONTEXT_INDEP_OPS)
301       .set(RE_NO_BK_BRACES)
302       .set(RE_NO_BK_PARENS)
303       .set(RE_NO_BK_VBAR)
304       .set(RE_UNMATCHED_RIGHT_PAREN_ORD)
305       .makeFinal();
306
307       RE_SYNTAX_AWK = new RESyntax()
308       .set(RE_BACKSLASH_ESCAPE_IN_LISTS)
309       .set(RE_DOT_NOT_NULL)
310       .set(RE_NO_BK_PARENS)
311       .set(RE_NO_BK_REFS)
312       .set(RE_NO_BK_VBAR)
313       .set(RE_NO_EMPTY_RANGES)
314       .set(RE_UNMATCHED_RIGHT_PAREN_ORD)
315       .makeFinal();
316       
317       RE_SYNTAX_POSIX_AWK = new RESyntax(RE_SYNTAX_POSIX_EXTENDED)
318       .set(RE_BACKSLASH_ESCAPE_IN_LISTS)
319       .makeFinal();
320       
321       RE_SYNTAX_GREP = new RESyntax()
322       .set(RE_BK_PLUS_QM)
323       .set(RE_CHAR_CLASSES)
324       .set(RE_HAT_LISTS_NOT_NEWLINE)
325       .set(RE_INTERVALS)
326       .set(RE_NEWLINE_ALT)
327       .makeFinal();
328       
329       RE_SYNTAX_EGREP = new RESyntax()
330       .set(RE_CHAR_CLASSES)
331       .set(RE_CONTEXT_INDEP_ANCHORS)
332       .set(RE_CONTEXT_INDEP_OPS)
333       .set(RE_HAT_LISTS_NOT_NEWLINE)
334       .set(RE_NEWLINE_ALT)
335       .set(RE_NO_BK_PARENS)
336       .set(RE_NO_BK_VBAR)
337       .makeFinal();
338     
339       RE_SYNTAX_POSIX_EGREP = new RESyntax(RE_SYNTAX_EGREP)
340       .set(RE_INTERVALS)
341       .set(RE_NO_BK_BRACES)
342       .makeFinal();
343     
344       /* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */
345     
346       RE_SYNTAX_ED = new RESyntax(RE_SYNTAX_POSIX_BASIC)
347       .makeFinal();
348     
349       RE_SYNTAX_SED = new RESyntax(RE_SYNTAX_POSIX_BASIC)
350       .makeFinal();
351       
352       RE_SYNTAX_POSIX_MINIMAL_BASIC = new RESyntax(RE_SYNTAX_POSIX_COMMON)
353       .set(RE_LIMITED_OPS)
354       .makeFinal();
355       
356       /* Differs from RE_SYNTAX_POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS
357      replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */

358       
359       RE_SYNTAX_POSIX_MINIMAL_EXTENDED = new RESyntax(RE_SYNTAX_POSIX_COMMON)
360       .set(RE_CONTEXT_INDEP_ANCHORS)
361       .set(RE_CONTEXT_INVALID_OPS)
362       .set(RE_NO_BK_BRACES)
363       .set(RE_NO_BK_PARENS)
364       .set(RE_NO_BK_REFS)
365       .set(RE_NO_BK_VBAR)
366       .set(RE_UNMATCHED_RIGHT_PAREN_ORD)
367       .makeFinal();
368       
369       /* There is no official Perl spec, but here's a "best guess" */
370       
371       RE_SYNTAX_PERL4 = new RESyntax()
372       .set(RE_BACKSLASH_ESCAPE_IN_LISTS)
373       .set(RE_CONTEXT_INDEP_ANCHORS)
374       .set(RE_CONTEXT_INDEP_OPS) // except for '{', apparently
375
.set(RE_INTERVALS)
376       .set(RE_NO_BK_BRACES)
377       .set(RE_NO_BK_PARENS)
378       .set(RE_NO_BK_VBAR)
379       .set(RE_NO_EMPTY_RANGES)
380       .set(RE_CHAR_CLASS_ESCAPES) // \d,\D,\w,\W,\s,\S
381
.makeFinal();
382       
383       RE_SYNTAX_PERL4_S = new RESyntax(RE_SYNTAX_PERL4)
384       .set(RE_DOT_NEWLINE)
385       .makeFinal();
386       
387       RE_SYNTAX_PERL5 = new RESyntax(RE_SYNTAX_PERL4)
388       .set(RE_PURE_GROUPING) // (?:)
389
.set(RE_STINGY_OPS) // *?,??,+?,{}?
390
.set(RE_LOOKAHEAD) // (?=)(?!)
391
.set(RE_STRING_ANCHORS) // \A,\Z
392
.set(RE_CHAR_CLASS_ESC_IN_LISTS)// \d,\D,\w,\W,\s,\S within []
393
.set(RE_COMMENTS) // (?#)
394
.makeFinal();
395       
396       RE_SYNTAX_PERL5_S = new RESyntax(RE_SYNTAX_PERL5)
397       .set(RE_DOT_NEWLINE)
398       .makeFinal();
399   }
400
401   /**
402    * Construct a new syntax object with all bits turned off.
403    * This is equivalent to RE_SYNTAX_EMACS.
404    */

405   public RESyntax() {
406     bits = new BitSet JavaDoc(BIT_TOTAL);
407   }
408
409     /**
410      * Called internally when constructing predefined syntaxes
411      * so their interpretation cannot vary. Conceivably useful
412      * for your syntaxes as well. Causes IllegalAccessError to
413      * be thrown if any attempt to modify the syntax is made.
414      *
415      * @return this object for convenient chaining
416      */

417     public RESyntax makeFinal() {
418     isFinal = true;
419     return this;
420     }
421
422   /**
423    * Construct a new syntax object with all bits set the same
424    * as the other syntax.
425    */

426   public RESyntax(RESyntax other) {
427     bits = (BitSet JavaDoc) other.bits.clone();
428   }
429
430   /**
431    * Check if a given bit is set in this syntax.
432    */

433   public boolean get(int index) {
434     return bits.get(index);
435   }
436
437   /**
438    * Set a given bit in this syntax.
439    *
440    * @param index the constant (RESyntax.RE_xxx) bit to set.
441    * @return a reference to this object for easy chaining.
442    */

443   public RESyntax set(int index) {
444       if (isFinal) throw new IllegalAccessError JavaDoc(SYNTAX_IS_FINAL);
445     bits.set(index);
446     return this;
447   }
448
449   /**
450    * Clear a given bit in this syntax.
451    *
452    * @param index the constant (RESyntax.RE_xxx) bit to clear.
453    * @return a reference to this object for easy chaining.
454    */

455   public RESyntax clear(int index) {
456       if (isFinal) throw new IllegalAccessError JavaDoc(SYNTAX_IS_FINAL);
457       bits.clear(index);
458       return this;
459   }
460
461     /**
462      * Changes the line separator string for regular expressions
463      * created using this RESyntax. The default separator is the
464      * value returned by the system property "line.separator", which
465      * should be correct when reading platform-specific files from a
466      * filesystem. However, many programs may collect input from
467      * sources where the line separator is differently specified (for
468      * example, in the applet environment, the text box widget
469      * interprets line breaks as single-character newlines,
470      * regardless of the host platform.
471      *
472      * Note that setting the line separator to a character or
473      * characters that have specific meaning within the current syntax
474      * can cause unexpected chronosynclastic infundibula.
475      *
476      * @return this object for convenient chaining
477      */

478     public RESyntax setLineSeparator(String JavaDoc aSeparator) {
479     if (isFinal) throw new IllegalAccessError JavaDoc(SYNTAX_IS_FINAL);
480     lineSeparator = aSeparator;
481     return this;
482     }
483
484     /**
485      * Returns the currently active line separator string. The default
486      * is the platform-dependent system property "line.separator".
487      */

488     public String JavaDoc getLineSeparator() {
489     return lineSeparator;
490     }
491 }
492
Popular Tags