KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > percederberg > grammatica > parser > re > CharacterSetElement


1 /*
2  * CharacterSetElement.java
3  *
4  * This library is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU Lesser General Public License
6  * as published by the Free Software Foundation; either version 2.1
7  * of the License, or (at your option) any later version.
8  *
9  * This library is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12  * Lesser General Public License for more details.
13  *
14  * You should have received a copy of the GNU Lesser General Public
15  * License along with this library; if not, write to the Free
16  * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
17  * MA 02111-1307, USA.
18  *
19  * Copyright (c) 2003-2005 Per Cederberg. All rights reserved.
20  */

21
22 package net.percederberg.grammatica.parser.re;
23
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;

import net.percederberg.grammatica.parser.LookAheadReader;
29
30 /**
31  * A regular expression character set element. This element matches a
32  * single character inside (or outside) a character set. The character
33  * set is user defined and may contain ranges of characters. The set
34  * may also be inverted, meaning that only characters not inside the
35  * set will be considered to match.
36  *
37  * @author Per Cederberg, <per at percederberg dot net>
38  * @version 1.5
39  */

40 class CharacterSetElement extends Element {
41
42     /**
43      * The dot ('.') character set. This element matches a single
44      * character that is not equal to a newline character.
45      */

46     public static final CharacterSetElement DOT =
47         new CharacterSetElement(false);
48
49     /**
50      * The digit character set. This element matches a single
51      * numeric character.
52      */

53     public static final CharacterSetElement DIGIT =
54         new CharacterSetElement(false);
55
56     /**
57      * The non-digit character set. This element matches a single
58      * non-numeric character.
59      */

60     public static final CharacterSetElement NON_DIGIT =
61         new CharacterSetElement(true);
62
63     /**
64      * The whitespace character set. This element matches a single
65      * whitespace character.
66      */

67     public static final CharacterSetElement WHITESPACE =
68         new CharacterSetElement(false);
69
70     /**
71      * The non-whitespace character set. This element matches a single
72      * non-whitespace character.
73      */

74     public static final CharacterSetElement NON_WHITESPACE =
75         new CharacterSetElement(true);
76
77     /**
78      * The word character set. This element matches a single word
79      * character.
80      */

81     public static final CharacterSetElement WORD =
82         new CharacterSetElement(false);
83
84     /**
85      * The non-word character set. This element matches a single
86      * non-word character.
87      */

88     public static final CharacterSetElement NON_WORD =
89         new CharacterSetElement(true);
90
91     /**
92      * The inverted character set flag.
93      */

94     private boolean inverted;
95
96     /**
97      * The character set content. This array may contain either
98      * range objects or Character objects.
99      */

100     private ArrayList JavaDoc contents = new ArrayList JavaDoc();
101
102     /**
103      * Creates a new character set element. If the inverted character
104      * set flag is set, only characters NOT in the set will match.
105      *
106      * @param inverted the inverted character set flag
107      */

108     public CharacterSetElement(boolean inverted) {
109         this.inverted = inverted;
110     }
111
112     /**
113      * Adds a single character to this character set.
114      *
115      * @param c the character to add
116      */

117     public void addCharacter(char c) {
118         contents.add(new Character JavaDoc(c));
119     }
120
121     /**
122      * Adds multiple characters to this character set.
123      *
124      * @param str the string with characters to add
125      */

126     public void addCharacters(String JavaDoc str) {
127         for (int i = 0; i < str.length(); i++) {
128             addCharacter(str.charAt(i));
129         }
130     }
131
132     /**
133      * Adds multiple characters to this character set.
134      *
135      * @param elem the string element with characters to add
136      */

137     public void addCharacters(StringElement elem) {
138         addCharacters(elem.getString());
139     }
140
141     /**
142      * Adds a character range to this character set.
143      *
144      * @param min the minimum character value
145      * @param max the maximum character value
146      */

147     public void addRange(char min, char max) {
148         contents.add(new Range(min, max));
149     }
150
151     /**
152      * Adds a character subset to this character set.
153      *
154      * @param elem the character set to add
155      */

156     public void addCharacterSet(CharacterSetElement elem) {
157         contents.add(elem);
158     }
159
160     /**
161      * Returns this element as the character set shouldn't be modified
162      * after creation. This partially breaks the contract of clone(),
163      * but as new characters are not added to the character set after
164      * creation, this will work correctly.
165      *
166      * @return this character set element
167      */

168     public Object JavaDoc clone() {
169         return this;
170     }
171
172     /**
173      * Returns the length of a matching string starting at the
174      * specified position. The number of matches to skip can also be
175      * specified, but numbers higher than zero (0) cause a failed
176      * match for any element that doesn't attempt to combine other
177      * elements.
178      *
179      * @param m the matcher being used
180      * @param input the input character stream to match
181      * @param start the starting position
182      * @param skip the number of matches to skip
183      *
184      * @return the length of the longest matching string, or
185      * -1 if no match was found
186      *
187      * @throws IOException if an I/O error occurred
188      */

189     public int match(Matcher m, LookAheadReader input, int start, int skip)
190         throws IOException JavaDoc {
191
192         int c;
193
194         if (skip != 0) {
195             return -1;
196         }
197         c = input.peek(start);
198         if (c < 0) {
199             m.setReadEndOfString();
200             return -1;
201         }
202         if (m.isCaseInsensitive()) {
203             c = Character.toLowerCase((char) c);
204         }
205         return inSet((char) c) ? 1 : -1;
206     }
207
208     /**
209      * Checks if the specified character matches this character set.
210      * This method takes the inverted flag into account.
211      *
212      * @param value the character to check
213      *
214      * @return true if the character matches, or
215      * false otherwise
216      */

217     private boolean inSet(char value) {
218         if (this == DOT) {
219             return inDotSet(value);
220         } else if (this == DIGIT || this == NON_DIGIT) {
221             return inDigitSet(value) != inverted;
222         } else if (this == WHITESPACE || this == NON_WHITESPACE) {
223             return inWhitespaceSet(value) != inverted;
224         } else if (this == WORD || this == NON_WORD) {
225             return inWordSet(value) != inverted;
226         } else {
227             return inUserSet(value) != inverted;
228         }
229     }
230
231     /**
232      * Checks if the specified character is present in the 'dot' set.
233      * This method does not consider the inverted flag.
234      *
235      * @param value the character to check
236      *
237      * @return true if the character is present, or
238      * false otherwise
239      */

240     private boolean inDotSet(char value) {
241         switch (value) {
242         case '\n':
243         case '\r':
244         case '\u0085':
245         case '\u2028':
246         case '\u2029':
247             return false;
248         default:
249             return true;
250         }
251     }
252
253     /**
254      * Checks if the specified character is a digit. This method does
255      * not consider the inverted flag.
256      *
257      * @param value the character to check
258      *
259      * @return true if the character is a digit, or
260      * false otherwise
261      */

262     private boolean inDigitSet(char value) {
263         return '0' <= value && value <= '9';
264     }
265
266     /**
267      * Checks if the specified character is a whitespace character.
268      * This method does not consider the inverted flag.
269      *
270      * @param value the character to check
271      *
272      * @return true if the character is a whitespace character, or
273      * false otherwise
274      */

275     private boolean inWhitespaceSet(char value) {
276         switch (value) {
277         case ' ':
278         case '\t':
279         case '\n':
280         case '\f':
281         case '\r':
282         case 11:
283             return true;
284         default:
285             return false;
286         }
287     }
288
289     /**
290      * Checks if the specified character is a word character. This
291      * method does not consider the inverted flag.
292      *
293      * @param value the character to check
294      *
295      * @return true if the character is a word character, or
296      * false otherwise
297      */

298     private boolean inWordSet(char value) {
299         return ('a' <= value && value <= 'z')
300             || ('A' <= value && value <= 'Z')
301             || ('0' <= value && value <= '9')
302             || value == '_';
303     }
304
305     /**
306      * Checks if the specified character is present in the user-
307      * defined set. This method does not consider the inverted flag.
308      *
309      * @param value the character to check
310      *
311      * @return true if the character is present, or
312      * false otherwise
313      */

314     private boolean inUserSet(char value) {
315         Object JavaDoc obj;
316         Character JavaDoc c;
317         Range r;
318         CharacterSetElement e;
319
320         for (int i = 0; i < contents.size(); i++) {
321             obj = contents.get(i);
322             if (obj instanceof Character JavaDoc) {
323                 c = (Character JavaDoc) obj;
324                 if (c.charValue() == value) {
325                     return true;
326                 }
327             } else if (obj instanceof Range) {
328                 r = (Range) obj;
329                 if (r.inside(value)) {
330                     return true;
331                 }
332             } else if (obj instanceof CharacterSetElement) {
333                 e = (CharacterSetElement) obj;
334                 if (e.inSet(value)) {
335                     return true;
336                 }
337             }
338         }
339         return false;
340     }
341
342     /**
343      * Prints this element to the specified output stream.
344      *
345      * @param output the output stream to use
346      * @param indent the current indentation
347      */

348     public void printTo(PrintWriter JavaDoc output, String JavaDoc indent) {
349         output.println(indent + toString());
350     }
351
352     /**
353      * Returns a string description of this character set.
354      *
355      * @return a string description of this character set
356      */

357     public String JavaDoc toString() {
358         StringBuffer JavaDoc buffer;
359
360         // Handle predefined character sets
361
if (this == DOT) {
362             return ".";
363         } else if (this == DIGIT) {
364             return "\\d";
365         } else if (this == NON_DIGIT) {
366             return "\\D";
367         } else if (this == WHITESPACE) {
368             return "\\s";
369         } else if (this == NON_WHITESPACE) {
370             return "\\S";
371         } else if (this == WORD) {
372             return "\\w";
373         } else if (this == NON_WORD) {
374             return "\\W";
375         }
376
377         // Handle user-defined character sets
378
buffer = new StringBuffer JavaDoc();
379         if (inverted) {
380             buffer.append("^[");
381         } else {
382             buffer.append("[");
383         }
384         for (int i = 0; i < contents.size(); i++) {
385             buffer.append(contents.get(i));
386         }
387         buffer.append("]");
388
389         return buffer.toString();
390     }
391
392
393     /**
394      * A character range class.
395      */

396     private class Range {
397
398         /**
399          * The minimum character value.
400          */

401         private char min;
402
403         /**
404          * The maximum character value.
405          */

406         private char max;
407
408         /**
409          * Creates a new character range.
410          *
411          * @param min the minimum character value
412          * @param max the maximum character value
413          */

414         public Range(char min, char max) {
415             this.min = min;
416             this.max = max;
417         }
418
419         /**
420          * Checks if the specified character is inside the range.
421          *
422          * @param c the character to check
423          *
424          * @return true if the character is in the range, or
425          * false otherwise
426          */

427         public boolean inside(char c) {
428             return c >= min && c <= max;
429         }
430
431         /**
432          * Returns a string representation of this object.
433          *
434          * @return a string representation of this object
435          */

436         public String JavaDoc toString() {
437             return min + "-" + max;
438         }
439     }
440 }
441
Popular Tags