KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > percederberg > grammatica > parser > Tokenizer


1 /*
2  * Tokenizer.java
3  *
4  * This library is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU Lesser General Public License
6  * as published by the Free Software Foundation; either version 2.1
7  * of the License, or (at your option) any later version.
8  *
9  * This library is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12  * Lesser General Public License for more details.
13  *
14  * You should have received a copy of the GNU Lesser General Public
15  * License along with this library; if not, write to the Free
16  * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
17  * MA 02111-1307, USA.
18  *
19  * Copyright (c) 2003-2005 Per Cederberg. All rights reserved.
20  */

21
22 package net.percederberg.grammatica.parser;
23
24 import java.io.IOException JavaDoc;
25 import java.io.Reader JavaDoc;
26 import java.util.ArrayList JavaDoc;
27
28 import net.percederberg.grammatica.parser.re.RegExp;
29 import net.percederberg.grammatica.parser.re.Matcher;
30 import net.percederberg.grammatica.parser.re.RegExpException;
31
32 /**
33  * A character stream tokenizer. This class groups the characters read
34  * from the stream together into tokens ("words"). The grouping is
35  * controlled by token patterns that contain either a fixed string to
36  * search for, or a regular expression. If the stream of characters
37  * don't match any of the token patterns, a parse exception is thrown.
38  *
39  * @author Per Cederberg, <per at percederberg dot net>
40  * @version 1.5
41  */

42 public class Tokenizer {
43
44     /**
45      * The ignore character case flag.
46      */

47     protected boolean ignoreCase = false;
48
49     /**
50      * The token list feature flag.
51      */

52     private boolean useTokenList = false;
53
54     /**
55      * The string token matcher. This token matcher is used for all
56      * string token patterns. This matcher implements a DFA to
57      * provide maximum performance.
58      */

59     private StringTokenMatcher stringMatcher = new StringTokenMatcher();
60
61     /**
62      * The list of all regular expression token matchers. These
63      * matchers each test matches for a single regular expression.
64      */

65     private ArrayList JavaDoc regexpMatchers = new ArrayList JavaDoc();
66
67     /**
68      * The look-ahead character stream reader.
69      */

70     private LookAheadReader input = null;
71
72     /**
73      * The previous token in the token list.
74      */

75     private Token previousToken = null;
76
77     /**
78      * Creates a new case-sensitive tokenizer for the specified input
79      * stream.
80      *
81      * @param input the input stream to read
82      */

83     public Tokenizer(Reader JavaDoc input) {
84         this(input, false);
85     }
86
87     /**
88      * Creates a new tokenizer for the specified input stream. The
89      * tokenizer can be set to process tokens either in case-sensitive
90      * or case-insensitive mode.
91      *
92      * @param input the input stream to read
93      * @param ignoreCase the character case ignore flag
94      *
95      * @since 1.5
96      */

97     public Tokenizer(Reader JavaDoc input, boolean ignoreCase) {
98         this.input = new LookAheadReader(input);
99         this.ignoreCase = ignoreCase;
100     }
101
102     /**
103      * Checks if the token list feature is used. The token list
104      * feature makes all tokens (including ignored tokens) link to
105      * each other in a linked list. By default the token list feature
106      * is not used.
107      *
108      * @return true if the token list feature is used, or
109      * false otherwise
110      *
111      * @see #setUseTokenList
112      * @see Token#getPreviousToken
113      * @see Token#getNextToken
114      *
115      * @since 1.4
116      */

117     public boolean getUseTokenList() {
118         return useTokenList;
119     }
120
121     /**
122      * Sets the token list feature flag. The token list feature makes
123      * all tokens (including ignored tokens) link to each other in a
124      * linked list when active. By default the token list feature is
125      * not used.
126      *
127      * @param useTokenList the token list feature flag
128      *
129      * @see #getUseTokenList
130      * @see Token#getPreviousToken
131      * @see Token#getNextToken
132      *
133      * @since 1.4
134      */

135     public void setUseTokenList(boolean useTokenList) {
136         this.useTokenList = useTokenList;
137     }
138
139     /**
140      * Returns a description of the token pattern with the specified
141      * id.
142      *
143      * @param id the token pattern id
144      *
145      * @return the token pattern description, or
146      * null if not present
147      */

148     public String JavaDoc getPatternDescription(int id) {
149         TokenPattern pattern;
150         RegExpTokenMatcher re;
151
152         pattern = stringMatcher.getPattern(id);
153         if (pattern != null) {
154             return pattern.toShortString();
155         }
156         for (int i = 0; i < regexpMatchers.size(); i++) {
157             re = (RegExpTokenMatcher) regexpMatchers.get(i);
158             if (re.getPattern().getId() == id) {
159                 return re.getPattern().toShortString();
160             }
161         }
162         return null;
163     }
164
165     /**
166      * Returns the current line number. This number will be the line
167      * number of the next token returned.
168      *
169      * @return the current line number
170      */

171     public int getCurrentLine() {
172         return input.getLineNumber();
173     }
174
175     /**
176      * Returns the current column number. This number will be the
177      * column number of the next token returned.
178      *
179      * @return the current column number
180      */

181     public int getCurrentColumn() {
182         return input.getColumnNumber();
183     }
184
185     /**
186      * Adds a new token pattern to the tokenizer. The pattern will be
187      * added last in the list, choosing a previous token pattern in
188      * case two matches the same string.
189      *
190      * @param pattern the pattern to add
191      *
192      * @throws ParserCreationException if the pattern couldn't be
193      * added to the tokenizer
194      */

195     public void addPattern(TokenPattern pattern)
196         throws ParserCreationException {
197
198         switch (pattern.getType()) {
199         case TokenPattern.STRING_TYPE:
200             stringMatcher.addPattern(pattern);
201             break;
202         case TokenPattern.REGEXP_TYPE:
203             try {
204                 regexpMatchers.add(new RegExpTokenMatcher(pattern, input));
205             } catch (RegExpException e) {
206                 throw new ParserCreationException(
207                     ParserCreationException.INVALID_TOKEN_ERROR,
208                     pattern.getName(),
209                     "regular expression contains error(s): " +
210                     e.getMessage());
211             }
212             break;
213         default:
214             throw new ParserCreationException(
215                 ParserCreationException.INVALID_TOKEN_ERROR,
216                 pattern.getName(),
217                 "pattern type " + pattern.getType() + " is undefined");
218         }
219     }
220
221     /**
222      * Resets this tokenizer for usage with another input stream. This
223      * method will clear all the internal state in the tokenizer as
224      * well as close the previous input stream. It is normally called
225      * in order to reuse a parser and tokenizer pair for parsing
226      * another input stream.
227      *
228      * @param input the new input stream to read
229      *
230      * @since 1.5
231      */

232     public void reset(Reader JavaDoc input) {
233         try {
234             this.input.close();
235         } catch (IOException JavaDoc ignore) {
236             // Do nothing
237
}
238         this.input = new LookAheadReader(input);
239         this.previousToken = null;
240         stringMatcher.reset();
241         for (int i = 0; i < regexpMatchers.size(); i++) {
242             ((RegExpTokenMatcher) regexpMatchers.get(i)).reset(this.input);
243         }
244     }
245
246     /**
247      * Finds the next token on the stream. This method will return
248      * null when end of file has been reached. It will return a parse
249      * exception if no token matched the input stream, or if a token
250      * pattern with the error flag set matched. Any tokens matching a
251      * token pattern with the ignore flag set will be silently ignored
252      * and the next token will be returned.
253      *
254      * @return the next token found, or
255      * null if end of file was encountered
256      *
257      * @throws ParseException if the input stream couldn't be read or
258      * parsed correctly
259      */

260     public Token next() throws ParseException {
261         Token token = null;
262
263         do {
264             token = nextToken();
265             if (useTokenList && token != null) {
266                 token.setPreviousToken(previousToken);
267                 previousToken = token;
268             }
269             if (token == null) {
270                 return null;
271             } else if (token.getPattern().isError()) {
272                 throw new ParseException(
273                     ParseException.INVALID_TOKEN_ERROR,
274                     token.getPattern().getErrorMessage(),
275                     token.getStartLine(),
276                     token.getStartColumn());
277             } else if (token.getPattern().isIgnore()) {
278                 token = null;
279             }
280         } while (token == null);
281
282         return token;
283     }
284
285     /**
286      * Finds the next token on the stream. This method will return
287      * null when end of file has been reached. It will return a parse
288      * exception if no token matched the input stream.
289      *
290      * @return the next token found, or
291      * null if end of file was encountered
292      *
293      * @throws ParseException if the input stream couldn't be read or
294      * parsed correctly
295      */

296     private Token nextToken() throws ParseException {
297         TokenMatcher m;
298         String JavaDoc str;
299         int line;
300         int column;
301
302         try {
303             m = findMatch();
304             if (m != null) {
305                 line = input.getLineNumber();
306                 column = input.getColumnNumber();
307                 str = input.readString(m.getMatchedLength());
308                 return new Token(m.getMatchedPattern(), str, line, column);
309             } else if (input.peek(0) < 0) {
310                 return null;
311             } else {
312                 line = input.getLineNumber();
313                 column = input.getColumnNumber();
314                 throw new ParseException(ParseException.UNEXPECTED_CHAR_ERROR,
315                                          input.readString(1),
316                                          line,
317                                          column);
318             }
319         } catch (IOException JavaDoc e) {
320             throw new ParseException(ParseException.IO_ERROR,
321                                      e.getMessage(),
322                                      -1,
323                                      -1);
324         }
325
326     }
327
328     /**
329      * Finds the longest token match from the current buffer position.
330      * This method will return the token matcher for the best match,
331      * or null if no match was found. As a side effect, this method
332      * will also set the end of buffer flag.
333      *
334      * @return the token mathcher with the longest match, or
335      * null if no match was found
336      *
337      * @throws IOException if an I/O error occurred
338      */

339     private TokenMatcher findMatch() throws IOException JavaDoc {
340         TokenMatcher bestMatch = null;
341         int bestLength = 0;
342         RegExpTokenMatcher re;
343
344         // TODO: The regexp token patterns might not always be defined
345
// last. We should check the ordering of the patterns in
346
// case a string match had the same length as a regexp
347
// match.
348

349         // Check string matches
350
if (stringMatcher.match(input)) {
351             bestMatch = stringMatcher;
352             bestLength = bestMatch.getMatchedLength();
353         }
354
355         // Check regular expression matches
356
for (int i = 0; i < regexpMatchers.size(); i++) {
357             re = (RegExpTokenMatcher) regexpMatchers.get(i);
358             if (re.match() && re.getMatchedLength() > bestLength) {
359                 bestMatch = re;
360                 bestLength = re.getMatchedLength();
361             }
362         }
363         return bestMatch;
364     }
365
366     /**
367      * Returns a string representation of this object. The returned
368      * string will contain the details of all the token patterns
369      * contained in this tokenizer.
370      *
371      * @return a detailed string representation
372      */

373     public String JavaDoc toString() {
374         StringBuffer JavaDoc buffer = new StringBuffer JavaDoc();
375
376         buffer.append(stringMatcher);
377         for (int i = 0; i < regexpMatchers.size(); i++) {
378             buffer.append(regexpMatchers.get(i));
379         }
380         return buffer.toString();
381     }
382
383
384     /**
385      * A token pattern matcher. This class is the base class for the
386      * two types of token matchers that exist. The token matcher
387      * checks for matches with the tokenizer buffer, and maintains the
388      * state of the last match.
389      */

390     private abstract class TokenMatcher {
391
392         /**
393          * Returns the latest matched token pattern.
394          *
395          * @return the latest matched token pattern, or
396          * null if no match found
397          */

398         public abstract TokenPattern getMatchedPattern();
399
400         /**
401          * Returns the length of the latest match.
402          *
403          * @return the length of the latest match, or
404          * zero (0) if no match found
405          */

406         public abstract int getMatchedLength();
407     }
408
409
410     /**
411      * A regular expression token pattern matcher. This class is used
412      * to match a single regular expression with an input stream. This
413      * class also maintains the state of the last match.
414      */

415     private class RegExpTokenMatcher extends TokenMatcher {
416
417         /**
418          * The token pattern to match with.
419          */

420         private TokenPattern pattern;
421
422         /**
423          * The regular expression to use.
424          */

425         private RegExp regExp;
426
427         /**
428          * The regular expression matcher to use.
429          */

430         private Matcher matcher;
431
432         /**
433          * Creates a new regular expression token matcher.
434          *
435          * @param pattern the pattern to match
436          * @param input the input stream to check
437          *
438          * @throws RegExpException if the regular expression couldn't
439          * be created properly
440          */

441         public RegExpTokenMatcher(TokenPattern pattern, LookAheadReader input)
442             throws RegExpException {
443
444             this.pattern = pattern;
445             this.regExp = new RegExp(pattern.getPattern(), ignoreCase);
446             this.matcher = regExp.matcher(input);
447         }
448
449         /**
450          * Resets the matcher for another character input stream. This
451          * will clear the results of the last match.
452          *
453          * @param input the new input stream to check
454          */

455         public void reset(LookAheadReader input) {
456             matcher.reset(input);
457         }
458
459         /**
460          * Returns the token pattern.
461          *
462          * @return the token pattern
463          */

464         public TokenPattern getPattern() {
465             return pattern;
466         }
467
468         /**
469          * Returns the latest matched token pattern.
470          *
471          * @return the latest matched token pattern, or
472          * null if no match found
473          */

474         public TokenPattern getMatchedPattern() {
475             if (matcher.length() <= 0) {
476                 return null;
477             } else {
478                 return pattern;
479             }
480         }
481
482         /**
483          * Returns the length of the latest match.
484          *
485          * @return the length of the latest match, or
486          * zero (0) if no match found
487          */

488         public int getMatchedLength() {
489             return matcher.length();
490         }
491
492         /**
493          * Checks if the token pattern matches the input stream. This
494          * method will also reset all flags in this matcher.
495          *
496          * @return true if a match was found, or
497          * false otherwise
498          *
499          * @throws IOException if an I/O error occurred
500          */

501         public boolean match() throws IOException JavaDoc {
502             return matcher.matchFromBeginning();
503         }
504
505         /**
506          * Returns a string representation of this token matcher.
507          *
508          * @return a detailed string representation of this matcher
509          */

510         public String JavaDoc toString() {
511             return pattern.toString() + "\n" +
512                    regExp.toString() + "\n";
513         }
514     }
515
516
517     /**
518      * A string token pattern matcher. This class is used to match a
519      * set of strings with an input stream. This class internally uses
520      * a DFA for maximum performance. It also maintains the state of
521      * the last match.
522      */

523     private class StringTokenMatcher extends TokenMatcher {
524
525         /**
526          * The list of string token patterns.
527          */

528         private ArrayList JavaDoc patterns = new ArrayList JavaDoc();
529
530         /**
531          * The finite automaton to use for matching.
532          */

533         private Automaton start = new Automaton();
534
535         /**
536          * The last token pattern match found.
537          */

538         private TokenPattern match = null;
539
540         /**
541          * Creates a new string token matcher.
542          */

543         public StringTokenMatcher() {
544         }
545
546         /**
547          * Resets the matcher state. This will clear the results of
548          * the last match.
549          */

550         public void reset() {
551             match = null;
552         }
553
554         /**
555          * Returns the latest matched token pattern.
556          *
557          * @return the latest matched token pattern, or
558          * null if no match found
559          */

560         public TokenPattern getMatchedPattern() {
561             return match;
562         }
563
564         /**
565          * Returns the length of the latest match.
566          *
567          * @return the length of the latest match, or
568          * zero (0) if no match found
569          */

570         public int getMatchedLength() {
571             if (match == null) {
572                 return 0;
573             } else {
574                 return match.getPattern().length();
575             }
576         }
577
578         /**
579          * Returns the token pattern with the specified id. Only
580          * token patterns handled by this matcher can be returned.
581          *
582          * @param id the token pattern id
583          *
584          * @return the token pattern found, or
585          * null if not found
586          */

587         public TokenPattern getPattern(int id) {
588             TokenPattern pattern;
589
590             for (int i = 0; i < patterns.size(); i++) {
591                 pattern = (TokenPattern) patterns.get(i);
592                 if (pattern.getId() == id) {
593                     return pattern;
594                 }
595             }
596             return null;
597         }
598
599         /**
600          * Adds a string token pattern to this matcher.
601          *
602          * @param pattern the pattern to add
603          */

604         public void addPattern(TokenPattern pattern) {
605             patterns.add(pattern);
606             start.addMatch(pattern.getPattern(), ignoreCase, pattern);
607         }
608
609         /**
610          * Checks if the token pattern matches the input stream. This
611          * method will also reset all flags in this matcher.
612          *
613          * @param input the input stream to check
614          *
615          * @return true if a match was found, or
616          * false otherwise
617          *
618          * @throws IOException if an I/O error occurred
619          */

620         public boolean match(LookAheadReader input) throws IOException JavaDoc {
621             reset();
622             match = (TokenPattern) start.matchFrom(input, 0, ignoreCase);
623             return match != null;
624         }
625
626         /**
627          * Returns a string representation of this matcher. This will
628          * contain all the token patterns.
629          *
630          * @return a detailed string representation of this matcher
631          */

632         public String JavaDoc toString() {
633             StringBuffer JavaDoc buffer = new StringBuffer JavaDoc();
634
635             for (int i = 0; i < patterns.size(); i++) {
636                 buffer.append(patterns.get(i));
637                 buffer.append("\n\n");
638             }
639             return buffer.toString();
640         }
641     }
642 }
643
Popular Tags