Tokenizer


1   /*
2    * Tokenizer.java
3    *
4    * This library is free software; you can redistribute it and/or
5    * modify it under the terms of the GNU Lesser General Public License
6    * as published by the Free Software Foundation; either version 2.1
7    * of the License, or (at your option) any later version.
8    *
9    * This library is distributed in the hope that it will be useful,
10   * but WITHOUT ANY WARRANTY; without even the implied warranty of
11   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12   * Lesser General Public License for more details.
13   *
14   * You should have received a copy of the GNU Lesser General Public
15   * License along with this library; if not, write to the Free
16   * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
17   * MA 02111-1307, USA.
18   *
19   * Copyright (c) 2003-2005 Per Cederberg. All rights reserved.
20   */
21  
22  package net.percederberg.grammatica.parser;
23  
24  import java.io.IOException  ;
25  import java.io.Reader  ;
26  import java.util.ArrayList  ;
27  
28  import net.percederberg.grammatica.parser.re.RegExp;
29  import net.percederberg.grammatica.parser.re.Matcher;
30  import net.percederberg.grammatica.parser.re.RegExpException;
31  
32  /**
33   * A character stream tokenizer. This class groups the characters read
34   * from the stream together into tokens ("words"). The grouping is
35   * controlled by token patterns that contain either a fixed string to
36   * search for, or a regular expression. If the stream of characters
37   * don't match any of the token patterns, a parse exception is thrown.
38   *
39   * @author   Per Cederberg, <per at percederberg dot net>
40   * @version  1.5
41   */
42  public class Tokenizer {
43  
44      /**
45       * The ignore character case flag.
46       */
47      protected boolean ignoreCase = false;
48  
49      /**
50       * The token list feature flag.
51       */
52      private boolean useTokenList = false;
53  
54      /**
55       * The string token matcher. This token matcher is used for all
56       * string token patterns. This matcher implements a DFA to
57       * provide maximum performance.
58       */
59      private StringTokenMatcher stringMatcher = new StringTokenMatcher();
60  
61      /**
62       * The list of all regular expression token matchers. These
63       * matchers each test matches for a single regular expression.
64       */
65      private ArrayList   regexpMatchers = new ArrayList  ();
66  
67      /**
68       * The look-ahead character stream reader.
69       */
70      private LookAheadReader input = null;
71  
72      /**
73       * The previous token in the token list.
74       */
75      private Token previousToken = null;
76  
77      /**
78       * Creates a new case-sensitive tokenizer for the specified input
79       * stream.
80       *
81       * @param input          the input stream to read
82       */
83      public Tokenizer(Reader   input) {
84          this(input, false);
85      }
86  
87      /**
88       * Creates a new tokenizer for the specified input stream. The
89       * tokenizer can be set to process tokens either in case-sensitive
90       * or case-insensitive mode.
91       *
92       * @param input          the input stream to read
93       * @param ignoreCase     the character case ignore flag
94       *
95       * @since 1.5
96       */
97      public Tokenizer(Reader   input, boolean ignoreCase) {
98          this.input = new LookAheadReader(input);
99          this.ignoreCase = ignoreCase;
100     }
101 
102     /**
103      * Checks if the token list feature is used. The token list
104      * feature makes all tokens (including ignored tokens) link to
105      * each other in a linked list. By default the token list feature
106      * is not used.
107      *
108      * @return true if the token list feature is used, or
109      *         false otherwise
110      *
111      * @see #setUseTokenList
112      * @see Token#getPreviousToken
113      * @see Token#getNextToken
114      *
115      * @since 1.4
116      */
117     public boolean getUseTokenList() {
118         return useTokenList;
119     }
120 
121     /**
122      * Sets the token list feature flag. The token list feature makes
123      * all tokens (including ignored tokens) link to each other in a
124      * linked list when active. By default the token list feature is
125      * not used.
126      *
127      * @param useTokenList   the token list feature flag
128      *
129      * @see #getUseTokenList
130      * @see Token#getPreviousToken
131      * @see Token#getNextToken
132      *
133      * @since 1.4
134      */
135     public void setUseTokenList(boolean useTokenList) {
136         this.useTokenList = useTokenList;
137     }
138 
139     /**
140      * Returns a description of the token pattern with the specified
141      * id.
142      *
143      * @param id             the token pattern id
144      *
145      * @return the token pattern description, or
146      *         null if not present
147      */
148     public String   getPatternDescription(int id) {
149         TokenPattern        pattern;
150         RegExpTokenMatcher  re;
151 
152         pattern = stringMatcher.getPattern(id);
153         if (pattern != null) {
154             return pattern.toShortString();
155         }
156         for (int i = 0; i < regexpMatchers.size(); i++) {
157             re = (RegExpTokenMatcher) regexpMatchers.get(i);
158             if (re.getPattern().getId() == id) {
159                 return re.getPattern().toShortString();
160             }
161         }
162         return null;
163     }
164 
165     /**
166      * Returns the current line number. This number will be the line
167      * number of the next token returned.
168      *
169      * @return the current line number
170      */
171     public int getCurrentLine() {
172         return input.getLineNumber();
173     }
174 
175     /**
176      * Returns the current column number. This number will be the
177      * column number of the next token returned.
178      *
179      * @return the current column number
180      */
181     public int getCurrentColumn() {
182         return input.getColumnNumber();
183     }
184 
185     /**
186      * Adds a new token pattern to the tokenizer. The pattern will be
187      * added last in the list, choosing a previous token pattern in
188      * case two matches the same string.
189      *
190      * @param pattern        the pattern to add
191      *
192      * @throws ParserCreationException if the pattern couldn't be
193      *             added to the tokenizer
194      */
195     public void addPattern(TokenPattern pattern)
196         throws ParserCreationException {
197 
198         switch (pattern.getType()) {
199         case TokenPattern.STRING_TYPE:
200             stringMatcher.addPattern(pattern);
201             break;
202         case TokenPattern.REGEXP_TYPE:
203             try {
204                 regexpMatchers.add(new RegExpTokenMatcher(pattern, input));
205             } catch (RegExpException e) {
206                 throw new ParserCreationException(
207                     ParserCreationException.INVALID_TOKEN_ERROR,
208                     pattern.getName(),
209                     "regular expression contains error(s): " +
210                     e.getMessage());
211             }
212             break;
213         default:
214             throw new ParserCreationException(
215                 ParserCreationException.INVALID_TOKEN_ERROR,
216                 pattern.getName(),
217                 "pattern type " + pattern.getType() + " is undefined");
218         }
219     }
220 
221     /**
222      * Resets this tokenizer for usage with another input stream. This
223      * method will clear all the internal state in the tokenizer as
224      * well as close the previous input stream. It is normally called
225      * in order to reuse a parser and tokenizer pair for parsing
226      * another input stream.
227      *
228      * @param input          the new input stream to read
229      *
230      * @since 1.5
231      */
232     public void reset(Reader   input) {
233         try {
234             this.input.close();
235         } catch (IOException   ignore) {
236             // Do nothing
237         }
238         this.input = new LookAheadReader(input);
239         this.previousToken = null;
240         stringMatcher.reset();
241         for (int i = 0; i < regexpMatchers.size(); i++) {
242             ((RegExpTokenMatcher) regexpMatchers.get(i)).reset(this.input);
243         }
244     }
245 
246     /**
247      * Finds the next token on the stream. This method will return
248      * null when end of file has been reached. It will return a parse
249      * exception if no token matched the input stream, or if a token
250      * pattern with the error flag set matched. Any tokens matching a
251      * token pattern with the ignore flag set will be silently ignored
252      * and the next token will be returned.
253      *
254      * @return the next token found, or
255      *         null if end of file was encountered
256      *
257      * @throws ParseException if the input stream couldn't be read or
258      *             parsed correctly
259      */
260     public Token next() throws ParseException {
261         Token  token = null;
262 
263         do {
264             token = nextToken();
265             if (useTokenList && token != null) {
266                 token.setPreviousToken(previousToken);
267                 previousToken = token;
268             }
269             if (token == null) {
270                 return null;
271             } else if (token.getPattern().isError()) {
272                 throw new ParseException(
273                     ParseException.INVALID_TOKEN_ERROR,
274                     token.getPattern().getErrorMessage(),
275                     token.getStartLine(),
276                     token.getStartColumn());
277             } else if (token.getPattern().isIgnore()) {
278                 token = null;
279             }
280         } while (token == null);
281 
282         return token;
283     }
284 
285     /**
286      * Finds the next token on the stream. This method will return
287      * null when end of file has been reached. It will return a parse
288      * exception if no token matched the input stream.
289      *
290      * @return the next token found, or
291      *         null if end of file was encountered
292      *
293      * @throws ParseException if the input stream couldn't be read or
294      *             parsed correctly
295      */
296     private Token nextToken() throws ParseException {
297         TokenMatcher  m;
298         String          str;
299         int           line;
300         int           column;
301 
302         try {
303             m = findMatch();
304             if (m != null) {
305                 line = input.getLineNumber();
306                 column = input.getColumnNumber();
307                 str = input.readString(m.getMatchedLength());
308                 return new Token(m.getMatchedPattern(), str, line, column);
309             } else if (input.peek(0) < 0) {
310                 return null;
311             } else {
312                 line = input.getLineNumber();
313                 column = input.getColumnNumber();
314                 throw new ParseException(ParseException.UNEXPECTED_CHAR_ERROR,
315                                          input.readString(1),
316                                          line,
317                                          column);
318             }
319         } catch (IOException   e) {
320             throw new ParseException(ParseException.IO_ERROR,
321                                      e.getMessage(),
322                                      -1,
323                                      -1);
324         }
325 
326     }
327 
328     /**
329      * Finds the longest token match from the current buffer position.
330      * This method will return the token matcher for the best match,
331      * or null if no match was found. As a side effect, this method
332      * will also set the end of buffer flag.
333      *
334      * @return the token mathcher with the longest match, or
335      *         null if no match was found
336      *
337      * @throws IOException if an I/O error occurred
338      */
339     private TokenMatcher findMatch() throws IOException   {
340         TokenMatcher        bestMatch = null;
341         int                 bestLength = 0;
342         RegExpTokenMatcher  re;
343 
344         // TODO: The regexp token patterns might not always be defined
345         //       last. We should check the ordering of the patterns in
346         //       case a string match had the same length as a regexp
347         //       match.
348 
349         // Check string matches
350         if (stringMatcher.match(input)) {
351             bestMatch = stringMatcher;
352             bestLength = bestMatch.getMatchedLength();
353         }
354 
355         // Check regular expression matches
356         for (int i = 0; i < regexpMatchers.size(); i++) {
357             re = (RegExpTokenMatcher) regexpMatchers.get(i);
358             if (re.match() && re.getMatchedLength() > bestLength) {
359                 bestMatch = re;
360                 bestLength = re.getMatchedLength();
361             }
362         }
363         return bestMatch;
364     }
365 
366     /**
367      * Returns a string representation of this object. The returned
368      * string will contain the details of all the token patterns
369      * contained in this tokenizer.
370      *
371      * @return a detailed string representation
372      */
373     public String   toString() {
374         StringBuffer    buffer = new StringBuffer  ();
375 
376         buffer.append(stringMatcher);
377         for (int i = 0; i < regexpMatchers.size(); i++) {
378             buffer.append(regexpMatchers.get(i));
379         }
380         return buffer.toString();
381     }
382 
383 
384     /**
385      * A token pattern matcher. This class is the base class for the
386      * two types of token matchers that exist. The token matcher
387      * checks for matches with the tokenizer buffer, and maintains the
388      * state of the last match.
389      */
390     private abstract class TokenMatcher {
391 
392         /**
393          * Returns the latest matched token pattern.
394          *
395          * @return the latest matched token pattern, or
396          *         null if no match found
397          */
398         public abstract TokenPattern getMatchedPattern();
399 
400         /**
401          * Returns the length of the latest match.
402          *
403          * @return the length of the latest match, or
404          *         zero (0) if no match found
405          */
406         public abstract int getMatchedLength();
407     }
408 
409 
410     /**
411      * A regular expression token pattern matcher. This class is used
412      * to match a single regular expression with an input stream. This
413      * class also maintains the state of the last match.
414      */
415     private class RegExpTokenMatcher extends TokenMatcher {
416 
417         /**
418          * The token pattern to match with.
419          */
420         private TokenPattern pattern;
421 
422         /**
423          * The regular expression to use.
424          */
425         private RegExp regExp;
426 
427         /**
428          * The regular expression matcher to use.
429          */
430         private Matcher matcher;
431 
432         /**
433          * Creates a new regular expression token matcher.
434          *
435          * @param pattern        the pattern to match
436          * @param input          the input stream to check
437          *
438          * @throws RegExpException if the regular expression couldn't
439          *             be created properly
440          */
441         public RegExpTokenMatcher(TokenPattern pattern, LookAheadReader input)
442             throws RegExpException {
443 
444             this.pattern = pattern;
445             this.regExp = new RegExp(pattern.getPattern(), ignoreCase);
446             this.matcher = regExp.matcher(input);
447         }
448 
449         /**
450          * Resets the matcher for another character input stream. This
451          * will clear the results of the last match.
452          *
453          * @param input          the new input stream to check
454          */
455         public void reset(LookAheadReader input) {
456             matcher.reset(input);
457         }
458 
459         /**
460          * Returns the token pattern.
461          *
462          * @return the token pattern
463          */
464         public TokenPattern getPattern() {
465             return pattern;
466         }
467 
468         /**
469          * Returns the latest matched token pattern.
470          *
471          * @return the latest matched token pattern, or
472          *         null if no match found
473          */
474         public TokenPattern getMatchedPattern() {
475             if (matcher.length() <= 0) {
476                 return null;
477             } else {
478                 return pattern;
479             }
480         }
481 
482         /**
483          * Returns the length of the latest match.
484          *
485          * @return the length of the latest match, or
486          *         zero (0) if no match found
487          */
488         public int getMatchedLength() {
489             return matcher.length();
490         }
491 
492         /**
493          * Checks if the token pattern matches the input stream. This
494          * method will also reset all flags in this matcher.
495          *
496          * @return true if a match was found, or
497          *         false otherwise
498          *
499          * @throws IOException if an I/O error occurred
500          */
501         public boolean match() throws IOException   {
502             return matcher.matchFromBeginning();
503         }
504 
505         /**
506          * Returns a string representation of this token matcher.
507          *
508          * @return a detailed string representation of this matcher
509          */
510         public String   toString() {
511             return pattern.toString() + "\n" +
512                    regExp.toString() + "\n";
513         }
514     }
515 
516 
517     /**
518      * A string token pattern matcher. This class is used to match a
519      * set of strings with an input stream. This class internally uses
520      * a DFA for maximum performance. It also maintains the state of
521      * the last match.
522      */
523     private class StringTokenMatcher extends TokenMatcher {
524 
525         /**
526          * The list of string token patterns.
527          */
528         private ArrayList   patterns = new ArrayList  ();
529 
530         /**
531          * The finite automaton to use for matching.
532          */
533         private Automaton start = new Automaton();
534 
535         /**
536          * The last token pattern match found.
537          */
538         private TokenPattern match = null;
539 
540         /**
541          * Creates a new string token matcher.
542          */
543         public StringTokenMatcher() {
544         }
545 
546         /**
547          * Resets the matcher state. This will clear the results of
548          * the last match.
549          */
550         public void reset() {
551             match = null;
552         }
553 
554         /**
555          * Returns the latest matched token pattern.
556          *
557          * @return the latest matched token pattern, or
558          *         null if no match found
559          */
560         public TokenPattern getMatchedPattern() {
561             return match;
562         }
563 
564         /**
565          * Returns the length of the latest match.
566          *
567          * @return the length of the latest match, or
568          *         zero (0) if no match found
569          */
570         public int getMatchedLength() {
571             if (match == null) {
572                 return 0;
573             } else {
574                 return match.getPattern().length();
575             }
576         }
577 
578         /**
579          * Returns the token pattern with the specified id. Only
580          * token patterns handled by this matcher can be returned.
581          *
582          * @param id         the token pattern id
583          *
584          * @return the token pattern found, or
585          *         null if not found
586          */
587         public TokenPattern getPattern(int id) {
588             TokenPattern  pattern;
589 
590             for (int i = 0; i < patterns.size(); i++) {
591                 pattern = (TokenPattern) patterns.get(i);
592                 if (pattern.getId() == id) {
593                     return pattern;
594                 }
595             }
596             return null;
597         }
598 
599         /**
600          * Adds a string token pattern to this matcher.
601          *
602          * @param pattern        the pattern to add
603          */
604         public void addPattern(TokenPattern pattern) {
605             patterns.add(pattern);
606             start.addMatch(pattern.getPattern(), ignoreCase, pattern);
607         }
608 
609         /**
610          * Checks if the token pattern matches the input stream. This
611          * method will also reset all flags in this matcher.
612          *
613          * @param input           the input stream to check
614          *
615          * @return true if a match was found, or
616          *         false otherwise
617          *
618          * @throws IOException if an I/O error occurred
619          */
620         public boolean match(LookAheadReader input) throws IOException   {
621             reset();
622             match = (TokenPattern) start.matchFrom(input, 0, ignoreCase);
623             return match != null;
624         }
625 
626         /**
627          * Returns a string representation of this matcher. This will
628          * contain all the token patterns.
629          *
630          * @return a detailed string representation of this matcher
631          */
632         public String   toString() {
633             StringBuffer    buffer = new StringBuffer  ();
634 
635             for (int i = 0; i < patterns.size(); i++) {
636                 buffer.append(patterns.get(i));
637                 buffer.append("\n\n");
638             }
639             return buffer.toString();
640         }
641     }
642 }
643
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags