Lexers


1   /*****************************************************************************
2    * Copyright (C) Zephyr Business Solutions Corp. All rights reserved.            *
3    * ------------------------------------------------------------------------- *
4    * The software in this package is published under the terms of the BSD      *
5    * style license a copy of which has been included with this distribution in *
6    * the LICENSE.txt file.                                                     *
7    *****************************************************************************/
8   /*
9    * Created on Dec 19, 2004
10   *
11   * Author Ben Yu
12   */
13  package jfun.parsec;
14  
15  import jfun.parsec.pattern.Patterns;
16  import jfun.parsec.tokens.Tokenizers;
17  
18  
19  /**
20   * Provides some predefined basic lexer objects.
21   * A lexer is a character level parser that returns a token
22   * based on the recognized character range.
23   * @author Ben Yu
24   *
25   * Dec 19, 2004
26   */
27  public final class Lexers {
28    private static final Parser<Tok> _charLiteral = charLiteral("charLiteral");
29    /**
30     * returns the lexer that's gonna parse single quoted character literal (escaped by '\'),
31     * and then converts the character to a Character.
32     * @return the lexer.
33     */
34    public static Parser<Tok> charLiteral(){
35      return _charLiteral;
36    }
37    /**
38     * returns the lexer that's gonna parse single quoted character literal (escaped by '\'),
39     * and then converts the character to a Character.
40     * @param name the lexer name.
41     * @return the lexer.
42     */
43    public static Parser<Tok> charLiteral(final String   name){
44      return Lexers.lexer(name, 
45          Scanners.isQuotedChar(),
46            Tokenizers.forChar());
47    }
48    private static final Parser<Tok> _stringLiteral = stringLiteral("stringLiteral");
49    /**
50     * returns the lexer that's gonna parse double quoted string literal (escaped by '\'),
51     * and convert the string to a String token.
52     * @return the lexer.
53     * @deprecated Use {@link #lexSimpleStringLiteral()}
54     */
55    public static Parser<Tok> stringLiteral(){
56      return lexSimpleStringLiteral();
57    }
58    /**
59     * returns the lexer that's gonna parse double quoted string literal (escaped by '\'),
60     * and convert the string to a String token.
61     * @return the lexer.
62     */
63    public static Parser<Tok> lexSimpleStringLiteral(){
64      return _stringLiteral;
65    }
66    /**
67     * returns the lexer that's gonna parse double quoted string literal (escaped by '\'),
68     * and convert the string to a String token.
69     * @param name the lexer name.
70     * @return the lexer.
71     * @deprecated Use {@link #lexSimpleStringLiteral(String)}
72     */
73    public static Parser<Tok> stringLiteral(final String   name){
74      return lexSimpleStringLiteral(name);
75    }
76    /**
77     * returns the lexer that's gonna parse double quoted string literal (escaped by '\'),
78     * and convert the string to a String token.
79     * @param name the lexer name.
80     * @return the lexer.
81     */
82    public static Parser<Tok> lexSimpleStringLiteral(final String   name){
83      return Lexers.lexer(name,
84          Scanners.isQuotedString(),
85          Tokenizers.forSimpleStringLiteral()
86      );
87    }
88    private static final Parser<Tok> _sqlStringLiteral = sqlStringLiteral("string quoted by '");
89    /**
90     * returns the lexer that's gonna parse single quoted string literal (single quote is escaped with another single quote),
91     * and convert the string to a String token.
92     * @return the lexer.
93     */
94    public static Parser<Tok> sqlStringLiteral(){
95      return _sqlStringLiteral;
96    }
97    /**
98     * returns the lexer that's gonna parse single quoted string literal (single quote is escaped with another single quote),
99     * and convert the string to a String token.
100    * @param name the lexer name.
101    * @return the lexer.
102    */
103   public static Parser<Tok> sqlStringLiteral(final String   name){
104     return Lexers.lexer(name,
105         Scanners.isSqlString(),
106         Tokenizers.forSqlStringLiteral()
107     );
108   }
109   private static final Parser<Tok> _decimal = decimal("decimal");
110   /**
111    * returns the lexer that's gonna parse a decimal number (valid patterns are: 1, 2.3, 000, 0., .23),
112    * and convert the string to a decimal typed token.
113    * @return the lexer.
114    */
115   public static Parser<Tok> decimal(){
116     return _decimal;
117   }
118   /**
119    * returns the lexer that's gonna parse a decimal number (valid patterns are: 1, 2.3, 000, 0., .23),
120    * and convert the string to a decimal typed token.
121    * @param name the lexer name.
122    * @return the lexer.
123    */
124   public static Parser<Tok> decimal(final String   name){
125     return Lexers.lexer(name,
126         Scanners.delimited(Scanners.isPattern(Patterns.isDecimal(), "decimal number")),
127         Tokenizers.forDecimal()
128     );
129   }
130   private static final Parser<Tok> _integer = integer("integer");
131   /**
132    * returns the lexer that's gonna parse a integer number (valid patterns are: 0, 00, 1, 10),
133    * and convert the string to an integer typed token.
134    * The difference between integer() and decInteger() is that decInteger does not allow a number starting with 0.
135    * @return the lexer.
136    */
137   public static Parser<Tok> integer(){
138     return _integer;
139   }
140   /**
141    * returns the lexer that's gonna parse a integer number (valid patterns are: 0, 00, 1, 10),
142    * and convert the string to an integer typed token.
143    * The difference between integer() and decInteger() is that decInteger does not allow a number starting with 0.
144    * @param name the lexer name.
145    * @return the lexer.
146    */
147   public static Parser<Tok> integer(final String   name){
148     return Lexers.lexer(name, Scanners.delimited(
149         Scanners.isPattern(Patterns.isInteger(), "integer")),
150         Tokenizers.forInteger());
151   }
152   /**
153    * returns the lexer that's gonna parse a decimal integer number (valid patterns are: 1, 10, 123),
154    * and convert the string to a Long token.
155    * The difference between integer() and decInteger() is that decInteger does not allow a number starting with 0.
156    * @return the lexer.
157    * @deprecated Use {@link #lexDecLong()}.
158    */
159   
160   public static Parser<Tok> decInteger(){
161     return lexDecLong();
162   }
163   /**
164    * returns the lexer that's gonna parse a decimal integer number (valid patterns are: 1, 10, 123),
165    * and convert the string to a Long token.
166    * The difference between integer() and decInteger() is that decInteger does not allow a number starting with 0.
167    * @param name the lexer name.
168    * @return the lexer.
169    * @deprecated Use {@link #lexDecLong(String)}.
170    */
171   public static Parser<Tok> decInteger(final String   name){
172     return lexDecLong(name);
173   }
174   
175   /**
176    * returns the lexer that's gonna parse a octal integer number (valid patterns are: 0, 07, 017, 0371 etc.),
177    * and convert the string to a Long token.
178    * an octal number has to start with 0.
179    * @return the lexer.
180    * @deprecated Use {@link #lexOctLong()}.
181    */
182   public static Parser<Tok> octInteger(){
183     return lexOctLong();
184   }
185   /**
186    * returns the lexer that's gonna parse a octal integer number (valid patterns are: 0, 07, 017, 0371 etc.),
187    * and convert the string to a Long token.
188    * an octal number has to start with 0.
189    * @param name the lexer name.
190    * @return the lexer.
191    * @deprecated Use {@link #lexOctLong(String)}.
192    */
193   public static Parser<Tok> octInteger(final String   name){
194     return lexOctLong(name);
195   }
196   /**
197    * returns the lexer that's gonna parse a hex integer number (valid patterns are: 0x1, 0Xff, 0xFe1 etc.),
198    * and convert the string to a Long token.
199    * an hex number has to start with either 0x or 0X.
200    * @return the lexer.
201    * @deprecated Use {@link #lexHexLong()}.
202    */
203   public static Parser<Tok> hexInteger(){
204     return lexHexLong();
205   }
206   /**
207    * returns the lexer that's gonna parse a hex integer number (valid patterns are: 0x1, 0Xff, 0xFe1 etc.),
208    * and convert the string to a Long token.
209    * an hex number has to start with either 0x or 0X.
210    * @param name the lexer name.
211    * @return the lexer.
212    * @deprecated Use {@link #lexHexLong(String)}.
213    */
214   public static Parser<Tok> hexInteger(final String   name){
215     return lexHexLong(name);
216   }
217   /**
218    * returns the lexer that's gonna parse decimal, hex, and octal numbers
219    * and convert the string to a Long token.
220    * @return the lexer.
221    * @deprecated Use {@link #lexLong()}.
222    */
223   public static Parser<Tok> allInteger(){
224     return lexLong();
225   }
226   /**
227    * returns the lexer that's gonna parse decimal, hex, and octal numbers
228    * and convert the string to a Long token.
229    * @param name the lexer name.
230    * @return the lexer.
231    * @deprecated Use {@link #lexLong(String)}.
232    */
233   public static Parser<Tok> allInteger(final String   name){
234     return lexLong(name);
235   }
236   
237   private static final Parser<Tok> _decLong = lexDecLong("decLong");
238   /**
239    * returns the lexer that's gonna parse a decimal integer number (valid patterns are: 1, 10, 123),
240    * and convert the string to a Long token.
241    * The difference between integer() and decInteger() is that decInteger does not allow a number starting with 0.
242    * @return the lexer.
243    */
244   public static Parser<Tok> lexDecLong(){
245     return _decLong;
246   }
247   /**
248    * returns the lexer that's gonna parse a decimal integer number (valid patterns are: 1, 10, 123),
249    * and convert the string to a Long token.
250    * The difference between integer() and decInteger() is that decInteger does not allow a number starting with 0.
251    * @param name the lexer name.
252    * @return the lexer.
253    */
254   public static Parser<Tok> lexDecLong(final String   name){
255     return Lexers.lexer(name,
256         Scanners.delimited(Scanners.isPattern(Patterns.isDecInteger(),
257             "decLong")), Tokenizers.forDecLong());
258   }
259   
260   private static final Parser<Tok> _octLong = lexOctLong("octLong");
261   /**
262    * returns the lexer that's gonna parse a octal integer number (valid patterns are: 0, 07, 017, 0371 etc.),
263    * and convert the string to a Long token.
264    * an octal number has to start with 0.
265    * @return the lexer.
266    */
267   public static Parser<Tok> lexOctLong(){
268     return _octLong;
269   }
270   /**
271    * returns the lexer that's gonna parse a octal integer number (valid patterns are: 0, 07, 017, 0371 etc.),
272    * and convert the string to a Long token.
273    * an octal number has to start with 0.
274    * @param name the lexer name.
275    * @return the lexer.
276    */
277   public static Parser<Tok> lexOctLong(final String   name){
278     return Lexers.lexer(name,
279         Scanners.delimited(Scanners.isPattern(
280             Patterns.isOctInteger(), "octLong")), Tokenizers.forOctLong());
281   }
282   private static final Parser<Tok> _hexLong = lexHexLong("hexLong");
283   /**
284    * returns the lexer that's gonna parse a hex integer number (valid patterns are: 0x1, 0Xff, 0xFe1 etc.),
285    * and convert the string to a Long token.
286    * an hex number has to start with either 0x or 0X.
287    * @return the lexer.
288    */
289   public static Parser<Tok> lexHexLong(){
290     return _hexLong;
291   }
292   /**
293    * returns the lexer that's gonna parse a hex integer number (valid patterns are: 0x1, 0Xff, 0xFe1 etc.),
294    * and convert the string to a Long token.
295    * an hex number has to start with either 0x or 0X.
296    * @param name the lexer name.
297    * @return the lexer.
298    */
299   public static Parser<Tok> lexHexLong(final String   name){
300     return Lexers.lexer(name,
301         Scanners.delimited(Scanners.isPattern(Patterns.isHexInteger(), "hexLong"))
302         , Tokenizers.forHexLong());
303   }
304   private static final Parser<Tok> _allLong = lexLong("allLong");
305   /**
306    * returns the lexer that's gonna parse decimal, hex, and octal numbers
307    * and convert the string to a Long token.
308    * @return the lexer.
309    */
310   public static Parser<Tok> lexLong(){
311     return _allLong;
312   }
313   /**
314    * returns the lexer that's gonna parse decimal, hex, and octal numbers
315    * and convert the string to a Long token.
316    * @param name the lexer name.
317    * @return the lexer.
318    */
319   public static Parser<Tok> lexLong(final String   name){
320     return Parsers.plus(name, lexHexLong(), lexDecLong(), lexOctLong());
321   }
322   private static final Parser<Tok> _word = word("word");
323   /**
324    * returns the lexer that's gonna parse any word.
325    * and convert the string to a TokenWord.
326    * A word starts with an alphametic character, followed by 0 or more alphanumeric characters.
327    * @return the lexer.
328    */
329   public static Parser<Tok> word(){
330     return _word;
331   }
332   /**
333    * returns the lexer that's gonna parse any word.
334    * and convert the string to a TokenWord.
335    * A word starts with an alphametic character, followed by 0 or more alphanumeric characters.
336    * @param name the lexer name.
337    * @return the lexer.
338    */
339   public static Parser<Tok> word(final String   name){
340     return Lexers.lexer(name, Scanners.delimited(
341         Scanners.isPattern(Patterns.isWord(), "word")),
342         Tokenizers.forWord());
343   }
344   /**
345    * Create a lexer that parsers a string literal quoted by open and close,
346    * and then converts it to a TokenQuoted token instance.
347    * @param name the lexer name.
348    * @param open the opening character.
349    * @param close the closing character.
350    * @return the lexer.
351    */
352   public static Parser<Tok> quoted(final String   name, final char open, final char close){
353     return Lexers.lexer(name, Scanners.quoted(name, open, close), 
354         Tokenizers.forQuotedString(open, close));
355   }
356   /**
357    * Create a lexer that parsers a string literal quoted by open and close,
358    * and then converts it to a TokenQuoted token instance.
359    * @param open the opening character.
360    * @param close the closing character.
361    * @return the lexer.
362    */
363   public static Parser<Tok> quoted(final char open, final char close){
364     return quoted("quoted", open, close);
365   }
366 
367   /**
368    * Creates a Words object for lexing the operators with names specified in ops.
369    * Operators are lexed as TokenReserved.
370    * @param ops the operator names.
371    * @return the Words instance.
372    */
373   public static Words getOperators(final String  ... ops){
374     return Words.getOperators(ops);
375   }
376   /**
377    * Creates a Words object for lexing the operators with names specified in ops,
378    * and for lexing the keywords case insensitively.
379    * Keywords and operators are lexed as TokenReserved.
380    * Words that are not among the keywords are lexed as TokenWord. 
381    * A word is defined as an alpha numeric string that starts with [_a-zA-Z],
382    * with 0 or more [0-9_a-zA-Z] following. 
383    * @param ops the operator names.
384    * @param keywords the keyword names.
385    * @return the Words instance.
386    */
387   public static Words getCaseInsensitive(
388       final String  [] ops, final String  [] keywords){
389     return Words.getCaseInsensitive(ops, keywords);
390   }
391 
392   /**
393    * Creates a Words object for lexing the operators with names specified in ops,
394    * and for lexing the keywords case sensitively. 
395    * Keywords and operators are lexed as TokenReserved.
396    * Words that are not among the keywords are lexed as TokenWord. 
397    * A word is defined as an alpha numeric string that starts with [_a-zA-Z],
398    * with 0 or more [0-9_a-zA-Z] following.
399    * @param ops the operator names.
400    * @param keywords the keyword names.
401    * @return the Words instance.
402    */
403   public static Words getCaseSensitive(
404       final String  [] ops, final String  [] keywords){
405     return Words.getCaseSensitive(ops, keywords);
406   }
407   /**
408    * Creates a Words object for lexing the operators with names specified in ops,
409    * and for lexing the keywords case insensitively.
410    * Keywords and operators are lexed as TokenReserved.
411    * Words that are not among the keywords are lexed as TokenWord. 
412    * @param wscanner the scanner for a word in the language.
413    * @param ops the operator names.
414    * @param keywords the keyword names.
415    * @return the Words instance.
416    */
417   public static Words getCaseInsensitive(final Parser<?> wscanner, 
418       final String  [] ops, final String  [] keywords){
419     return Words.getCaseInsensitive(wscanner, ops, keywords);
420   }
421   /**
422    * Creates a Words object for lexing the operators with names specified in ops,
423    * and for lexing the keywords case sensitively. 
424    * Keywords and operators are lexed as TokenReserved.
425    * Words that are not among the keywords are lexed as TokenWord. 
426    * @param wscanner the scanner for a word in the language.
427    * @param ops the operator names.
428    * @param keywords the keyword names.
429    * @return the Words instance.
430    */
431   public static Words getCaseSensitive(final Parser<?> wscanner,
432       final String  [] ops, final String  [] keywords){
433     return Words.getCaseSensitive(wscanner, ops, keywords);
434   }
435   /**
436    * Creates a Words object for lexing the operators with names specified in ops,
437    * and for lexing the keywords case insensitively.
438    * Keywords and operators are lexed as TokenReserved.
439    * Words that are not among the keywords are lexed as TokenWord. 
440    * @param wscanner the scanner for a word in the language.
441    * @param ops the operator names.
442    * @param keywords the keyword names.
443    * @param toWord the FromString object used to create a token for non-key words recognized by wscanner.
444    * @return the Words instance.
445    */
446   public static Words getCaseInsensitive(final Parser<?> wscanner, 
447       final String  [] ops, final String  [] keywords, FromString<?> toWord){
448     return Words.getCaseInsensitive(wscanner, ops, keywords, toWord);
449   }
450   /**
451    * Creates a Words object for lexing the operators with names specified in ops,
452    * and for lexing the keywords case sensitively. 
453    * Keywords and operators are lexed as TokenReserved.
454    * Words that are not among the keywords are lexed as TokenWord. 
455    * @param wscanner the scanner for a word in the language.
456    * @param ops the operator names.
457    * @param keywords the keyword names.
458    * @param toWord the FromString object used to create a token for non-key words recognized by wscanner.
459    * @return the Words instance.
460    */
461   public static Words getCaseSensitive(final Parser<?> wscanner,
462       final String  [] ops, final String  [] keywords, FromString<?> toWord){
463     return Words.getCaseSensitive(wscanner, ops, keywords, toWord);
464   }
465   /**
466    * Transform the recognized character range of scanner s to a token object
467    * with a Tokenizer. 
468    * If the Tokenizer.toToken() returns null, scan fails.
469    * @param name the name of the new Scanner.
470    * @param tn the Tokenizer object.
471    * @param s the scanner to transform.
472    * @return the new Scanner.
473    */
474   public static Parser<Tok> lexer(final String   name, final Parser<?> s, final Tokenizer tn){
475     return lexer(name, s, tn, "lexer error");
476   }
477   /**
478    * Transform the recognized character range of scanner s to a token object
479    * with a Tokenizer. 
480    * If the Tokenizer.toToken() returns null, scan fails.
481    * @param s the scanner to transform.
482    * @param tn the Tokenizer object.
483    * @return the new Scanner.
484    */
485   public static Parser<Tok> lexer(final Parser<?> s, final Tokenizer tn){
486     return lexer("lexer", s, tn);
487   }
488   /**
489    * Transform the recognized character range of scanner s to a token object
490    * with a Tokenizer. 
491    * If the Tokenizer.toToken() returns null, scan fails.
492    * @param s the scanner to transform.
493    * @param tn the Tokenizer object.
494    * @param err the error message when the tokenizer returns null.
495    * @return the new Scanner.
496    */
497   public static Parser<Tok> lexer(final Parser<?> s, final Tokenizer tn, final String   err){
498     return lexer("lexer", s, tn, err);
499   }
500   /**
501    * Transform the recognized character range of scanner s to a token object
502    * with a Tokenizer. 
503    * If the Tokenizer.toToken() returns null, scan fails.
504    * @param name the name of the new Scanner.
505    * @param s the scanner to transform.
506    * @param tn the Tokenizer object.
507    * @param err the error message when the tokenizer returns null.
508    * @return the new Scanner.
509    */
510   public static Parser<Tok> lexer(final String   name, final Parser<?> s, final Tokenizer tn, 
511       final String   err){
512     return new Parser<Tok>(name){
513       boolean apply(final ParseContext ctxt){
514         final int ind = ctxt.getIndex();
515         final int from = ctxt.getAt();
516         final Object   ret = ctxt.getReturn();
517         final int at = ctxt.getAt();
518         final int step = ctxt.getStep();
519         final Object   ustate = ctxt.getUserState();
520         //final AbstractParsecError error = ctxt.getError();
521         if(!s.parse(ctxt)) return false;
522         final int len = ctxt.getAt() - from;
523         final Object   tok = tn.toToken(ctxt.getSource(), from, len);
524         if(tok == null){
525           ctxt.set(step, at, ret, ustate, ParsecError.raiseExpecting(ind, err));
526           return false;
527         }
528         final Tok ptok = new Tok(at, len, tok);
529         ctxt.setStep(step+1);
530         ctxt.setReturn(ptok);
531         return true;
532       }
533     };
534   }
535   /**
536    * Greedily runs Parser s repeatedly,
537    * and ignores the pattern recognized by Parser delim before and after each s.
538    * Parser s has to be a lexer object that returns a Tok object.
539    * The result Tok objects are collected and returned in a Tok[] array.
540    * @param name the name of the new Parser object.
541    * @param delim the delimiter Parser object.
542    * @param s the Parser object.
543    * @return the new Parser object.
544    */
545   public static Parser<Tok[]> lexeme(final String   name, 
546       final Parser<?> delim, final Parser<Tok> s){
547     return delim.optional().seq(name, Parsers.sepEndBy(name, Tok.class, delim, s));
548   }
549   /**
550    * Greedily runs Parser s repeatedly,
551    * and ignores the pattern recognized by Parser delim before and after each s.
552    * Parser s has to be a lexer object that returns a Tok object.
553    * The result Tok objects are collected and returned in a Tok[] array.
554    * @param delim the delimiter Parser object.
555    * @param s the Parser object.
556    * @return the new Parser object.
557    */
558   public static Parser<Tok[]> lexeme(final Parser<?> delim, final Parser<Tok> s){
559     return lexeme("lexeme", delim, s);
560   }
561 }
562
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags