KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > jfun > parsec > Lexers


/*****************************************************************************
 * Copyright (C) Zephyr Business Solutions Corp. All rights reserved. *
 * ------------------------------------------------------------------------- *
 * The software in this package is published under the terms of the BSD *
 * style license a copy of which has been included with this distribution in *
 * the LICENSE.txt file. *
 *****************************************************************************/

/*
 * Created on Dec 19, 2004
 *
 * Author Ben Yu
 */

13 package jfun.parsec;
14
15 import jfun.parsec.pattern.Patterns;
16 import jfun.parsec.tokens.Tokenizers;
17
18
19 /**
20  * Provides some predefined basic lexer objects.
21  * A lexer is a character level parser that returns a token
22  * based on the recognized character range.
23  * @author Ben Yu
24  *
25  * Dec 19, 2004
26  */

27 public final class Lexers {
28   private static final Parser<Tok> _charLiteral = charLiteral("charLiteral");
29   /**
30    * returns the lexer that's gonna parse single quoted character literal (escaped by '\'),
31    * and then converts the character to a Character.
32    * @return the lexer.
33    */

34   public static Parser<Tok> charLiteral(){
35     return _charLiteral;
36   }
37   /**
38    * returns the lexer that's gonna parse single quoted character literal (escaped by '\'),
39    * and then converts the character to a Character.
40    * @param name the lexer name.
41    * @return the lexer.
42    */

43   public static Parser<Tok> charLiteral(final String JavaDoc name){
44     return Lexers.lexer(name,
45         Scanners.isQuotedChar(),
46           Tokenizers.forChar());
47   }
48   private static final Parser<Tok> _stringLiteral = stringLiteral("stringLiteral");
49   /**
50    * returns the lexer that's gonna parse double quoted string literal (escaped by '\'),
51    * and convert the string to a String token.
52    * @return the lexer.
53    * @deprecated Use {@link #lexSimpleStringLiteral()}
54    */

55   public static Parser<Tok> stringLiteral(){
56     return lexSimpleStringLiteral();
57   }
58   /**
59    * returns the lexer that's gonna parse double quoted string literal (escaped by '\'),
60    * and convert the string to a String token.
61    * @return the lexer.
62    */

63   public static Parser<Tok> lexSimpleStringLiteral(){
64     return _stringLiteral;
65   }
66   /**
67    * returns the lexer that's gonna parse double quoted string literal (escaped by '\'),
68    * and convert the string to a String token.
69    * @param name the lexer name.
70    * @return the lexer.
71    * @deprecated Use {@link #lexSimpleStringLiteral(String)}
72    */

73   public static Parser<Tok> stringLiteral(final String JavaDoc name){
74     return lexSimpleStringLiteral(name);
75   }
76   /**
77    * returns the lexer that's gonna parse double quoted string literal (escaped by '\'),
78    * and convert the string to a String token.
79    * @param name the lexer name.
80    * @return the lexer.
81    */

82   public static Parser<Tok> lexSimpleStringLiteral(final String JavaDoc name){
83     return Lexers.lexer(name,
84         Scanners.isQuotedString(),
85         Tokenizers.forSimpleStringLiteral()
86     );
87   }
88   private static final Parser<Tok> _sqlStringLiteral = sqlStringLiteral("string quoted by '");
89   /**
90    * returns the lexer that's gonna parse single quoted string literal (single quote is escaped with another single quote),
91    * and convert the string to a String token.
92    * @return the lexer.
93    */

94   public static Parser<Tok> sqlStringLiteral(){
95     return _sqlStringLiteral;
96   }
97   /**
98    * returns the lexer that's gonna parse single quoted string literal (single quote is escaped with another single quote),
99    * and convert the string to a String token.
100    * @param name the lexer name.
101    * @return the lexer.
102    */

103   public static Parser<Tok> sqlStringLiteral(final String JavaDoc name){
104     return Lexers.lexer(name,
105         Scanners.isSqlString(),
106         Tokenizers.forSqlStringLiteral()
107     );
108   }
109   private static final Parser<Tok> _decimal = decimal("decimal");
110   /**
111    * returns the lexer that's gonna parse a decimal number (valid patterns are: 1, 2.3, 000, 0., .23),
112    * and convert the string to a decimal typed token.
113    * @return the lexer.
114    */

115   public static Parser<Tok> decimal(){
116     return _decimal;
117   }
118   /**
119    * returns the lexer that's gonna parse a decimal number (valid patterns are: 1, 2.3, 000, 0., .23),
120    * and convert the string to a decimal typed token.
121    * @param name the lexer name.
122    * @return the lexer.
123    */

124   public static Parser<Tok> decimal(final String JavaDoc name){
125     return Lexers.lexer(name,
126         Scanners.delimited(Scanners.isPattern(Patterns.isDecimal(), "decimal number")),
127         Tokenizers.forDecimal()
128     );
129   }
130   private static final Parser<Tok> _integer = integer("integer");
131   /**
132    * returns the lexer that's gonna parse a integer number (valid patterns are: 0, 00, 1, 10),
133    * and convert the string to an integer typed token.
134    * The difference between integer() and decInteger() is that decInteger does not allow a number starting with 0.
135    * @return the lexer.
136    */

137   public static Parser<Tok> integer(){
138     return _integer;
139   }
140   /**
141    * returns the lexer that's gonna parse a integer number (valid patterns are: 0, 00, 1, 10),
142    * and convert the string to an integer typed token.
143    * The difference between integer() and decInteger() is that decInteger does not allow a number starting with 0.
144    * @param name the lexer name.
145    * @return the lexer.
146    */

147   public static Parser<Tok> integer(final String JavaDoc name){
148     return Lexers.lexer(name, Scanners.delimited(
149         Scanners.isPattern(Patterns.isInteger(), "integer")),
150         Tokenizers.forInteger());
151   }
152   /**
153    * returns the lexer that's gonna parse a decimal integer number (valid patterns are: 1, 10, 123),
154    * and convert the string to a Long token.
155    * The difference between integer() and decInteger() is that decInteger does not allow a number starting with 0.
156    * @return the lexer.
157    * @deprecated Use {@link #lexDecLong()}.
158    */

159   
160   public static Parser<Tok> decInteger(){
161     return lexDecLong();
162   }
163   /**
164    * returns the lexer that's gonna parse a decimal integer number (valid patterns are: 1, 10, 123),
165    * and convert the string to a Long token.
166    * The difference between integer() and decInteger() is that decInteger does not allow a number starting with 0.
167    * @param name the lexer name.
168    * @return the lexer.
169    * @deprecated Use {@link #lexDecLong(String)}.
170    */

171   public static Parser<Tok> decInteger(final String JavaDoc name){
172     return lexDecLong(name);
173   }
174   
175   /**
176    * returns the lexer that's gonna parse a octal integer number (valid patterns are: 0, 07, 017, 0371 etc.),
177    * and convert the string to a Long token.
178    * an octal number has to start with 0.
179    * @return the lexer.
180    * @deprecated Use {@link #lexOctLong()}.
181    */

182   public static Parser<Tok> octInteger(){
183     return lexOctLong();
184   }
185   /**
186    * returns the lexer that's gonna parse a octal integer number (valid patterns are: 0, 07, 017, 0371 etc.),
187    * and convert the string to a Long token.
188    * an octal number has to start with 0.
189    * @param name the lexer name.
190    * @return the lexer.
191    * @deprecated Use {@link #lexOctLong(String)}.
192    */

193   public static Parser<Tok> octInteger(final String JavaDoc name){
194     return lexOctLong(name);
195   }
196   /**
197    * returns the lexer that's gonna parse a hex integer number (valid patterns are: 0x1, 0Xff, 0xFe1 etc.),
198    * and convert the string to a Long token.
199    * an hex number has to start with either 0x or 0X.
200    * @return the lexer.
201    * @deprecated Use {@link #lexHexLong()}.
202    */

203   public static Parser<Tok> hexInteger(){
204     return lexHexLong();
205   }
206   /**
207    * returns the lexer that's gonna parse a hex integer number (valid patterns are: 0x1, 0Xff, 0xFe1 etc.),
208    * and convert the string to a Long token.
209    * an hex number has to start with either 0x or 0X.
210    * @param name the lexer name.
211    * @return the lexer.
212    * @deprecated Use {@link #lexHexLong(String)}.
213    */

214   public static Parser<Tok> hexInteger(final String JavaDoc name){
215     return lexHexLong(name);
216   }
217   /**
218    * returns the lexer that's gonna parse decimal, hex, and octal numbers
219    * and convert the string to a Long token.
220    * @return the lexer.
221    * @deprecated Use {@link #lexLong()}.
222    */

223   public static Parser<Tok> allInteger(){
224     return lexLong();
225   }
226   /**
227    * returns the lexer that's gonna parse decimal, hex, and octal numbers
228    * and convert the string to a Long token.
229    * @param name the lexer name.
230    * @return the lexer.
231    * @deprecated Use {@link #lexLong(String)}.
232    */

233   public static Parser<Tok> allInteger(final String JavaDoc name){
234     return lexLong(name);
235   }
236   
237   private static final Parser<Tok> _decLong = lexDecLong("decLong");
238   /**
239    * returns the lexer that's gonna parse a decimal integer number (valid patterns are: 1, 10, 123),
240    * and convert the string to a Long token.
241    * The difference between integer() and decInteger() is that decInteger does not allow a number starting with 0.
242    * @return the lexer.
243    */

244   public static Parser<Tok> lexDecLong(){
245     return _decLong;
246   }
247   /**
248    * returns the lexer that's gonna parse a decimal integer number (valid patterns are: 1, 10, 123),
249    * and convert the string to a Long token.
250    * The difference between integer() and decInteger() is that decInteger does not allow a number starting with 0.
251    * @param name the lexer name.
252    * @return the lexer.
253    */

254   public static Parser<Tok> lexDecLong(final String JavaDoc name){
255     return Lexers.lexer(name,
256         Scanners.delimited(Scanners.isPattern(Patterns.isDecInteger(),
257             "decLong")), Tokenizers.forDecLong());
258   }
259   
260   private static final Parser<Tok> _octLong = lexOctLong("octLong");
261   /**
262    * returns the lexer that's gonna parse a octal integer number (valid patterns are: 0, 07, 017, 0371 etc.),
263    * and convert the string to a Long token.
264    * an octal number has to start with 0.
265    * @return the lexer.
266    */

267   public static Parser<Tok> lexOctLong(){
268     return _octLong;
269   }
270   /**
271    * returns the lexer that's gonna parse a octal integer number (valid patterns are: 0, 07, 017, 0371 etc.),
272    * and convert the string to a Long token.
273    * an octal number has to start with 0.
274    * @param name the lexer name.
275    * @return the lexer.
276    */

277   public static Parser<Tok> lexOctLong(final String JavaDoc name){
278     return Lexers.lexer(name,
279         Scanners.delimited(Scanners.isPattern(
280             Patterns.isOctInteger(), "octLong")), Tokenizers.forOctLong());
281   }
282   private static final Parser<Tok> _hexLong = lexHexLong("hexLong");
283   /**
284    * returns the lexer that's gonna parse a hex integer number (valid patterns are: 0x1, 0Xff, 0xFe1 etc.),
285    * and convert the string to a Long token.
286    * an hex number has to start with either 0x or 0X.
287    * @return the lexer.
288    */

289   public static Parser<Tok> lexHexLong(){
290     return _hexLong;
291   }
292   /**
293    * returns the lexer that's gonna parse a hex integer number (valid patterns are: 0x1, 0Xff, 0xFe1 etc.),
294    * and convert the string to a Long token.
295    * an hex number has to start with either 0x or 0X.
296    * @param name the lexer name.
297    * @return the lexer.
298    */

299   public static Parser<Tok> lexHexLong(final String JavaDoc name){
300     return Lexers.lexer(name,
301         Scanners.delimited(Scanners.isPattern(Patterns.isHexInteger(), "hexLong"))
302         , Tokenizers.forHexLong());
303   }
304   private static final Parser<Tok> _allLong = lexLong("allLong");
305   /**
306    * returns the lexer that's gonna parse decimal, hex, and octal numbers
307    * and convert the string to a Long token.
308    * @return the lexer.
309    */

310   public static Parser<Tok> lexLong(){
311     return _allLong;
312   }
313   /**
314    * returns the lexer that's gonna parse decimal, hex, and octal numbers
315    * and convert the string to a Long token.
316    * @param name the lexer name.
317    * @return the lexer.
318    */

319   public static Parser<Tok> lexLong(final String JavaDoc name){
320     return Parsers.plus(name, lexHexLong(), lexDecLong(), lexOctLong());
321   }
322   private static final Parser<Tok> _word = word("word");
323   /**
324    * returns the lexer that's gonna parse any word.
325    * and convert the string to a TokenWord.
326    * A word starts with an alphametic character, followed by 0 or more alphanumeric characters.
327    * @return the lexer.
328    */

329   public static Parser<Tok> word(){
330     return _word;
331   }
332   /**
333    * returns the lexer that's gonna parse any word.
334    * and convert the string to a TokenWord.
335    * A word starts with an alphametic character, followed by 0 or more alphanumeric characters.
336    * @param name the lexer name.
337    * @return the lexer.
338    */

339   public static Parser<Tok> word(final String JavaDoc name){
340     return Lexers.lexer(name, Scanners.delimited(
341         Scanners.isPattern(Patterns.isWord(), "word")),
342         Tokenizers.forWord());
343   }
344   /**
345    * Create a lexer that parsers a string literal quoted by open and close,
346    * and then converts it to a TokenQuoted token instance.
347    * @param name the lexer name.
348    * @param open the opening character.
349    * @param close the closing character.
350    * @return the lexer.
351    */

352   public static Parser<Tok> quoted(final String JavaDoc name, final char open, final char close){
353     return Lexers.lexer(name, Scanners.quoted(name, open, close),
354         Tokenizers.forQuotedString(open, close));
355   }
356   /**
357    * Create a lexer that parsers a string literal quoted by open and close,
358    * and then converts it to a TokenQuoted token instance.
359    * @param open the opening character.
360    * @param close the closing character.
361    * @return the lexer.
362    */

363   public static Parser<Tok> quoted(final char open, final char close){
364     return quoted("quoted", open, close);
365   }
366
367   /**
368    * Creates a Words object for lexing the operators with names specified in ops.
369    * Operators are lexed as TokenReserved.
370    * @param ops the operator names.
371    * @return the Words instance.
372    */

373   public static Words getOperators(final String JavaDoc... ops){
374     return Words.getOperators(ops);
375   }
376   /**
377    * Creates a Words object for lexing the operators with names specified in ops,
378    * and for lexing the keywords case insensitively.
379    * Keywords and operators are lexed as TokenReserved.
380    * Words that are not among the keywords are lexed as TokenWord.
381    * A word is defined as an alpha numeric string that starts with [_a-zA-Z],
382    * with 0 or more [0-9_a-zA-Z] following.
383    * @param ops the operator names.
384    * @param keywords the keyword names.
385    * @return the Words instance.
386    */

387   public static Words getCaseInsensitive(
388       final String JavaDoc[] ops, final String JavaDoc[] keywords){
389     return Words.getCaseInsensitive(ops, keywords);
390   }
391
392   /**
393    * Creates a Words object for lexing the operators with names specified in ops,
394    * and for lexing the keywords case sensitively.
395    * Keywords and operators are lexed as TokenReserved.
396    * Words that are not among the keywords are lexed as TokenWord.
397    * A word is defined as an alpha numeric string that starts with [_a-zA-Z],
398    * with 0 or more [0-9_a-zA-Z] following.
399    * @param ops the operator names.
400    * @param keywords the keyword names.
401    * @return the Words instance.
402    */

403   public static Words getCaseSensitive(
404       final String JavaDoc[] ops, final String JavaDoc[] keywords){
405     return Words.getCaseSensitive(ops, keywords);
406   }
407   /**
408    * Creates a Words object for lexing the operators with names specified in ops,
409    * and for lexing the keywords case insensitively.
410    * Keywords and operators are lexed as TokenReserved.
411    * Words that are not among the keywords are lexed as TokenWord.
412    * @param wscanner the scanner for a word in the language.
413    * @param ops the operator names.
414    * @param keywords the keyword names.
415    * @return the Words instance.
416    */

417   public static Words getCaseInsensitive(final Parser<?> wscanner,
418       final String JavaDoc[] ops, final String JavaDoc[] keywords){
419     return Words.getCaseInsensitive(wscanner, ops, keywords);
420   }
421   /**
422    * Creates a Words object for lexing the operators with names specified in ops,
423    * and for lexing the keywords case sensitively.
424    * Keywords and operators are lexed as TokenReserved.
425    * Words that are not among the keywords are lexed as TokenWord.
426    * @param wscanner the scanner for a word in the language.
427    * @param ops the operator names.
428    * @param keywords the keyword names.
429    * @return the Words instance.
430    */

431   public static Words getCaseSensitive(final Parser<?> wscanner,
432       final String JavaDoc[] ops, final String JavaDoc[] keywords){
433     return Words.getCaseSensitive(wscanner, ops, keywords);
434   }
435   /**
436    * Creates a Words object for lexing the operators with names specified in ops,
437    * and for lexing the keywords case insensitively.
438    * Keywords and operators are lexed as TokenReserved.
439    * Words that are not among the keywords are lexed as TokenWord.
440    * @param wscanner the scanner for a word in the language.
441    * @param ops the operator names.
442    * @param keywords the keyword names.
443    * @param toWord the FromString object used to create a token for non-key words recognized by wscanner.
444    * @return the Words instance.
445    */

446   public static Words getCaseInsensitive(final Parser<?> wscanner,
447       final String JavaDoc[] ops, final String JavaDoc[] keywords, FromString<?> toWord){
448     return Words.getCaseInsensitive(wscanner, ops, keywords, toWord);
449   }
450   /**
451    * Creates a Words object for lexing the operators with names specified in ops,
452    * and for lexing the keywords case sensitively.
453    * Keywords and operators are lexed as TokenReserved.
454    * Words that are not among the keywords are lexed as TokenWord.
455    * @param wscanner the scanner for a word in the language.
456    * @param ops the operator names.
457    * @param keywords the keyword names.
458    * @param toWord the FromString object used to create a token for non-key words recognized by wscanner.
459    * @return the Words instance.
460    */

461   public static Words getCaseSensitive(final Parser<?> wscanner,
462       final String JavaDoc[] ops, final String JavaDoc[] keywords, FromString<?> toWord){
463     return Words.getCaseSensitive(wscanner, ops, keywords, toWord);
464   }
465   /**
466    * Transform the recognized character range of scanner s to a token object
467    * with a Tokenizer.
468    * If the Tokenizer.toToken() returns null, scan fails.
469    * @param name the name of the new Scanner.
470    * @param tn the Tokenizer object.
471    * @param s the scanner to transform.
472    * @return the new Scanner.
473    */

474   public static Parser<Tok> lexer(final String JavaDoc name, final Parser<?> s, final Tokenizer tn){
475     return lexer(name, s, tn, "lexer error");
476   }
477   /**
478    * Transform the recognized character range of scanner s to a token object
479    * with a Tokenizer.
480    * If the Tokenizer.toToken() returns null, scan fails.
481    * @param s the scanner to transform.
482    * @param tn the Tokenizer object.
483    * @return the new Scanner.
484    */

485   public static Parser<Tok> lexer(final Parser<?> s, final Tokenizer tn){
486     return lexer("lexer", s, tn);
487   }
488   /**
489    * Transform the recognized character range of scanner s to a token object
490    * with a Tokenizer.
491    * If the Tokenizer.toToken() returns null, scan fails.
492    * @param s the scanner to transform.
493    * @param tn the Tokenizer object.
494    * @param err the error message when the tokenizer returns null.
495    * @return the new Scanner.
496    */

497   public static Parser<Tok> lexer(final Parser<?> s, final Tokenizer tn, final String JavaDoc err){
498     return lexer("lexer", s, tn, err);
499   }
500   /**
501    * Transform the recognized character range of scanner s to a token object
502    * with a Tokenizer.
503    * If the Tokenizer.toToken() returns null, scan fails.
504    * @param name the name of the new Scanner.
505    * @param s the scanner to transform.
506    * @param tn the Tokenizer object.
507    * @param err the error message when the tokenizer returns null.
508    * @return the new Scanner.
509    */

510   public static Parser<Tok> lexer(final String JavaDoc name, final Parser<?> s, final Tokenizer tn,
511       final String JavaDoc err){
512     return new Parser<Tok>(name){
513       boolean apply(final ParseContext ctxt){
514         final int ind = ctxt.getIndex();
515         final int from = ctxt.getAt();
516         final Object JavaDoc ret = ctxt.getReturn();
517         final int at = ctxt.getAt();
518         final int step = ctxt.getStep();
519         final Object JavaDoc ustate = ctxt.getUserState();
520         //final AbstractParsecError error = ctxt.getError();
521
if(!s.parse(ctxt)) return false;
522         final int len = ctxt.getAt() - from;
523         final Object JavaDoc tok = tn.toToken(ctxt.getSource(), from, len);
524         if(tok == null){
525           ctxt.set(step, at, ret, ustate, ParsecError.raiseExpecting(ind, err));
526           return false;
527         }
528         final Tok ptok = new Tok(at, len, tok);
529         ctxt.setStep(step+1);
530         ctxt.setReturn(ptok);
531         return true;
532       }
533     };
534   }
535   /**
536    * Greedily runs Parser s repeatedly,
537    * and ignores the pattern recognized by Parser delim before and after each s.
538    * Parser s has to be a lexer object that returns a Tok object.
539    * The result Tok objects are collected and returned in a Tok[] array.
540    * @param name the name of the new Parser object.
541    * @param delim the delimiter Parser object.
542    * @param s the Parser object.
543    * @return the new Parser object.
544    */

545   public static Parser<Tok[]> lexeme(final String JavaDoc name,
546       final Parser<?> delim, final Parser<Tok> s){
547     return delim.optional().seq(name, Parsers.sepEndBy(name, Tok.class, delim, s));
548   }
549   /**
550    * Greedily runs Parser s repeatedly,
551    * and ignores the pattern recognized by Parser delim before and after each s.
552    * Parser s has to be a lexer object that returns a Tok object.
553    * The result Tok objects are collected and returned in a Tok[] array.
554    * @param delim the delimiter Parser object.
555    * @param s the Parser object.
556    * @return the new Parser object.
557    */

558   public static Parser<Tok[]> lexeme(final Parser<?> delim, final Parser<Tok> s){
559     return lexeme("lexeme", delim, s);
560   }
561 }
562
Popular Tags