KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > netbeans > modules > ruby > lexer > RubyLexer


1 /*
2  * The contents of this file are subject to the terms of the Common Development
3  * and Distribution License (the License). You may not use this file except in
4  * compliance with the License.
5  *
6  * You can obtain a copy of the License at http://www.netbeans.org/cddl.html
7  * or http://www.netbeans.org/cddl.txt.
8  *
9  * When distributing Covered Code, include this CDDL Header Notice in each file
10  * and include the License file at http://www.netbeans.org/cddl.txt.
11  * If applicable, add the following below the CDDL Header, with the fields
12  * enclosed by brackets [] replaced by your own identifying information:
13  * "Portions Copyrighted [year] [name of copyright owner]"
14  *
15  * The Original Software is NetBeans. The Initial Developer of the Original
16  * Software is Sun Microsystems, Inc. Portions Copyright 1997-2006 Sun
17  * Microsystems, Inc. All Rights Reserved.
18  */

19 package org.netbeans.modules.ruby.lexer;
20
21 import java.io.IOException JavaDoc;
22 import java.io.Reader JavaDoc;
23
24 import org.jruby.common.NullWarnings;
25 import org.jruby.lexer.yacc.LexState;
26 import org.jruby.lexer.yacc.LexerSource;
27 import org.jruby.lexer.yacc.LexerSource;
28 import org.jruby.lexer.yacc.RubyYaccLexer;
29 import org.jruby.lexer.yacc.RubyYaccLexer;
30 import org.jruby.lexer.yacc.StrTerm;
31 import org.jruby.lexer.yacc.SyntaxException;
32 import org.jruby.parser.Tokens;
33 import org.netbeans.api.gsf.GsfTokenId;
34 import org.netbeans.api.lexer.Token;
35 import org.netbeans.modules.ruby.lexer.RubyTokenId;
36 import org.netbeans.spi.lexer.Lexer;
37 import org.netbeans.spi.lexer.LexerInput;
38 import org.netbeans.spi.lexer.LexerRestartInfo;
39 import org.netbeans.spi.lexer.TokenFactory;
40 import org.openide.ErrorManager;
41
42
43 /**
44  * A scanner for Ruby, which directly uses the JRuby lexer and translates
45  * from JRuby tokens to NetBeans lexer-based tokens
46  *
47  * @todo Should I generate a single large token for composite token types
48  * like strings and regular expressions? Today, I go to a lot of trouble
49  * to do state saving between the opening quote, middle literal string,
50  * and ending quote, for strings. (Ditto for regular expressions, single
51  * quoted strings, other forms of quoted strings, etc.).
52  * This allows me to have for example the / /'s in regular expressions
53  * stay black, and have only the embedded portion be green. But is that
54  * really necessary? If I instead were to eat up a whole String combination
55  * into a single token and return it as a single token, that would ensure
56  * that for incremental parsing, I always get called on a token boundary
57  * where I don't need the extra state saving. All I would need would
58  * be the lexer state (if different than the default state) which is a small
59  * integer (which gets compressed to single bytes by the lexer infrastructure.)
60  *
61  * @author Tor Norbye
62  */

63 public final class RubyLexer implements Lexer<GsfTokenId> {
64     /** This is still not working; I wonder if release() is called correctly at all times...*/
       // When true, release() parks the released instance in `cached` for later reuse.
65     private static final boolean REUSE_LEXERS = false;
       // Instance stored by release(); create() uses it instead of constructing a new lexer when non-null.
66     private static RubyLexer cached;
       // JRuby lexer that performs the actual scanning (see nextToken()).
67     private RubyYaccLexer lexer;
       // JRuby source wrapping the current LexerInput; rebuilt on every restart().
68     private LexerSource lexerSource;
       // Set when a tREGEXP_BEG token is seen, cleared on tREGEXP_END.
69     private boolean inRegexp;
       // Character input taken from the LexerRestartInfo in restart().
70     private LexerInput input;
       // Token factory taken from the LexerRestartInfo in restart().
71     private TokenFactory<GsfTokenId> tokenFactory;
       // Set at tSTRING_BEG/tXSTRING_BEG from StrTerm.isSubstituting(); selects the QUOTED_* string ids.
72     private boolean substituting;
       // True when the previous token produced by nextToken() was tSYMBEG.
73     private boolean inSymbol;
       // Set on tSTRING_DVAR/tSTRING_DBEG; cleared by the next tSTRING_CONTENT, which is then reported as EMBEDDED_RUBY.
74     private boolean inEmbedded;
75
76     private RubyLexer(LexerRestartInfo<GsfTokenId> info) {
77         lexer = new RubyYaccLexer();
78         // XXX Do something at scan time about illegal characters?
79
lexer.setWarnings(new NullWarnings());
80         lexer.setPreserveSpaces(true);
81     }
82
83     public static synchronized RubyLexer create(LexerRestartInfo<GsfTokenId> info) {
84         RubyLexer rubyLexer = cached;
85
86         if (rubyLexer == null) {
87             rubyLexer = new RubyLexer(info);
88         }
89
90         rubyLexer.restart(info);
91
92         return rubyLexer;
93     }
94
95     void restart(LexerRestartInfo<GsfTokenId> info) {
96         inRegexp = substituting = inSymbol = inEmbedded = false;
97         lexer.reset();
98
99         input = info.input();
100         tokenFactory = info.tokenFactory();
101
102         String JavaDoc fileName = "unknown";
103         Reader JavaDoc lexerReader = new LexerInputReader(input);
104         lexerSource = new LexerSource(fileName, lexerReader);
105         lexer.setSource(lexerSource);
106
107         Object JavaDoc state = info.state();
108
109         if (state instanceof JRubyLexerRestartInfo) {
110             ((JRubyLexerRestartInfo)state).initializeState(this);
111         } else if (state instanceof Integer JavaDoc) {
112             int stateValue = ((Integer JavaDoc)state).intValue();
113             lexer.setState(LexState.fromOrdinal(stateValue));
114         }
115     }
116
117     public void release() {
118         if (REUSE_LEXERS) {
119             // Possibly reset the structures that could cause memory leaks
120
synchronized (RubyLexer.class) {
121                 cached = this;
122             }
123         }
124     }
125
126     public Object JavaDoc state() {
127         if (JRubyLexerRestartInfo.needsStateStorage(this)) {
128             return new JRubyLexerRestartInfo(this);
129         }
130
131         // We only need to store the state of the lexer when it's in a significant state,
132
// e.g. outside of normal expressions and with no string processing in progress
133
LexState state = lexer.getLexState();
134
135         if (state == null) {
136             return null;
137         }
138
139         if (lexer.getStrTerm() != null) {
140             return new JRubyLexerRestartInfo(this);
141         }
142
143         // The lexer can store integer states very efficiently
144
// (besides, Integer.valueOf will cache all these values since they are < 128)
145
return Integer.valueOf(state.getOrdinal());
146     }
147
148     private Token<GsfTokenId> token(GsfTokenId id, int length) {
149         String JavaDoc fixedText = id.fixedText();
150
151         return (fixedText != null) ? tokenFactory.getFlyweightToken(id, fixedText)
152                                    : tokenFactory.createToken(id, length);
153     }
154
155     public Token<GsfTokenId> nextToken() {
156         int token = 0;
157         int tokenLength = 0;
158         int oldOffset = lexerSource.getOffset();
159
160         while (tokenLength == 0) {
161             try {
162                 lexer.advance();
163                 token = lexer.token();
164
165                 StrTerm strTerm = lexer.getStrTerm();
166
167                 if (strTerm != null) {
168                     strTerm.splitEmbeddedTokens();
169                 }
170             } catch (SyntaxException ex) { // includes SyntaxException
171
token = Tokens.yyErrorCode; // TODO - generate incomplete tokens?
172
tokenLength = lexerSource.getOffset() - oldOffset;
173
174                 if (tokenLength == 0) {
175                     if (input.readLength() > 0) {
176                         return token(RubyTokenId.IDENTIFIER, input.readLength()); // XXX?
177
} else {
178                         return null;
179                     }
180                 }
181
182                 break;
183             } catch (Throwable JavaDoc ex) { // includes SyntaxException
184
ErrorManager.getDefault().notify(ex);
185
186                 break;
187             }
188
189             if (token == 0) { // EOF
190

191                 if (input.readLength() > 0) {
192                     return token(RubyTokenId.IDENTIFIER, input.readLength()); // XXX?
193
} else {
194                     return null;
195                 }
196             }
197
198             int offset = lexerSource.getOffset();
199             tokenLength = offset - oldOffset;
200         }
201
202         // Update lexer input to make sure it records the right
203
// character boundaries for the tokens (since incremental lexing
204
// will restart at token boundaries, and we want to make sure
205
// it knows in the character stream where those boundaries truly are
206
int readAhead = lexerSource.chompReadAhead();
207
208         if (readAhead > 0) {
209             input.backup(readAhead);
210         }
211
212         // Map to IDE types
213
GsfTokenId id = getTokenId(token, oldOffset);
214
215         if (inSymbol) {
216             // A type symbol in front of a keyword, literal or constant
217
// should be lexed as a symbol
218
if ("keyword".equals(id.primaryCategory())) { // NOI18N
219
id = RubyTokenId.TYPE_SYMBOL;
220             }
221         }
222
223         inSymbol = (token == Tokens.tSYMBEG);
224
225         return token(id, tokenLength);
226     }
227
228     /** @todo Move classification of tokens into TokenTypes into JRuby somehow */
229     private GsfTokenId getTokenId(int token, int offset) {
230         // If you add any new token types here, remember to update #getRelevantTokenTypes below
231
switch (token) {
232         case Tokens.tCOMMENT:
233             return RubyTokenId.LINE_COMMENT;
234
235         case Tokens.tWHITESPACE:
236             return RubyTokenId.WHITESPACE;
237
238         case Tokens.tFLOAT:
239             return RubyTokenId.FLOAT_LITERAL;
240
241         case Tokens.tINTEGER:
242             return RubyTokenId.INT_LITERAL;
243
244         case Tokens.tSTRING_BEG:
245         case Tokens.tXSTRING_BEG:
246
247             if (lexer.getStrTerm() != null) {
248                 substituting = lexer.getStrTerm().isSubstituting();
249             } else {
250                 substituting = false;
251             }
252
253             return substituting ? RubyTokenId.QUOTED_STRING_BEGIN : RubyTokenId.STRING_BEGIN;
254
255         case Tokens.tSTRING_DVAR:
256         case Tokens.tSTRING_DBEG:
257             inEmbedded = true;
258
259             return inRegexp ? RubyTokenId.REGEXP_LITERAL : RubyTokenId.STRING_LITERAL;
260
261         case Tokens.tSTRING_END:
262             return substituting ? RubyTokenId.QUOTED_STRING_END : RubyTokenId.STRING_END;
263
264         case Tokens.tSTRING_CONTENT: // What about tXSTRING??
265

266             if (inEmbedded) {
267                 inEmbedded = false;
268
269                 return RubyTokenId.EMBEDDED_RUBY;
270             } else if (inRegexp) {
271                 return RubyTokenId.REGEXP_LITERAL;
272             } else if (substituting) {
273                 return RubyTokenId.QUOTED_STRING_LITERAL;
274             } else {
275                 return RubyTokenId.STRING_LITERAL;
276             }
277
278         case Tokens.tREGEXP_BEG:
279             inRegexp = true;
280
281             return RubyTokenId.REGEXP_BEGIN;
282
283         case Tokens.tREGEXP_END:
284             inRegexp = false;
285
286             return RubyTokenId.REGEXP_END;
287
288         case Tokens.tDOCUMENTATION:
289             return RubyTokenId.DOCUMENTATION;
290
291         case Tokens.yyErrorCode:
292             return RubyTokenId.ERROR;
293
294         case Tokens.tGVAR: // Global variable
295
return RubyTokenId.GLOBAL_VAR;
296
297         case Tokens.tIVAR: // Instance variable
298
return RubyTokenId.INSTANCE_VAR;
299
300         case Tokens.tCVAR: // Class variable
301
return RubyTokenId.CLASS_VAR;
302
303         case Tokens.tCONSTANT: // Constant
304
return inSymbol ? RubyTokenId.TYPE_SYMBOL : RubyTokenId.CONSTANT;
305
306         case Tokens.tIDENTIFIER:
307             return inSymbol ? RubyTokenId.TYPE_SYMBOL : RubyTokenId.IDENTIFIER;
308
309         case Tokens.tSYMBEG:
310             return RubyTokenId.TYPE_SYMBOL;
311
312         case Tokens.tLBRACK:
313             return RubyTokenId.LBRACKET;
314
315         case Tokens.tRBRACK:
316             return RubyTokenId.RBRACKET;
317
318         case Tokens.tLPAREN:
319         case Tokens.tLPAREN2: // XXX What is this?
320
case Tokens.tLPAREN_ARG: // XXX What is this?
321
return RubyTokenId.LPAREN;
322
323         case Tokens.tRPAREN:
324             return RubyTokenId.RPAREN;
325
326         case Tokens.tLCURLY: // block (primary)
327
case Tokens.tLBRACE: // hash
328
case Tokens.tLBRACE_ARG: // block (expr)
329
return RubyTokenId.LBRACE;
330
331         case Tokens.tRCURLY:
332             return RubyTokenId.RBRACE;
333
334         case Tokens.kDEF:
335             return RubyTokenId.DEF;
336
337         case Tokens.kEND:
338             return RubyTokenId.END;
339
340         case Tokens.kCLASS:
341             return RubyTokenId.CLASS;
342
343         case Tokens.kMODULE:
344             return RubyTokenId.MODULE;
345
346         case Tokens.kBEGIN:
347             return RubyTokenId.BEGIN;
348
349         case Tokens.kIF:
350             return RubyTokenId.IF;
351
352         case Tokens.kUNLESS:
353             return RubyTokenId.UNLESS;
354
355         case Tokens.kWHILE:
356             return RubyTokenId.WHILE;
357
358         case Tokens.kUNTIL:
359             return RubyTokenId.UNTIL;
360
361         case Tokens.kDO:
362             return RubyTokenId.DO;
363
364         case Tokens.kCASE:
365             return RubyTokenId.CASE;
366
367         case Tokens.kFOR:
368             return RubyTokenId.FOR;
369
370         case Tokens.kELSE:
371             return RubyTokenId.ELSE;
372
373         case Tokens.kELSIF:
374             return RubyTokenId.ELSIF;
375
376         case Tokens.kENSURE:
377             return RubyTokenId.ENSURE;
378
379         case Tokens.kWHEN:
380             return RubyTokenId.WHEN;
381
382         case Tokens.kRESCUE:
383             return RubyTokenId.RESCUE;
384
385         case Tokens.kSUPER:
386             return RubyTokenId.SUPER;
387
388         case Tokens.kSELF:
389             return RubyTokenId.SELF;
390
391         case Tokens.kRESCUE_MOD:
392         case Tokens.kDO_COND:
393         case Tokens.kDO_BLOCK:
394         case Tokens.kUNDEF:
395         case Tokens.kTHEN:
396         case Tokens.kBREAK:
397         case Tokens.kNEXT:
398         case Tokens.kREDO:
399         case Tokens.kRETRY:
400         case Tokens.kIN:
401         case Tokens.kRETURN:
402         case Tokens.kYIELD:
403         case Tokens.kNIL:
404         case Tokens.kTRUE:
405         case Tokens.kFALSE:
406         case Tokens.kAND:
407         case Tokens.kOR:
408         case Tokens.kNOT:
409         case Tokens.kIF_MOD:
410         case Tokens.kUNLESS_MOD:
411         case Tokens.kWHILE_MOD:
412         case Tokens.kUNTIL_MOD:
413         case Tokens.kALIAS:
414         case Tokens.kDEFINED:
415         case Tokens.klBEGIN: // "BEGIN { }": not matched with END { }
416
case Tokens.klEND: // "END { }": not matched with BEGIN { }
417
case Tokens.k__LINE__:
418         case Tokens.k__FILE__:
419             return RubyTokenId.ANY_KEYWORD;
420
421         case Tokens.tDOT:
422             return RubyTokenId.DOT;
423
424         case Tokens.tDOT2:
425         case Tokens.tDOT3:
426             return RubyTokenId.RANGE;
427
428         case Tokens.tCOLON3:
429             return RubyTokenId.COLON3;
430
431         default:
432             return RubyTokenId.IDENTIFIER;
433         }
434     }
435
436     private static class JRubyLexerRestartInfo {
437         /** Bit set when we're in regular expressions */
438         private static final int IN_REGEXP = 1;
439
440         /** Bit set when we're in symbols */
441         private static final int IN_SYMBOL = 2;
442
443         /** Bit set when we're in an embedded ruby context... #{here} */
444         private static final int IN_EMBEDDED = 4;
445
446         /** Bit set when we're in a substituting/doublequoted string */
447         private static final int IN_SUBSTITUTING = 8;
448
449         /** Bit set when we need to set the spaceSeen flag in RubyYaccLexer */
450         private static final int SET_SPACE_SEEN = 16;
451
452         /** Bit set when we need to set commandStart in RubyYaccLexer */
453         private static final int SET_COMMAND_START = 32;
454         private StrTerm strTerm;
455         private int localState;
456         private LexState lexState;
457         private Object JavaDoc strTermState;
458
459         JRubyLexerRestartInfo(RubyLexer rubyLexer) {
460             strTerm = rubyLexer.lexer.getStrTerm();
461
462             if (strTerm != null) {
463                 strTermState = strTerm.getMutableState();
464             }
465
466             lexState = rubyLexer.lexer.getLexState();
467
468             if (rubyLexer.inRegexp) {
469                 localState += IN_REGEXP;
470             }
471
472             if (rubyLexer.inSymbol) {
473                 localState += IN_SYMBOL;
474             }
475
476             if (rubyLexer.inEmbedded) {
477                 localState += IN_EMBEDDED;
478             }
479
480             if (rubyLexer.substituting) {
481                 localState += IN_SUBSTITUTING;
482             }
483
484             if (rubyLexer.lexer.isSetSpaceSeen()) {
485                 localState += SET_SPACE_SEEN;
486             }
487
488             if (rubyLexer.lexer.isCommandStart()) {
489                 localState += SET_COMMAND_START;
490             }
491         }
492
493         /** Return true iff the given lexer needs custom state storage beyond the state integers */
494         public static boolean needsStateStorage(RubyLexer rubyLexer) {
495             return rubyLexer.inRegexp || rubyLexer.inSymbol || rubyLexer.inEmbedded ||
496             rubyLexer.substituting || rubyLexer.lexer.isCommandStart() ||
497             rubyLexer.lexer.isSetSpaceSeen();
498         }
499
500         public boolean equals(Object JavaDoc obj) {
501             if (obj == null) {
502                 return false;
503             }
504
505             if (getClass() != obj.getClass()) {
506                 return false;
507             }
508
509             final JRubyLexerRestartInfo other = (JRubyLexerRestartInfo)obj;
510
511             if ((this.strTerm != other.strTerm) &&
512                     ((this.strTerm == null) || !this.strTerm.equals(other.strTerm))) {
513                 return false;
514             }
515
516             if (this.localState != other.localState) {
517                 return false;
518             }
519
520             if ((this.lexState != other.lexState) &&
521                     ((this.lexState == null) ||
522                     !(this.lexState.getOrdinal() == other.lexState.getOrdinal()))) {
523                 return false;
524             }
525
526             if ((this.strTermState != other.strTermState) &&
527                     ((this.strTermState == null) || !this.strTermState.equals(other.strTermState))) {
528                 return false;
529             }
530
531             return true;
532         }
533
534         public int hashCode() {
535             int hash = 7;
536
537             hash = (43 * hash) + this.localState;
538             hash = (43 * hash) + ((this.strTerm != null) ? this.strTerm.hashCode() : 0);
539             hash = (43 * hash) + ((this.strTermState != null) ? this.strTermState.hashCode() : 0);
540
541             return hash;
542         }
543
544         public String JavaDoc toString() {
545             return "RubyLexerState[" + localState + "," + strTerm + "," + lexState + "," +
546             strTermState + "]";
547         }
548
549         void initializeState(RubyLexer rubyLexer) {
550             rubyLexer.lexer.setStrTerm(strTerm);
551
552             if ((strTermState != null) && (strTerm != null)) {
553                 strTerm.setMutableState(strTermState);
554             }
555
556             if ((localState & IN_REGEXP) != 0) {
557                 rubyLexer.inRegexp = true;
558             }
559
560             if ((localState & IN_SYMBOL) != 0) {
561                 rubyLexer.inSymbol = true;
562             }
563
564             if ((localState & IN_EMBEDDED) != 0) {
565                 rubyLexer.inEmbedded = true;
566             }
567
568             if ((localState & IN_SUBSTITUTING) != 0) {
569                 rubyLexer.substituting = true;
570             }
571
572             if ((localState & SET_COMMAND_START) != 0) {
573                 rubyLexer.lexer.setCommandStart(true);
574             }
575
576             if ((localState & SET_SPACE_SEEN) != 0) {
577                 rubyLexer.lexer.setSpaceSeen(true);
578             }
579
580             rubyLexer.lexer.setLexState(lexState);
581         }
582     }
583
584     private class LexerInputReader extends Reader JavaDoc {
585         private LexerInput input;
586
587         LexerInputReader(LexerInput input) {
588             this.input = input;
589         }
590
591         public int read(char[] buf, int off, int len) throws IOException JavaDoc {
592             for (int i = 0; i < len; i++) {
593                 int c = input.read();
594
595                 if (c == LexerInput.EOF) {
596                     return -1;
597                 }
598
599                 buf[i + off] = (char)c;
600             }
601
602             return len;
603         }
604
605         public void close() throws IOException JavaDoc {
606         }
607     }
608 }
609
Popular Tags