RubyLexer


1   /*
2    * The contents of this file are subject to the terms of the Common Development
3    * and Distribution License (the License). You may not use this file except in
4    * compliance with the License.
5    *
6    * You can obtain a copy of the License at http://www.netbeans.org/cddl.html
7    * or http://www.netbeans.org/cddl.txt.
8    *
9    * When distributing Covered Code, include this CDDL Header Notice in each file
10   * and include the License file at http://www.netbeans.org/cddl.txt.
11   * If applicable, add the following below the CDDL Header, with the fields
12   * enclosed by brackets [] replaced by your own identifying information:
13   * "Portions Copyrighted [year] [name of copyright owner]"
14   *
15   * The Original Software is NetBeans. The Initial Developer of the Original
16   * Software is Sun Microsystems, Inc. Portions Copyright 1997-2006 Sun
17   * Microsystems, Inc. All Rights Reserved.
18   */
19  package org.netbeans.modules.ruby.lexer;
20  
21  import java.io.IOException  ;
22  import java.io.Reader  ;
23  
24  import org.jruby.common.NullWarnings;
25  import org.jruby.lexer.yacc.LexState;
26  import org.jruby.lexer.yacc.LexerSource;
27  import org.jruby.lexer.yacc.LexerSource;
28  import org.jruby.lexer.yacc.RubyYaccLexer;
29  import org.jruby.lexer.yacc.RubyYaccLexer;
30  import org.jruby.lexer.yacc.StrTerm;
31  import org.jruby.lexer.yacc.SyntaxException;
32  import org.jruby.parser.Tokens;
33  import org.netbeans.api.gsf.GsfTokenId;
34  import org.netbeans.api.lexer.Token;
35  import org.netbeans.modules.ruby.lexer.RubyTokenId;
36  import org.netbeans.spi.lexer.Lexer;
37  import org.netbeans.spi.lexer.LexerInput;
38  import org.netbeans.spi.lexer.LexerRestartInfo;
39  import org.netbeans.spi.lexer.TokenFactory;
40  import org.openide.ErrorManager;
41  
42  
43  /**
44   * A scanner for Ruby, which directly uses the JRuby lexer and translates
45   * from JRuby tokens to NetBeans lexer-based tokens
46   *
47   * @todo Should I generate a single large token for composite token types
48   *  like strings and regular expressions? Today, I go to a lot of trouble
49   *  to do state saving between the opening quote, middle literal string,
50   *  and ending quote, for strings. (Ditto for regular expressions, single
51   *  quoted strings, other forms of quoted strings, etc.).
52   *  This allows me to have for example the / /'s in regular expressions
53   *  stay black, and have only the embedded portion be green. But is that
54   *  really necessary? If I instead were to eat up a whole String combination
55   *  into a single token and return it as a single token, that would ensure
56   *  that for incremental parsing, I always get called on a token boundary
57   *  where I don't need the extra state saving. All I would need would
58   *  be the lexer state (if different than the default state) which is a small
59   *  integer (which gets compressed to single bytes by the lexer infrastructure.)
60   *
61   * @author Tor Norbye
62   */
63  public final class RubyLexer implements Lexer<GsfTokenId> {
64      /** This is still not working; I wonder if release() is called correctly at all times...*/
65      private static final boolean REUSE_LEXERS = false;
66      private static RubyLexer cached;
67      private RubyYaccLexer lexer;
68      private LexerSource lexerSource;
69      private boolean inRegexp;
70      private LexerInput input;
71      private TokenFactory<GsfTokenId> tokenFactory;
72      private boolean substituting;
73      private boolean inSymbol;
74      private boolean inEmbedded;
75  
76      private RubyLexer(LexerRestartInfo<GsfTokenId> info) {
77          lexer = new RubyYaccLexer();
78          // XXX Do something at scan time about illegal characters?
79          lexer.setWarnings(new NullWarnings());
80          lexer.setPreserveSpaces(true);
81      }
82  
83      public static synchronized RubyLexer create(LexerRestartInfo<GsfTokenId> info) {
84          RubyLexer rubyLexer = cached;
85  
86          if (rubyLexer == null) {
87              rubyLexer = new RubyLexer(info);
88          }
89  
90          rubyLexer.restart(info);
91  
92          return rubyLexer;
93      }
94  
95      void restart(LexerRestartInfo<GsfTokenId> info) {
96          inRegexp = substituting = inSymbol = inEmbedded = false;
97          lexer.reset();
98  
99          input = info.input();
100         tokenFactory = info.tokenFactory();
101 
102         String   fileName = "unknown";
103         Reader   lexerReader = new LexerInputReader(input);
104         lexerSource = new LexerSource(fileName, lexerReader);
105         lexer.setSource(lexerSource);
106 
107         Object   state = info.state();
108 
109         if (state instanceof JRubyLexerRestartInfo) {
110             ((JRubyLexerRestartInfo)state).initializeState(this);
111         } else if (state instanceof Integer  ) {
112             int stateValue = ((Integer  )state).intValue();
113             lexer.setState(LexState.fromOrdinal(stateValue));
114         }
115     }
116 
117     public void release() {
118         if (REUSE_LEXERS) {
119             // Possibly reset the structures that could cause memory leaks
120             synchronized (RubyLexer.class) {
121                 cached = this;
122             }
123         }
124     }
125 
126     public Object   state() {
127         if (JRubyLexerRestartInfo.needsStateStorage(this)) {
128             return new JRubyLexerRestartInfo(this);
129         }
130 
131         // We only need to store the state of the lexer when it's in a significant state,
132         // e.g. outside of normal expressions and with no string processing in progress
133         LexState state = lexer.getLexState();
134 
135         if (state == null) {
136             return null;
137         }
138 
139         if (lexer.getStrTerm() != null) {
140             return new JRubyLexerRestartInfo(this);
141         }
142 
143         // The lexer can store integer states very efficiently
144         // (besides, Integer.valueOf will cache all these values since they are < 128)
145         return Integer.valueOf(state.getOrdinal());
146     }
147 
148     private Token<GsfTokenId> token(GsfTokenId id, int length) {
149         String   fixedText = id.fixedText();
150 
151         return (fixedText != null) ? tokenFactory.getFlyweightToken(id, fixedText)
152                                    : tokenFactory.createToken(id, length);
153     }
154 
155     public Token<GsfTokenId> nextToken() {
156         int token = 0;
157         int tokenLength = 0;
158         int oldOffset = lexerSource.getOffset();
159 
160         while (tokenLength == 0) {
161             try {
162                 lexer.advance();
163                 token = lexer.token();
164 
165                 StrTerm strTerm = lexer.getStrTerm();
166 
167                 if (strTerm != null) {
168                     strTerm.splitEmbeddedTokens();
169                 }
170             } catch (SyntaxException ex) { // includes SyntaxException
171                 token = Tokens.yyErrorCode; // TODO - generate incomplete tokens?
172                 tokenLength = lexerSource.getOffset() - oldOffset;
173 
174                 if (tokenLength == 0) {
175                     if (input.readLength() > 0) {
176                         return token(RubyTokenId.IDENTIFIER, input.readLength()); // XXX?
177                     } else {
178                         return null;
179                     }
180                 }
181 
182                 break;
183             } catch (Throwable   ex) { // includes SyntaxException
184                 ErrorManager.getDefault().notify(ex);
185 
186                 break;
187             }
188 
189             if (token == 0) { // EOF
190 
191                 if (input.readLength() > 0) {
192                     return token(RubyTokenId.IDENTIFIER, input.readLength()); // XXX?
193                 } else {
194                     return null;
195                 }
196             }
197 
198             int offset = lexerSource.getOffset();
199             tokenLength = offset - oldOffset;
200         }
201 
202         // Update lexer input to make sure it records the right
203         // character boundaries for the tokens (since incremental lexing
204         // will restart at token boundaries, and we want to make sure
205         // it knows in the character stream where those boundaries truly are
206         int readAhead = lexerSource.chompReadAhead();
207 
208         if (readAhead > 0) {
209             input.backup(readAhead);
210         }
211 
212         // Map to IDE types
213         GsfTokenId id = getTokenId(token, oldOffset);
214 
215         if (inSymbol) {
216             // A type symbol in front of a keyword, literal or constant
217             // should be lexed as a symbol
218             if ("keyword".equals(id.primaryCategory())) { // NOI18N
219                 id = RubyTokenId.TYPE_SYMBOL;
220             }
221         }
222 
223         inSymbol = (token == Tokens.tSYMBEG);
224 
225         return token(id, tokenLength);
226     }
227 
228     /** @todo Move classification of tokens into TokenTypes into JRuby somehow */
229     private GsfTokenId getTokenId(int token, int offset) {
230         // If you add any new token types here, remember to update #getRelevantTokenTypes below
231         switch (token) {
232         case Tokens.tCOMMENT:
233             return RubyTokenId.LINE_COMMENT;
234 
235         case Tokens.tWHITESPACE:
236             return RubyTokenId.WHITESPACE;
237 
238         case Tokens.tFLOAT:
239             return RubyTokenId.FLOAT_LITERAL;
240 
241         case Tokens.tINTEGER:
242             return RubyTokenId.INT_LITERAL;
243 
244         case Tokens.tSTRING_BEG:
245         case Tokens.tXSTRING_BEG:
246 
247             if (lexer.getStrTerm() != null) {
248                 substituting = lexer.getStrTerm().isSubstituting();
249             } else {
250                 substituting = false;
251             }
252 
253             return substituting ? RubyTokenId.QUOTED_STRING_BEGIN : RubyTokenId.STRING_BEGIN;
254 
255         case Tokens.tSTRING_DVAR:
256         case Tokens.tSTRING_DBEG:
257             inEmbedded = true;
258 
259             return inRegexp ? RubyTokenId.REGEXP_LITERAL : RubyTokenId.STRING_LITERAL;
260 
261         case Tokens.tSTRING_END:
262             return substituting ? RubyTokenId.QUOTED_STRING_END : RubyTokenId.STRING_END;
263 
264         case Tokens.tSTRING_CONTENT: // What about tXSTRING??
265 
266             if (inEmbedded) {
267                 inEmbedded = false;
268 
269                 return RubyTokenId.EMBEDDED_RUBY;
270             } else if (inRegexp) {
271                 return RubyTokenId.REGEXP_LITERAL;
272             } else if (substituting) {
273                 return RubyTokenId.QUOTED_STRING_LITERAL;
274             } else {
275                 return RubyTokenId.STRING_LITERAL;
276             }
277 
278         case Tokens.tREGEXP_BEG:
279             inRegexp = true;
280 
281             return RubyTokenId.REGEXP_BEGIN;
282 
283         case Tokens.tREGEXP_END:
284             inRegexp = false;
285 
286             return RubyTokenId.REGEXP_END;
287 
288         case Tokens.tDOCUMENTATION:
289             return RubyTokenId.DOCUMENTATION;
290 
291         case Tokens.yyErrorCode:
292             return RubyTokenId.ERROR;
293 
294         case Tokens.tGVAR: // Global variable
295             return RubyTokenId.GLOBAL_VAR;
296 
297         case Tokens.tIVAR: // Instance variable
298             return RubyTokenId.INSTANCE_VAR;
299 
300         case Tokens.tCVAR: // Class variable
301             return RubyTokenId.CLASS_VAR;
302 
303         case Tokens.tCONSTANT: // Constant
304             return inSymbol ? RubyTokenId.TYPE_SYMBOL : RubyTokenId.CONSTANT;
305 
306         case Tokens.tIDENTIFIER:
307             return inSymbol ? RubyTokenId.TYPE_SYMBOL : RubyTokenId.IDENTIFIER;
308 
309         case Tokens.tSYMBEG:
310             return RubyTokenId.TYPE_SYMBOL;
311 
312         case Tokens.tLBRACK:
313             return RubyTokenId.LBRACKET;
314 
315         case Tokens.tRBRACK:
316             return RubyTokenId.RBRACKET;
317 
318         case Tokens.tLPAREN:
319         case Tokens.tLPAREN2: // XXX What is this?
320         case Tokens.tLPAREN_ARG: // XXX What is this?
321             return RubyTokenId.LPAREN;
322 
323         case Tokens.tRPAREN:
324             return RubyTokenId.RPAREN;
325 
326         case Tokens.tLCURLY: // block (primary)
327         case Tokens.tLBRACE: // hash
328         case Tokens.tLBRACE_ARG: // block (expr)
329             return RubyTokenId.LBRACE;
330 
331         case Tokens.tRCURLY:
332             return RubyTokenId.RBRACE;
333 
334         case Tokens.kDEF:
335             return RubyTokenId.DEF;
336 
337         case Tokens.kEND:
338             return RubyTokenId.END;
339 
340         case Tokens.kCLASS:
341             return RubyTokenId.CLASS;
342 
343         case Tokens.kMODULE:
344             return RubyTokenId.MODULE;
345 
346         case Tokens.kBEGIN:
347             return RubyTokenId.BEGIN;
348 
349         case Tokens.kIF:
350             return RubyTokenId.IF;
351 
352         case Tokens.kUNLESS:
353             return RubyTokenId.UNLESS;
354 
355         case Tokens.kWHILE:
356             return RubyTokenId.WHILE;
357 
358         case Tokens.kUNTIL:
359             return RubyTokenId.UNTIL;
360 
361         case Tokens.kDO:
362             return RubyTokenId.DO;
363 
364         case Tokens.kCASE:
365             return RubyTokenId.CASE;
366 
367         case Tokens.kFOR:
368             return RubyTokenId.FOR;
369 
370         case Tokens.kELSE:
371             return RubyTokenId.ELSE;
372 
373         case Tokens.kELSIF:
374             return RubyTokenId.ELSIF;
375 
376         case Tokens.kENSURE:
377             return RubyTokenId.ENSURE;
378 
379         case Tokens.kWHEN:
380             return RubyTokenId.WHEN;
381 
382         case Tokens.kRESCUE:
383             return RubyTokenId.RESCUE;
384 
385         case Tokens.kSUPER:
386             return RubyTokenId.SUPER;
387 
388         case Tokens.kSELF:
389             return RubyTokenId.SELF;
390 
391         case Tokens.kRESCUE_MOD:
392         case Tokens.kDO_COND:
393         case Tokens.kDO_BLOCK:
394         case Tokens.kUNDEF:
395         case Tokens.kTHEN:
396         case Tokens.kBREAK:
397         case Tokens.kNEXT:
398         case Tokens.kREDO:
399         case Tokens.kRETRY:
400         case Tokens.kIN:
401         case Tokens.kRETURN:
402         case Tokens.kYIELD:
403         case Tokens.kNIL:
404         case Tokens.kTRUE:
405         case Tokens.kFALSE:
406         case Tokens.kAND:
407         case Tokens.kOR:
408         case Tokens.kNOT:
409         case Tokens.kIF_MOD:
410         case Tokens.kUNLESS_MOD:
411         case Tokens.kWHILE_MOD:
412         case Tokens.kUNTIL_MOD:
413         case Tokens.kALIAS:
414         case Tokens.kDEFINED:
415         case Tokens.klBEGIN: // "BEGIN { }": not matched with END { }
416         case Tokens.klEND: // "END { }": not matched with BEGIN { }
417         case Tokens.k__LINE__:
418         case Tokens.k__FILE__:
419             return RubyTokenId.ANY_KEYWORD;
420 
421         case Tokens.tDOT:
422             return RubyTokenId.DOT;
423 
424         case Tokens.tDOT2:
425         case Tokens.tDOT3:
426             return RubyTokenId.RANGE;
427 
428         case Tokens.tCOLON3:
429             return RubyTokenId.COLON3;
430 
431         default:
432             return RubyTokenId.IDENTIFIER;
433         }
434     }
435 
436     private static class JRubyLexerRestartInfo {
437         /** Bit set when we're in regular expressions */
438         private static final int IN_REGEXP = 1;
439 
440         /** Bit set when we're in symbols */
441         private static final int IN_SYMBOL = 2;
442 
443         /** Bit set when we're in an embedded ruby context... #{here} */
444         private static final int IN_EMBEDDED = 4;
445 
446         /** Bit set when we're in a substituting/doublequoted string */
447         private static final int IN_SUBSTITUTING = 8;
448 
449         /** Bit set when we need to set the spaceSeen flag in RubyYaccLexer */
450         private static final int SET_SPACE_SEEN = 16;
451 
452         /** Bit set when we need to set commandStart in RubyYaccLexer */
453         private static final int SET_COMMAND_START = 32;
454         private StrTerm strTerm;
455         private int localState;
456         private LexState lexState;
457         private Object   strTermState;
458 
459         JRubyLexerRestartInfo(RubyLexer rubyLexer) {
460             strTerm = rubyLexer.lexer.getStrTerm();
461 
462             if (strTerm != null) {
463                 strTermState = strTerm.getMutableState();
464             }
465 
466             lexState = rubyLexer.lexer.getLexState();
467 
468             if (rubyLexer.inRegexp) {
469                 localState += IN_REGEXP;
470             }
471 
472             if (rubyLexer.inSymbol) {
473                 localState += IN_SYMBOL;
474             }
475 
476             if (rubyLexer.inEmbedded) {
477                 localState += IN_EMBEDDED;
478             }
479 
480             if (rubyLexer.substituting) {
481                 localState += IN_SUBSTITUTING;
482             }
483 
484             if (rubyLexer.lexer.isSetSpaceSeen()) {
485                 localState += SET_SPACE_SEEN;
486             }
487 
488             if (rubyLexer.lexer.isCommandStart()) {
489                 localState += SET_COMMAND_START;
490             }
491         }
492 
493         /** Return true iff the given lexer needs custom state storage beyond the state integers */
494         public static boolean needsStateStorage(RubyLexer rubyLexer) {
495             return rubyLexer.inRegexp || rubyLexer.inSymbol || rubyLexer.inEmbedded ||
496             rubyLexer.substituting || rubyLexer.lexer.isCommandStart() ||
497             rubyLexer.lexer.isSetSpaceSeen();
498         }
499 
500         public boolean equals(Object   obj) {
501             if (obj == null) {
502                 return false;
503             }
504 
505             if (getClass() != obj.getClass()) {
506                 return false;
507             }
508 
509             final JRubyLexerRestartInfo other = (JRubyLexerRestartInfo)obj;
510 
511             if ((this.strTerm != other.strTerm) &&
512                     ((this.strTerm == null) || !this.strTerm.equals(other.strTerm))) {
513                 return false;
514             }
515 
516             if (this.localState != other.localState) {
517                 return false;
518             }
519 
520             if ((this.lexState != other.lexState) &&
521                     ((this.lexState == null) ||
522                     !(this.lexState.getOrdinal() == other.lexState.getOrdinal()))) {
523                 return false;
524             }
525 
526             if ((this.strTermState != other.strTermState) &&
527                     ((this.strTermState == null) || !this.strTermState.equals(other.strTermState))) {
528                 return false;
529             }
530 
531             return true;
532         }
533 
534         public int hashCode() {
535             int hash = 7;
536 
537             hash = (43 * hash) + this.localState;
538             hash = (43 * hash) + ((this.strTerm != null) ? this.strTerm.hashCode() : 0);
539             hash = (43 * hash) + ((this.strTermState != null) ? this.strTermState.hashCode() : 0);
540 
541             return hash;
542         }
543 
544         public String   toString() {
545             return "RubyLexerState[" + localState + "," + strTerm + "," + lexState + "," +
546             strTermState + "]";
547         }
548 
549         void initializeState(RubyLexer rubyLexer) {
550             rubyLexer.lexer.setStrTerm(strTerm);
551 
552             if ((strTermState != null) && (strTerm != null)) {
553                 strTerm.setMutableState(strTermState);
554             }
555 
556             if ((localState & IN_REGEXP) != 0) {
557                 rubyLexer.inRegexp = true;
558             }
559 
560             if ((localState & IN_SYMBOL) != 0) {
561                 rubyLexer.inSymbol = true;
562             }
563 
564             if ((localState & IN_EMBEDDED) != 0) {
565                 rubyLexer.inEmbedded = true;
566             }
567 
568             if ((localState & IN_SUBSTITUTING) != 0) {
569                 rubyLexer.substituting = true;
570             }
571 
572             if ((localState & SET_COMMAND_START) != 0) {
573                 rubyLexer.lexer.setCommandStart(true);
574             }
575 
576             if ((localState & SET_SPACE_SEEN) != 0) {
577                 rubyLexer.lexer.setSpaceSeen(true);
578             }
579 
580             rubyLexer.lexer.setLexState(lexState);
581         }
582     }
583 
584     private class LexerInputReader extends Reader   {
585         private LexerInput input;
586 
587         LexerInputReader(LexerInput input) {
588             this.input = input;
589         }
590 
591         public int read(char[] buf, int off, int len) throws IOException   {
592             for (int i = 0; i < len; i++) {
593                 int c = input.read();
594 
595                 if (c == LexerInput.EOF) {
596                     return -1;
597                 }
598 
599                 buf[i + off] = (char)c;
600             }
601 
602             return len;
603         }
604 
605         public void close() throws IOException   {
606         }
607     }
608 }
609
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags