ATESyntaxLexer


1   /*
2   
3   [The "BSD licence"]
4   Copyright (c) 2005 Jean Bovet
5   All rights reserved.
6   
7   Redistribution and use in source and binary forms, with or without
8   modification, are permitted provided that the following conditions
9   are met:
10  
11  1. Redistributions of source code must retain the above copyright
12  notice, this list of conditions and the following disclaimer.
13  2. Redistributions in binary form must reproduce the above copyright
14  notice, this list of conditions and the following disclaimer in the
15  documentation and/or other materials provided with the distribution.
16  3. The name of the author may not be used to endorse or promote products
17  derived from this software without specific prior written permission.
18  
19  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  
30  */
31  
32  package org.antlr.works.ate.syntax.generic;
33  
34  import org.antlr.works.ate.syntax.misc.ATELine;
35  import org.antlr.works.ate.syntax.misc.ATEToken;
36  
37  import java.util.ArrayList  ;
38  import java.util.List  ;
39  
40  public class ATESyntaxLexer {
41  
42      public static final int TOKEN_SINGLE_QUOTE_STRING = 1;
43      public static final int TOKEN_DOUBLE_QUOTE_STRING = 2;
44      public static final int TOKEN_SINGLE_COMMENT = 3;
45      public static final int TOKEN_COMPLEX_COMMENT = 4;
46      public static final int TOKEN_ID = 5;
47      public static final int TOKEN_CHAR = 6;
48      public static final int TOKEN_LPAREN = 7;
49      public static final int TOKEN_RPAREN = 8;
50      public static final int TOKEN_LCURLY = 9;
51      public static final int TOKEN_RCURLY = 10;
52      public static final int TOKEN_LBRACK = 11;
53      public static final int TOKEN_RBRACK = 12;
54      public static final int TOKEN_COLON = 13;
55      public static final int TOKEN_SEMI = 14;
56      public static final int TOKEN_OTHER = 15;
57  
58      protected List  <ATEToken> tokens;
59      protected String   text;
60      protected int position;
61  
62      protected int lineNumber;
63      protected int lineIndex;    // position of the line in characters
64      protected List  <ATELine> lines;
65  
66      /** True if the current character is a control character (that is preceeded by a \) */
67      protected boolean controlCharacter;
68  
69      /** c0 and c1 are character cache for quick access to the current
70       * character (c0) and the next character (c1)
71       */
72      protected char c0;
73      protected char c1;
74  
75      public ATESyntaxLexer() {
76          lines = new ArrayList  <ATELine>();
77          tokens = new ArrayList  <ATEToken>();
78      }
79  
80      public List  <ATEToken> getTokens() {
81          return tokens;
82      }
83  
84      public List  <ATELine> getLines() {
85          return lines;
86      }
87  
88      public int getLineNumber() {
89          return lineNumber;
90      }
91  
92      public void tokenize(String   text) {
93          this.text = text;
94  
95          position = -1;
96          lineNumber = 0;
97          lines.clear();
98          lines.add(new ATELine(0));
99  
100         tokens.clear();
101         tokenize();
102     }
103 
104     protected void tokenize() {
105         while(nextCharacter()) {
106             ATEToken token = customMatch();
107 
108             if(token != null) {
109                 // custom match matched something
110             } else if(c0 == '\'')
111                 token = matchSingleQuoteString();
112             else if(c0 == '\"')
113                 token = matchDoubleQuoteString();
114             else if(c0 == '/' && c1 == '/')
115                 token = matchSingleComment();
116             else if(c0 == '/' && c1 == '*')
117                 token = matchComplexComment();
118             else if(isLetter())
119                 token = matchID();
120             else if(c0 == '(')
121                 token = createNewToken(TOKEN_LPAREN);
122             else if(c0 == ')')
123                 token = createNewToken(TOKEN_RPAREN);
124             else if(c0 == '{')
125                 token = createNewToken(TOKEN_LCURLY);
126             else if(c0 == '}')
127                 token = createNewToken(TOKEN_RCURLY);
128             else if(c0 == '[')
129                 token = createNewToken(TOKEN_LBRACK);
130             else if(c0 == ']')
131                 token = createNewToken(TOKEN_RBRACK);
132             else if(c0 == ':')
133                 token = createNewToken(TOKEN_COLON);
134             else if(c0 == ';')
135                 token = createNewToken(TOKEN_SEMI);
136             else if(!isWhitespace())
137                 token = createNewToken(TOKEN_CHAR);
138 
139             addToken(token);
140         }
141     }
142 
143     protected ATEToken customMatch() {
144         return null;
145     }
146 
147     public void addToken(ATEToken token) {
148         if(token != null) {
149             token.index = tokens.size();
150             tokens.add(token);
151         }
152     }
153 
154     protected ATEToken matchID() {
155         int sp = position;
156         while(isID(c1) && nextCharacter()) {
157         }
158         return createNewToken(TOKEN_ID, sp);
159     }
160 
161     public ATEToken matchSingleQuoteString() {
162         int sp = position;
163         while(nextCharacter()) {
164             if((c0 == '\'' || matchNewLine()) && !controlCharacter) {
165                 return createNewToken(TOKEN_SINGLE_QUOTE_STRING, sp);
166             }
167         }
168         return null;
169     }
170 
171     public ATEToken matchDoubleQuoteString() {
172         int sp = position;
173         while(nextCharacter()) {
174             if((c0 == '\"' || matchNewLine()) && !controlCharacter) {
175                 return createNewToken(TOKEN_DOUBLE_QUOTE_STRING, sp);
176             }
177         }
178         return null;
179     }
180 
181     public ATEToken matchSingleComment() {
182         int sp = position;
183         while(nextCharacter()) {
184             if(matchNewLine()) {
185                 return createNewToken(TOKEN_SINGLE_COMMENT, sp);
186             }
187         }
188         return createNewToken(TOKEN_SINGLE_COMMENT, sp, position);
189     }
190 
191     public ATEToken matchComplexComment() {
192         int sp = position;
193         while(nextCharacter()) {
194             if(c0 == '*' && c1 == '/') {
195                 // Don't forget to eat the next character ;-)
196                 nextCharacter();
197                 return createNewToken(TOKEN_COMPLEX_COMMENT, sp, Math.min(position+1, text.length()));
198             }
199         }
200         // Complex comment terminates also at the end of the text
201         return createNewToken(TOKEN_COMPLEX_COMMENT, sp, position);
202     }
203 
204     public boolean nextCharacter() {
205         boolean valid = false;
206         final int length = text.length();
207         controlCharacter = false;
208 
209         c0 = c1 = 0;
210         position++;
211         if(position < length) {
212             // Skip control character
213             if(text.charAt(position) == '\\') {
214                 controlCharacter = true;
215                 position += 1;
216             }
217 
218             valid = position < length;
219             if(valid) {
220                 c0 = text.charAt(position);
221                 if(position + 1 < length)
222                     c1 = text.charAt(position+1);
223             }
224 
225             if(matchNewLine()) {
226                 lineNumber++;
227                 lineIndex = position+1;
228                 lines.add(new ATELine(lineIndex));
229             }
230         }
231         return valid;
232     }
233 
234     public boolean matchNewLine() {
235         if(c0 == '\n') {
236             // Unix
237             return true;
238         } else if(c0 == '\r' && c1 == '\n') {
239             // Windows
240             return true;
241         } else if(c0 == '\r') {
242             // Mac
243             return true;
244         } else {
245             return false;
246         }
247     }
248 
249     public boolean isWhitespace() {
250         return Character.isWhitespace(c0);
251     }
252 
253     public boolean isLetter() {
254         return Character.isLetter(c0);
255     }
256 
257     public boolean isLetterOrDigit() {
258         return isLetterOrDigit(c0);
259     }
260 
261     public boolean isLetterOrDigit(char c) {
262         return Character.isLetterOrDigit(c);
263     }
264 
265     public boolean isID(char c) {
266         if(Character.isLetterOrDigit(c))
267             return true;
268 
269         return c == '_' || c == '$';
270     }
271 
272     public ATEToken createNewToken(int type) {
273         return createNewToken(type, position);
274     }
275 
276     public ATEToken createNewToken(int type, int start) {
277         return createNewToken(type, start, position+1);
278     }
279 
280     public ATEToken createNewToken(int type, int start, int end) {
281         return createNewToken(type, start, end, lineNumber, lineNumber, lineIndex, lineIndex);
282     }
283 
284     public ATEToken createNewToken(int type, int start, int end,
285                                    int startLineNumber, int endLineNumber,
286                                    int startLineIndex, int endLineIndex) {
287         return new ATEToken(type, start, end, startLineNumber, endLineNumber, startLineIndex, endLineIndex, text);
288     }
289 
290 }
291
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags