KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > icl > saxon > expr > Tokenizer


1 package com.icl.saxon.expr;
2
3 import java.util.*;
4
5 /**
6 * Tokenizer for patterns.
7 *
8 * This code is copied with minor changes from James Clark's xt.
9 * See copyright notice at end of file.
10 *
11 */

12
13
14 final class Tokenizer {
15     private static final int UNKNOWN = -1;
16     public static final int EOF = 0;
17     public static final int NAME = 1;
18     public static final int FUNCTION = 2;
19     public static final int LITERAL = 3;
20     public static final int VBAR = 4;
21     public static final int SLASH = 5;
22     public static final int AT = 6;
23     public static final int LSQB = 7;
24     public static final int RSQB = 8;
25     public static final int LPAR = 9;
26     public static final int RPAR = 10;
27     public static final int EQUALS = 11;
28     public static final int DOT = 12;
29     public static final int DOTDOT = 13;
30     public static final int STAR = 14;
31     public static final int COMMA = 15;
32     public static final int SLSL = 16;
33     public static final int PREFIX = 17;
34     public static final int OR = 18;
35     public static final int AND = 19;
36     public static final int NUMBER = 20;
37     public static final int GT = 21;
38     public static final int LT = 22;
39     public static final int GE = 23;
40     public static final int LE = 24;
41     public static final int PLUS = 25;
42     public static final int MINUS = 26;
43     public static final int MULT = 27;
44     public static final int DIV = 28;
45     public static final int MOD = 29;
46     public static final int DOLLAR = 31;
47     public static final int NODETYPE = 32;
48     public static final int AXIS = 33;
49     public static final int NE = 34;
50
51     public static final int NEGATE = 99; // unary minus: not actually a token, but we
52
// use token numbers to identify operators.
53

54
55     public static String JavaDoc[] tokens =
56              {"EOF", "<name>", "<function>", "<literal>", "|", "/", "@", "[", "]",
57                                 "(", ")", "=", ".", "..", "*", ",", "//", "^",
58                                 "or", "and", "<number>", ">", "<", ">=", "<=", "+", "-",
59                                 "*", "div", "mod", "--quo--", "$", "<nodetype>()",
60                                 "<axis>()", "!="};
61
62     public int currentToken = EOF;
63     public String JavaDoc currentTokenValue = null;
64     public double currentNumericValue = 0.0;
65
66     private int currentTokenStartIndex = 0;
67     public String JavaDoc pattern;
68     private int patternIndex = 0;
69     private int patternLength;
70
71     private int precedingToken = UNKNOWN;
72
73     //
74
// Lexical analyser for patterns
75
//
76

77     public void tokenize(String JavaDoc pattern) throws XPathException {
78         currentToken = EOF;
79         currentTokenValue = null;
80         currentTokenStartIndex = 0;
81         patternIndex = 0;
82         this.pattern = pattern;
83         this.patternLength = pattern.length();
84         next();
85     }
86
87     //diagnostic version of next(): change real version to realnext()
88
//
89
//public void next() throws XPathException {
90
// realnext();
91
// System.err.println("Token: " + currentToken + "[" + tokens[currentToken] + "]");
92
//}
93

94     public void next() throws XPathException {
95         precedingToken = currentToken;
96         currentTokenValue = null;
97         currentTokenStartIndex = patternIndex;
98         for (;;) {
99             if (patternIndex >= patternLength) {
100                 currentToken = EOF;
101                 return;
102             }
103             char c = pattern.charAt(patternIndex++);
104             switch (c) {
105             case '/':
106                 if (patternIndex < patternLength
107                         && pattern.charAt(patternIndex) == '/') {
108                     patternIndex++;
109                     currentToken = SLSL;
110                     return;
111                 }
112                 currentToken = SLASH;
113                 return;
114             case '@':
115                 currentToken = AT;
116                 return;
117             case '[':
118                 currentToken = LSQB;
119                 return;
120             case ']':
121                 currentToken = RSQB;
122                 return;
123             case '(':
124                 currentToken = LPAR;
125                 return;
126             case ')':
127                 currentToken = RPAR;
128                 return;
129             case '+':
130                 currentToken = PLUS;
131                 return;
132             case '-':
133                 currentToken = MINUS; // not detected if part of a name
134
return;
135             case '=':
136                 currentToken = EQUALS;
137                 return;
138             case '!':
139                 if (patternIndex < patternLength
140                         && pattern.charAt(patternIndex) == '=') {
141                     patternIndex++;
142                     currentToken = NE;
143                     return;
144                 }
145                 throw new XPathException("\"!\" without \"=\" in expression " + pattern);
146             case '*':
147                 if (precedingToken==EOF ||
148                         precedingToken==AT ||
149                         precedingToken==LPAR ||
150                         precedingToken==LSQB ||
151                         precedingToken==COMMA ||
152                         precedingToken==FUNCTION ||
153                         precedingToken==AXIS ||
154                         isOperator(precedingToken)) {
155                     currentToken = STAR;
156                 } else {
157                     currentToken = MULT;
158                 }
159                 return;
160             case ',':
161                 currentToken = COMMA;
162                 return;
163             case '$':
164                 currentToken = DOLLAR;
165                 // disallow white space (other errors will be found by the parser)
166
if (patternIndex < patternLength) {
167                     char ahead = pattern.charAt(patternIndex);
168                     if (" \r\t\n".indexOf(ahead) >= 0) {
169                         throw new XPathException("Whitespace is not allowed after '$' sign");
170                     }
171                 }
172                 return;
173             case '|':
174                 currentToken = VBAR;
175                 return;
176             case '<':
177                 if (patternIndex < patternLength
178                         && pattern.charAt(patternIndex) == '=') {
179                     patternIndex++;
180                     currentToken = LE;
181                     return;
182                 }
183                 currentToken = LT;
184                 return;
185             case '>':
186                 if (patternIndex < patternLength
187                         && pattern.charAt(patternIndex) == '=') {
188                     patternIndex++;
189                     currentToken = GE;
190                     return;
191                 }
192                 currentToken = GT;
193                 return;
194             case '.':
195                 if (patternIndex < patternLength
196                         && pattern.charAt(patternIndex) == '.') {
197                     patternIndex++;
198                     currentToken = DOTDOT;
199                     return;
200                 }
201                 if (patternIndex == patternLength
202                         || pattern.charAt(patternIndex) < '0'
203                         || pattern.charAt(patternIndex) > '9') {
204                     currentToken = DOT;
205                     return;
206                 }
207                 // otherwise drop through: we have a number starting with a decimal point
208
case '0':
209             case '1':
210             case '2':
211             case '3':
212             case '4':
213             case '5':
214             case '6':
215             case '7':
216             case '8':
217             case '9':
218                 for (;patternIndex < patternLength; patternIndex++) {
219                     c = pattern.charAt(patternIndex);
220                     if (!(c=='.' || Character.isDigit(c))) break;
221                 }
222                 currentTokenValue = pattern.substring(currentTokenStartIndex, patternIndex);
223                 try {
224                     currentNumericValue = new Double JavaDoc(currentTokenValue).doubleValue();
225                 } catch (NumberFormatException JavaDoc err) {
226                     throw new XPathException("Invalid number (" + currentTokenValue + ") in expression " + pattern);
227                 }
228                 currentToken = NUMBER;
229                 return;
230             case '"':
231             case '\'':
232                 patternIndex = pattern.indexOf(c, patternIndex);
233                 if (patternIndex < 0) {
234                     patternIndex = currentTokenStartIndex + 1;
235                     throw new XPathException("Unmatched quote in expression " + pattern);
236                 }
237                 currentTokenValue = pattern.substring(currentTokenStartIndex + 1,
238                           patternIndex++).intern();
239                 currentToken = LITERAL;
240                 return;
241             case ' ':
242             case '\t':
243             case '\r':
244             case '\n':
245                 currentTokenStartIndex = patternIndex;
246                 break;
247             default:
248                 if (c < 0x80 && !Character.isLetter(c))
249                     throw new XPathException("Invalid character (" + c + ") in expression " + pattern);
250                 /* fall through */
251             case '_':
252             loop:
253                 for (;patternIndex < patternLength; patternIndex++) {
254                     c = pattern.charAt(patternIndex);
255                     switch (c) {
256                     case ':':
257                         if (patternIndex+1 < patternLength &&
258                                 pattern.charAt(patternIndex+1) == ':') {
259                             currentTokenValue = pattern.substring(currentTokenStartIndex,
260                                                             patternIndex).intern();
261                             currentToken = AXIS;
262                             patternIndex+=2;
263                             return;
264                         }
265                         if (patternIndex+1 < patternLength &&
266                                 pattern.charAt(patternIndex+1) == '*') {
267                             currentTokenValue = pattern.substring(currentTokenStartIndex,
268                                                             patternIndex).intern();
269                             currentToken = PREFIX;
270                             patternIndex+=2;
271                             return;
272                         }
273                         break;
274                     case '.':
275                     case '-':
276                     case '_':
277                         break;
278                     case '(':
279                         currentTokenValue = pattern.substring(currentTokenStartIndex,
280                                                                 patternIndex).intern();
281                         int op = getBinaryOp(currentTokenValue);
282                         if (op != UNKNOWN) {
283                             currentToken = op;
284                             return;
285                         }
286                         patternIndex++; // swallows the '('
287
currentToken = getFunctionType(currentTokenValue);
288                         return;
289                     default:
290                         if (c < 0x80 && !Character.isLetterOrDigit(c))
291                             break loop;
292                         break;
293                     }
294                 }
295                 currentTokenValue = pattern.substring(currentTokenStartIndex,
296                                                         patternIndex).intern();
297             lookahead:
298                 for (int i = patternIndex; i < patternLength; i++) {
299                     switch (pattern.charAt(i)) {
300                     case ' ':
301                     case '\t':
302                     case '\r':
303                     case '\n':
304                         break;
305                     case ':':
306                         if (i+1 < patternLength && pattern.charAt(i+1) == ':') {
307                             currentToken = AXIS;
308                             patternIndex = i+2;
309                             return;
310                         }
311                         break lookahead;
312                     case '(':
313                         int oper = getBinaryOp(currentTokenValue);
314                         if (oper != UNKNOWN) {
315                             currentToken = oper;
316                             return;
317                         } else {
318                             currentToken = getFunctionType(currentTokenValue);
319                             patternIndex = i + 1;
320                             return;
321                         }
322                         /* fall through */
323                     default:
324                         break lookahead;
325                     }
326                 }
327                 int optype = getBinaryOp(currentTokenValue);
328                 if (optype!=UNKNOWN && !
329                          ( precedingToken==EOF ||
330                             precedingToken==AT ||
331                             precedingToken==LPAR ||
332                             precedingToken==LSQB ||
333                             precedingToken==COMMA ||
334                             precedingToken==FUNCTION ||
335                             precedingToken==AXIS ||
336                             precedingToken==DOLLAR ||
337                             isOperator(precedingToken))
338                         ) {
339                     currentToken = optype;
340                 } else {
341                     currentToken = NAME;
342                 }
343                 return;
344             }
345         }
346     }
347
348     /**
349     * Identify a binary operator
350     * @param s String representation of the operator - must be interned
351     */

352
353     static private int getBinaryOp(String JavaDoc s) {
354         if (s=="and") return AND;
355         if (s=="or") return OR;
356         if (s=="div") return DIV;
357         if (s=="mod") return MOD;
358         return UNKNOWN;
359     }
360
361     /**
362     * Distinguish axis names, nodetype names, and function names, which appear in the
363     * same syntactic context
364     * @param s the name - must be interned
365     */

366
367     static private int getFunctionType(String JavaDoc s) {
368         if (s=="node") return NODETYPE;
369         if (s=="text") return NODETYPE;
370         if (s=="comment") return NODETYPE;
371         if (s=="processing-instruction") return NODETYPE;
372         return FUNCTION;
373     }
374
375     /**
376     * Test whether a token is an operator
377     */

378
379     static private boolean isOperator(int tok) {
380         return (
381             tok==SLASH || tok==SLSL || tok==VBAR ||
382             tok==EQUALS || tok==OR || tok==AND || tok==GT || tok==LT || tok==NE ||
383             tok==GE || tok==LE || tok==PLUS || tok==MINUS || tok==MULT || tok==DIV ||
384             tok==MOD );
385     }
386 }
387
388 /*
389
390 The following copyright notice is copied from the licence for xt, from which the
391 original version of this module was derived:
392 --------------------------------------------------------------------------------
393 Copyright (c) 1998, 1999 James Clark
394
395 Permission is hereby granted, free of charge, to any person obtaining
396 a copy of this software and associated documentation files (the
397 "Software"), to deal in the Software without restriction, including
398 without limitation the rights to use, copy, modify, merge, publish,
399 distribute, sublicense, and/or sell copies of the Software, and to
400 permit persons to whom the Software is furnished to do so, subject to
401 the following conditions:
402
403 The above copyright notice and this permission notice shall be included
404 in all copies or substantial portions of the Software.
405
406 THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, EXPRESS
407 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
408 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
409 IN NO EVENT SHALL JAMES CLARK BE LIABLE FOR ANY CLAIM, DAMAGES OR
410 OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
411 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
412 OTHER DEALINGS IN THE SOFTWARE.
413
414 Except as contained in this notice, the name of James Clark shall
415 not be used in advertising or otherwise to promote the sale, use or
416 other dealings in this Software without prior written authorization
417 from James Clark.
418 ---------------------------------------------------------------------------
419 */

420
Popular Tags