KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > sf > saxon > expr > Tokenizer


1 package net.sf.saxon.expr;
2 import net.sf.saxon.functions.NormalizeSpace;
3 import net.sf.saxon.trans.StaticError;
4
5 import java.util.ArrayList JavaDoc;
6 import java.util.List JavaDoc;
7
8 /**
9  * Tokenizer for expressions and inputs.
10  *
11  * This code was originally derived from James Clark's xt, though it has been greatly modified since.
12  * See copyright notice at end of file.
13  */

14
15
16 public final class Tokenizer {
17
18     public int getState() {
19         return state;
20     }
21
22     public void setState(int state) {
23         this.state = state;
24         if (state==DEFAULT_STATE) {
25             // force the followsOperator() test to return true
26
precedingToken = Token.UNKNOWN;
27             currentToken = Token.UNKNOWN;
28         } else if (state==OPERATOR_STATE) {
29             precedingToken = Token.RPAR;
30             currentToken = Token.RPAR;
31         }
32     }
33
34     private int state = DEFAULT_STATE;
35         // we may need to make this a stack at some time
36

37     /**
38      * Initial default state of the Tokenizer
39      */

40     public static final int DEFAULT_STATE = 0;
41
42     /**
43      * State in which a name is NOT to be merged with what comes next, for example "("
44      */

45     public static final int BARE_NAME_STATE = 1;
46
47     /**
48      * State in which the next thing to be read is a SequenceType
49      */

50     public static final int SEQUENCE_TYPE_STATE = 2;
51     /**
52      * State in which the next thing to be read is an operator
53      */

54
55     public static final int OPERATOR_STATE = 3;
56
57     /**
58      * The starting line number (for XPath in XSLT, the line number in the stylesheet)
59      */

60     public int startLineNumber;
61     /**
62      * The number identifying the most recently read token
63      */

64     public int currentToken = Token.EOF;
65     /**
66      * The string value of the most recently read token
67      */

68     public String JavaDoc currentTokenValue = null;
69     /**
70      * The position in the input expression where the current token starts
71      */

72     public int currentTokenStartOffset = 0;
73     /**
74      * The number of the next token to be returned
75      */

76     private int nextToken = Token.EOF;
77     /**
78      * The string value of the next token to be returned
79      */

80     private String JavaDoc nextTokenValue = null;
81     /**
82      * The position in the expression of the start of the next token
83      */

84     private int nextTokenStartOffset = 0;
85     /**
86      * The string being parsed
87      */

88     public String JavaDoc input;
89     /**
90      * The current position within the input string
91      */

92     public int inputOffset = 0;
93     /**
94      * The length of the input string
95      */

96     private int inputLength;
97     /**
98      * The line number (within the expression) of the current token
99      */

100     private int lineNumber = 1;
101     /**
102      * The line number (within the expression) of the next token
103      */

104     private int nextLineNumber = 1;
105
106     /**
107      * List containing the positions (offsets in the input string) at which newline characters
108      * occur
109      */

110
111     private List JavaDoc newlineOffsets = null;
112
113     /**
114      * The token number of the token that preceded the current token
115      */

116     private int precedingToken = Token.UNKNOWN;
117
118
119     //public boolean recognizePragmas = false;
120
//public String lastPragma = null;
121

122     //
123
// Lexical analyser for expressions, queries, and XSLT patterns
124
//
125

126     /**
127      * Prepare a string for tokenization.
128      * The actual tokens are obtained by calls on next()
129      *
130      * @param input the string to be tokenized
131      * @param start start point within the string
132      * @param end end point within the string (last character not read):
133      * -1 means end of string
134      * @exception net.sf.saxon.trans.StaticError if a lexical error occurs, e.g. unmatched
135      * string quotes
136      */

137     public void tokenize(String JavaDoc input, int start, int end, int lineNumber) throws StaticError {
138         nextToken = Token.EOF;
139         nextTokenValue = null;
140         nextTokenStartOffset = 0;
141         inputOffset = start;
142         this.input = input;
143         this.startLineNumber = lineNumber;
144         this.lineNumber = lineNumber;
145         this.nextLineNumber = lineNumber;
146         if (end==-1) {
147             this.inputLength = input.length();
148         } else {
149             this.inputLength = end;
150         }
151
152         // The tokenizer actually reads one token ahead. The raw lexical analysis performed by
153
// the lookAhead() method does not (in general) distinguish names used as QNames from names
154
// used for operators, axes, and functions. The next() routine further refines names into the
155
// correct category, by looking at the following token. In addition, it combines compound tokens
156
// such as "instance of" and "cast as".
157

158         lookAhead();
159         next();
160     }
161
162     //diagnostic version of next(): change real version to realnext()
163
//
164
//public void next() throws XPathException {
165
// realnext();
166
// System.err.println("Token: " + currentToken + "[" + tokens[currentToken] + "]");
167
//}
168

169     /**
170      * Get the next token from the input expression. The type of token is returned in the
171      * currentToken variable, the string value of the token in currentTokenValue.
172      *
173      * @exception net.sf.saxon.trans.StaticError if a lexical error is detected
174      */

175
176     public void next() throws StaticError {
177         precedingToken = currentToken;
178         currentToken = nextToken;
179         currentTokenValue = nextTokenValue;
180         if (currentTokenValue==null) {
181             currentTokenValue="";
182         }
183         currentTokenStartOffset = nextTokenStartOffset;
184         lineNumber = nextLineNumber;
185
186         // disambiguate the current token based on the tokenizer state
187

188         switch (currentToken) {
189             case Token.NAME:
190                 int optype = getBinaryOp(currentTokenValue);
191                 if (optype!=Token.UNKNOWN && !followsOperator()) {
192                     currentToken = optype;
193                 }
194                 break;
195             case Token.LT:
196                 if (followsOperator()) {
197                     currentToken = Token.TAG;
198                 }
199                 break;
200             case Token.STAR:
201                 if (!followsOperator()) {
202                     currentToken = Token.MULT;
203                 }
204                 break;
205         }
206
207         if (currentToken == Token.TAG || currentToken == Token.RCURLY) {
208             // No lookahead after encountering "<" at the start of an XML-like tag.
209
// After an RCURLY, the parser must do an explicit lookahead() to continue
210
// tokenizing; otherwise it can continue with direct character reading
211
return;
212         }
213
214         lookAhead();
215
216         if (currentToken == Token.NAME) {
217             if (state == BARE_NAME_STATE) {
218                 return;
219             }
220             switch (nextToken) {
221                 case Token.LPAR:
222                     int op = getBinaryOp(currentTokenValue);
223                     if (op == Token.UNKNOWN) {
224                         currentToken = getFunctionType(currentTokenValue);
225                         lookAhead(); // swallow the "("
226
} else {
227                         currentToken = op;
228                     }
229                     break;
230
231                 case Token.LCURLY:
232                     if (!(state == SEQUENCE_TYPE_STATE)) {
233                         currentToken = Token.KEYWORD_CURLY;
234                         lookAhead(); // swallow the "{"
235
}
236                     break;
237
238                 case Token.COLONCOLON:
239                     lookAhead();
240                     currentToken = Token.AXIS;
241                     break;
242
243                 case Token.COLONSTAR:
244                     lookAhead();
245                     currentToken = Token.PREFIX;
246                     break;
247
248                 case Token.DOLLAR:
249                     if (currentTokenValue=="for") {
250                         currentToken = Token.FOR;
251                     } else if (currentTokenValue=="some") {
252                         currentToken = Token.SOME;
253                     } else if (currentTokenValue=="every") {
254                         currentToken = Token.EVERY;
255                     } else if (currentTokenValue=="let") {
256                         currentToken = Token.LET;
257                     }
258                     break;
259
260                 case Token.NAME:
261                     int candidate = -1;
262                     if (currentTokenValue.equals("element")) {
263                         candidate = Token.ELEMENT_QNAME;
264                     } else if (currentTokenValue.equals("attribute")) {
265                         candidate = Token.ATTRIBUTE_QNAME;
266                     } else if (currentTokenValue.equals("processing-instruction")) {
267                         candidate = Token.PI_QNAME;
268                     }
269                     if (candidate != -1) {
270                         // <'element' QName '{'> constructor
271
// <'attribute' QName '{'> constructor
272
// <'processing-instruction' QName '{'> constructor
273

274                         String JavaDoc qname = nextTokenValue;
275                         String JavaDoc saveTokenValue = currentTokenValue;
276                         int savePosition = inputOffset;
277                         lookAhead();
278                         if (nextToken == Token.LCURLY) {
279                             currentToken = candidate;
280                             currentTokenValue = qname;
281                             lookAhead();
282                             return;
283                         } else {
284                             // backtrack (we don't have 2-token lookahead; this is the
285
// only case where it's needed. So we backtrack instead.)
286
currentToken = Token.NAME;
287                             currentTokenValue = saveTokenValue;
288                             inputOffset = savePosition;
289                             nextToken = Token.NAME;
290                             nextTokenValue = qname;
291                         }
292
293                     }
294                     String JavaDoc composite = currentTokenValue + ' ' + nextTokenValue;
295                     Integer JavaDoc val = (Integer JavaDoc)Token.doubleKeywords.get(composite);
296                     if (val==null) {
297                         break;
298                     } else {
299                         currentToken = val.intValue();
300                         currentTokenValue = composite;
301                         lookAhead();
302                         return;
303                     }
304                 default:
305                     // no action needed
306
}
307         }
308     }
309
310     /**
311      * Force the current token to be treated as an operator if possible
312      */

313
314     public void treatCurrentAsOperator() {
315         switch (currentToken) {
316             case Token.NAME:
317                 int optype = getBinaryOp(currentTokenValue);
318                 if (optype!=Token.UNKNOWN) {
319                     currentToken = optype;
320                 }
321                 break;
322             case Token.STAR:
323                 currentToken = Token.MULT;
324                 break;
325         }
326     }
327
328     /**
329      * Look ahead by one token. This method does the real tokenization work.
330      * The method is normally called internally, but the XQuery parser also
331      * calls it to resume normal tokenization after dealing with pseudo-XML
332      * syntax.
333      * @exception net.sf.saxon.trans.StaticError if a lexical error occurs
334      */

335     public void lookAhead() throws StaticError {
336         precedingToken = nextToken;
337         nextTokenValue = null;
338         nextTokenStartOffset = inputOffset;
339         for (;;) {
340             if (inputOffset >= inputLength) {
341                 nextToken = Token.EOF;
342                 return;
343             }
344             char c = input.charAt(inputOffset++);
345             switch (c) {
346             case '/':
347                 if (inputOffset < inputLength
348                         && input.charAt(inputOffset) == '/') {
349                     inputOffset++;
350                     nextToken = Token.SLSL;
351                     return;
352                 }
353                 nextToken = Token.SLASH;
354                 return;
355             case ':':
356                 if (inputOffset < inputLength) {
357                     if (input.charAt(inputOffset) == ':') {
358                         inputOffset++;
359                         nextToken = Token.COLONCOLON;
360                         return;
361                     } else if (input.charAt(inputOffset) == '=') {
362                         nextToken = Token.ASSIGN;
363                         inputOffset++;
364                         return;
365                     }
366                 }
367                 throw new StaticError("Unexpected colon at start of token");
368             case '@':
369                 nextToken = Token.AT;
370                 return;
371             case '?':
372                 nextToken = Token.QMARK;
373                 return;
374             case '[':
375                 nextToken = Token.LSQB;
376                 return;
377             case ']':
378                 nextToken = Token.RSQB;
379                 return;
380             case '{':
381                 nextToken = Token.LCURLY;
382                 return;
383             case '}':
384                 nextToken = Token.RCURLY;
385                 return;
386             case ';':
387                 nextToken = Token.SEMICOLON;
388                 state = DEFAULT_STATE;
389                 return;
390             case '(':
391                 if (inputOffset < inputLength && input.charAt(inputOffset) == '#') {
392                     inputOffset++;
393                     int pragmaStart = inputOffset;
394                     int nestingDepth = 1;
395                     while (nestingDepth > 0 && inputOffset < (inputLength-1)) {
396                         if (input.charAt(inputOffset) == '\n') {
397                             incrementLineNumber();
398                         } else if (input.charAt(inputOffset) == '#' &&
399                                input.charAt(inputOffset+1) == ')') {
400                             nestingDepth--;
401                             inputOffset++;
402                         } else if (input.charAt(inputOffset) == '(' &&
403                                input.charAt(inputOffset+1) == '#') {
404                             nestingDepth++;
405                             inputOffset++;
406                         }
407                         inputOffset++;
408                     }
409                     if (nestingDepth > 0) {
410                         throw new StaticError("Unclosed XPath comment");
411                     }
412                     nextToken = Token.PRAGMA;
413                     nextTokenValue = input.substring(pragmaStart, inputOffset-2 );
414                     return;
415                 }
416                 if (inputOffset < inputLength && input.charAt(inputOffset) == ':') {
417                     // XPath comment syntax is (: .... :)
418
// Comments may be nested, and must not be empty
419
inputOffset++;
420                     int nestingDepth = 1;
421                     while (nestingDepth > 0 && inputOffset < (inputLength-1)) {
422                         if (input.charAt(inputOffset) == '\n') {
423                             incrementLineNumber();
424                         } else if (input.charAt(inputOffset) == ':' &&
425                                 input.charAt(inputOffset+1) == ')') {
426                             if (input.charAt(inputOffset-2) == '(' &&
427                                     input.charAt(inputOffset-1) == ':') {
428                                 throw new StaticError("Empty XPath comments are not allowed");
429                             }
430                             nestingDepth--;
431                             inputOffset++;
432                         } else if (input.charAt(inputOffset) == '(' &&
433                                input.charAt(inputOffset+1) == ':') {
434                             nestingDepth++;
435                             inputOffset++;
436                         }
437                         inputOffset++;
438                     }
439                     if (nestingDepth > 0) {
440                         throw new StaticError("Unclosed XPath comment");
441                     }
442                     lookAhead();
443                 } else {
444                     nextToken = Token.LPAR;
445                 }
446                 return;
447             case ')':
448                 nextToken = Token.RPAR;
449                 return;
450             case '+':
451                 nextToken = Token.PLUS;
452                 return;
453             case '-':
454                 nextToken = Token.MINUS; // not detected if part of a name
455
return;
456             case '=':
457                 nextToken = Token.EQUALS;
458                 return;
459             case '!':
460                 if (inputOffset < inputLength
461                         && input.charAt(inputOffset) == '=') {
462                     inputOffset++;
463                     nextToken = Token.NE;
464                     return;
465                 }
466                 throw new StaticError("'!' without '='");
467             case '*':
468                 // disambiguation of MULT and STAR is now done later
469
//if (followsOperator()) {
470
if (inputOffset < inputLength
471                             && input.charAt(inputOffset) == ':') {
472                         inputOffset++;
473                         nextToken = Token.SUFFIX;
474                         // we leave the parser to get the following name as a separate
475
// token, but first check there's no intervening white space
476
if (inputOffset < inputLength) {
477                             char ahead = input.charAt(inputOffset);
478                             if (" \r\t\n".indexOf(ahead) >= 0) {
479                                 throw new StaticError("Whitespace is not allowed after '*:'");
480                             }
481                         }
482                         return;
483                     }
484                     nextToken = Token.STAR;
485                 //} else {
486
// nextToken = MULT;
487
//}
488
return;
489             case ',':
490                 nextToken = Token.COMMA;
491                 return;
492             case '$':
493                 nextToken = Token.DOLLAR;
494                 return;
495             case '|':
496                 nextToken = Token.UNION;
497                 return;
498             case '<':
499                 if (inputOffset < inputLength
500                         && input.charAt(inputOffset) == '=') {
501                     inputOffset++;
502                     nextToken = Token.LE;
503                     return;
504                 }
505                 if (inputOffset < inputLength
506                         && input.charAt(inputOffset) == '<') {
507                     inputOffset++;
508                     nextToken = Token.PRECEDES;
509                     return;
510                 }
511                 nextToken = Token.LT;
512                 return;
513             case '>':
514                 if (inputOffset < inputLength
515                         && input.charAt(inputOffset) == '=') {
516                     inputOffset++;
517                     nextToken = Token.GE;
518                     return;
519                 }
520                 if (inputOffset < inputLength
521                         && input.charAt(inputOffset) == '>') {
522                     inputOffset++;
523                     nextToken = Token.FOLLOWS;
524                     return;
525                 }
526                 nextToken = Token.GT;
527                 return;
528             case '.':
529                 if (inputOffset < inputLength
530                         && input.charAt(inputOffset) == '.') {
531                     inputOffset++;
532                     nextToken = Token.DOTDOT;
533                     return;
534                 }
535                 if (inputOffset == inputLength
536                         || input.charAt(inputOffset) < '0'
537                         || input.charAt(inputOffset) > '9') {
538                     nextToken = Token.DOT;
539                     return;
540                 }
541                 // otherwise drop through: we have a number starting with a decimal point
542
case '0':
543             case '1':
544             case '2':
545             case '3':
546             case '4':
547             case '5':
548             case '6':
549             case '7':
550             case '8':
551             case '9':
552                 // The logic here can return some tokens that are not legitimate numbers,
553
// for example "23e" or "1.0e+". However, this will only happen if the XPath
554
// expression as a whole is syntactically incorrect.
555
// These errors will be caught by the numeric constructor.
556
boolean allowE = true;
557                 boolean allowSign = false;
558                 boolean allowDot = true;
559                 boolean endOfNum = false;
560             numloop:
561                 while (!endOfNum) {
562                     switch (c) {
563                         case '0': case '1': case '2': case '3': case '4':
564                         case '5': case '6': case '7': case '8': case '9':
565                             allowSign = false;
566                             break;
567                         case '.':
568                             if (allowDot) {
569                                 allowDot = false;
570                                 allowSign = false;
571                             } else {
572                                 inputOffset--;
573                                 break numloop;
574                             }
575                             break;
576                         case 'E': case 'e':
577                             if (allowE) {
578                                 allowSign = true;
579                                 allowE = false;
580                             } else {
581                                 inputOffset--;
582                                 break numloop;
583                             }
584                             break;
585                         case '+': case '-':
586                             if (allowSign) {
587                                 allowSign = false;
588                             } else {
589                                 inputOffset--;
590                                 break numloop;
591                             }
592                             break;
593                         default:
594                             if (('a' <= c && c <= 'z') || c>127) {
595                                 // this prevents the famous "10div 3"
596
throw new StaticError("Separator needed after numeric literal");
597                             }
598                             inputOffset--;
599                             break numloop;
600                     }
601                     if (inputOffset >= inputLength) break;
602                     c = input.charAt(inputOffset++);
603                 }
604                 nextTokenValue = input.substring(nextTokenStartOffset, inputOffset);
605                 nextToken = Token.NUMBER;
606                 return;
607             case '"':
608             case '\'':
609                 nextTokenValue = "";
610                 while (true) {
611                     inputOffset = input.indexOf(c, inputOffset);
612                     if (inputOffset < 0) {
613                         inputOffset = nextTokenStartOffset + 1;
614                         throw new StaticError("Unmatched quote in expression");
615                     }
616                     nextTokenValue += input.substring(nextTokenStartOffset + 1, inputOffset++);
617                     // look for doubled delimiters
618
if (inputOffset < inputLength && input.charAt(inputOffset) == c) {
619                         nextTokenValue += c;
620                         nextTokenStartOffset = inputOffset;
621                         inputOffset++;
622                     } else {
623                         break;
624                     }
625                 }
626
627                 // maintain line number if there are newlines in the string
628
if (nextTokenValue.indexOf('\n') >= 0) {
629                     for (int i = 0; i<nextTokenValue.length(); i++) {
630                         if (nextTokenValue.charAt(i) == '\n') {
631                             lineNumber++;
632                             if (newlineOffsets==null) {
633                                 newlineOffsets = new ArrayList JavaDoc(20);
634                             }
635                             newlineOffsets.add(new Integer JavaDoc(nextTokenStartOffset+i));
636                         }
637                     }
638                 }
639                 nextTokenValue = nextTokenValue.intern();
640                 nextToken = Token.STRING_LITERAL;
641                 return;
642             case '\n':
643                 incrementLineNumber();
644                 // drop through
645
case ' ':
646             case '\t':
647             case '\r':
648                 nextTokenStartOffset = inputOffset;
649                 break;
650             default:
651                 if (c < 0x80 && !Character.isLetter(c)) {
652                     throw new StaticError("Invalid character '" + c + "' in expression");
653                 }
654                 /* fall through */
655             case '_':
656             loop:
657                 for (;inputOffset < inputLength; inputOffset++) {
658                     c = input.charAt(inputOffset);
659                     switch (c) {
660                     case ':':
661                         if (inputOffset+1 < inputLength) {
662                             char nc = input.charAt(inputOffset+1);
663                             if (nc == ':') {
664                                 nextTokenValue = input.substring(nextTokenStartOffset,
665                                                                 inputOffset).intern();
666                                 nextToken = Token.AXIS;
667                                 inputOffset+=2;
668                                 return;
669                             } else if (nc == '*') {
670                                 nextTokenValue = input.substring(nextTokenStartOffset,
671                                                                 inputOffset).intern();
672                                 nextToken = Token.PREFIX;
673                                 inputOffset+=2;
674                                 return;
675                             } else if (nc == '=') {
676                                 // as in "let $x:=2"
677
nextTokenValue = input.substring(nextTokenStartOffset,
678                                                                 inputOffset).intern();
679                                 nextToken = Token.NAME;
680                                 return;
681                             }
682                         }
683                         break;
684                     case '.':
685                     case '-':
686                     case '_':
687                         break;
688
689                     default:
690                         if (c < 0x80 && !Character.isLetterOrDigit(c))
691                             break loop;
692                         break;
693                     }
694                 }
695                 nextTokenValue = input.substring(nextTokenStartOffset,
696                                                         inputOffset).intern();
697                 nextToken = Token.NAME;
698                 return;
699             }
700         }
701     }
702
703     /**
704      * Identify a binary operator
705      *
706      * @param s String representation of the operator - must be interned
707      * @return the token number of the operator, or UNKNOWN if it is not a
708      * known operator
709      */

710
711     private static int getBinaryOp(String JavaDoc s) {
712         switch(s.length()) {
713             case 2:
714                 if (s=="or") return Token.OR;
715                 if (s=="is") return Token.IS;
716                 if (s=="to") return Token.TO;
717                 if (s=="in") return Token.IN;
718                 if (s=="eq") return Token.FEQ;
719                 if (s=="ne") return Token.FNE;
720                 if (s=="gt") return Token.FGT;
721                 if (s=="ge") return Token.FGE;
722                 if (s=="lt") return Token.FLT;
723                 if (s=="le") return Token.FLE;
724                 break;
725             case 3:
726                 if (s=="and") return Token.AND;
727                 if (s=="div") return Token.DIV;
728                 if (s=="mod") return Token.MOD;
729                 break;
730             case 4:
731                 if (s=="idiv") return Token.IDIV;
732                 if (s=="then") return Token.THEN;
733                 if (s=="else") return Token.ELSE;
734                 if (s=="case") return Token.CASE;
735                 break;
736             case 5:
737                 if (s=="where") return Token.WHERE;
738                 if (s=="union") return Token.UNION;
739                 break;
740             case 6:
741                 if (s=="except") return Token.EXCEPT;
742                 if (s=="return") return Token.RETURN;
743                 break;
744             case 7:
745                 if (s=="default") return Token.DEFAULT;
746             case 9:
747                 if (s=="intersect") return Token.INTERSECT;
748                 if (s=="satisfies") return Token.SATISFIES;
749                 break;
750         }
751         return Token.UNKNOWN;
752     }
753
754     /**
755      * Distinguish nodekind names, "if", and function names, which are all
756      * followed by a "("
757      *
758      * @param s the name - must be interned
759      * @return the token number
760      */

761
762     private static int getFunctionType(String JavaDoc s) {
763         switch(s.length()) {
764             case 2:
765                 if (s=="if") return Token.IF;
766                 break;
767             case 4:
768                 if (s=="node") return Token.NODEKIND;
769                 if (s=="item") return Token.NODEKIND;
770                 if (s=="text") return Token.NODEKIND;
771                 if (s=="void") return Token.NODEKIND; // TODO: removed from latest draft
772
break;
773             case 7:
774                 if (s=="element") return Token.NODEKIND;
775                 if (s=="comment") return Token.NODEKIND;
776                 break;
777             case 9:
778                 if (s=="attribute") return Token.NODEKIND;
779                 if (s=="namespace") return Token.NODEKIND;
780                 break;
781             case 10:
782                 if (s=="typeswitch") return Token.TYPESWITCH;
783                 break;
784             default:
785                 if (s=="document-node") return Token.NODEKIND;
786                 if (s=="empty-sequence") return Token.NODEKIND;
787                 if (s=="schema-element") return Token.NODEKIND;
788                 if (s=="schema-attribute") return Token.NODEKIND;
789                 if (s=="processing-instruction") return Token.NODEKIND;
790
791                 break;
792         }
793         return Token.FUNCTION;
794     }
795
796     /**
797      * Test whether the previous token is an operator
798      * @return true if the previous token is an operator token
799      */

800
801     private boolean followsOperator() {
802         return precedingToken <= Token.LAST_OPERATOR;
803     }
804
805     /**
806      * Read next character directly. Used by the XQuery parser when parsing pseudo-XML syntax
807      * @return the next character from the input
808      * @throws StringIndexOutOfBoundsException if an attempt is made to read beyond
809      * the end of the string. This will only occur in the event of a syntax error in the
810      * input.
811      */

812
813     public char nextChar() throws StringIndexOutOfBoundsException JavaDoc {
814         char c = input.charAt(inputOffset++);
815         //c = normalizeLineEnding(c);
816
if (c=='\n') {
817             incrementLineNumber();
818             lineNumber++;
819         }
820         return c;
821     }
822
823     /**
824      * Normalize line endings according to the rules in XML 1.1.
825      * @param c the most recently read character. The value of inputOffset must be the immediately following
826      * character
827      * @return c the current character after newline normalization
828      */

829
830 // private char normalizeLineEnding(char c) throws StringIndexOutOfBoundsException {
831
// switch (c) {
832
// case '\r':
833
// if (input.charAt(inputOffset) == '\n' || input.charAt(inputOffset) == 0x85) {
834
// inputOffset++;
835
// return '\n';
836
// } else {
837
// return '\n';
838
// }
839
// case 0x85:
840
// return '\n';
841
// case 0x2028:
842
// return '\n';
843
// default:
844
// return c;
845
// }
846
// }
847

848     /**
849      * Increment the line number, making a record of where in the input string the newline character occurred.
850      */

851
852     private void incrementLineNumber() {
853         nextLineNumber++;
854         if (newlineOffsets==null) {
855             newlineOffsets = new ArrayList JavaDoc(20);
856         }
857         newlineOffsets.add(new Integer JavaDoc(inputOffset-1));
858     }
859
860     /**
861      * Step back one character. If this steps back to a previous line, adjust the line number.
862      */

863
864     public void unreadChar() {
865         if (input.charAt(--inputOffset) == '\n') {
866             nextLineNumber--;
867             lineNumber--;
868             if (newlineOffsets != null) {
869                 newlineOffsets.remove(newlineOffsets.size()-1);
870             }
871         }
872     }
873
874     /**
875      * Get the most recently read text (for use in an error message)
876      */

877
878     public String JavaDoc recentText() {
879         if (inputOffset > inputLength) {
880             inputOffset = inputLength;
881         }
882         if (inputOffset < 34) {
883             return input.substring(0, inputOffset);
884         } else {
885             return NormalizeSpace.normalize(
886                     "..." + input.substring(inputOffset-30, inputOffset)).toString();
887         }
888     }
889
890     /**
891      * Get the line number of the current token
892      */

893
894     public int getLineNumber() {
895         return lineNumber;
896     }
897
898     /**
899      * Get the column number of the current token
900      */

901
902     public int getColumnNumber() {
903         return (int)(getLineAndColumn(currentTokenStartOffset)&0x7fffffff);
904     }
905
906 // --Commented out by Inspection START (16/12/04 14:40):
907
// /**
908
// * Get the line and column number of the current token,
909
// * as a long value with the line number in the top half
910
// * and the column number in the lower half
911
// * @return the line and column number, packed together
912
// */
913
//
914
// public long getLineAndColumn() {
915
// return ((long)getLineNumber()) << 32 | ((long)getColumnNumber());
916
// }
917
// --Commented out by Inspection STOP (16/12/04 14:40)
918

919
920     /**
921      * Get the line and column number corresponding to a given offset in the input expression,
922      * as a long value with the line number in the top half
923      * and the column number in the lower half
924      * @return the line and column number, packed together
925      */

926
927     public long getLineAndColumn(int offset) {
928         if (newlineOffsets==null) {
929             return ((long)startLineNumber) << 32 | (long)offset;
930         }
931         for (int line=newlineOffsets.size()-1; line>=0; line--) {
932             int nloffset = ((Integer JavaDoc)newlineOffsets.get(line)).intValue();
933             if (offset > nloffset) {
934                 return ((long)(line+startLineNumber+1)<<32) | ((long)(offset - nloffset));
935             }
936         }
937         return ((long)startLineNumber) << 32 | (long)(offset+1);
938     }
939
940     public int getLineNumber(int offset) {
941         return (int)((getLineAndColumn(offset))>>32);
942     }
943
944     public int getColumnNumber(int offset) {
945         return (int)((getLineAndColumn(offset))&0x7fffffff);
946     }
947
948 }
949
950 /*
951
952 The following copyright notice is copied from the licence for xt, from which the
953 original version of this module was derived:
954 --------------------------------------------------------------------------------
955 Copyright (c) 1998, 1999 James Clark
956
957 Permission is hereby granted, free of charge, to any person obtaining
958 a copy of this software and associated documentation files (the
959 "Software"), to deal in the Software without restriction, including
960 without limitation the rights to use, copy, modify, merge, publish,
961 distribute, sublicense, and/or sell copies of the Software, and to
962 permit persons to whom the Software is furnished to do so, subject to
963 the following conditions:
964
965 The above copyright notice and this permission notice shall be included
966 in all copies or substantial portions of the Software.
967
968 THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, EXPRESS
969 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
970 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
971 IN NO EVENT SHALL JAMES CLARK BE LIABLE FOR ANY CLAIM, DAMAGES OR
972 OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
973 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
974 OTHER DEALINGS IN THE SOFTWARE.
975
976 Except as contained in this notice, the name of James Clark shall
977 not be used in advertising or otherwise to promote the sale, use or
978 other dealings in this Software without prior written authorization
979 from James Clark.
980 ---------------------------------------------------------------------------
981 */

982
983 //
984
// The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License");
985
// you may not use this file except in compliance with the License. You may obtain a copy of the
986
// License at http://www.mozilla.org/MPL/
987
//
988
// Software distributed under the License is distributed on an "AS IS" basis,
989
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
990
// See the License for the specific language governing rights and limitations under the License.
991
//
992
// The Original Code is: all this file, other than the parts developed by James Clark as part of xt.
993
//
994
// The Initial Developer of the Original Code is Michael H. Kay.
995
//
996
// Portions created by (your name) are Copyright (C) (your legal entity). All Rights Reserved.
997
//
998
// Contributor(s): none.
999
//
Popular Tags