Tokenizer


1   package net.sf.saxon.expr;
2   import net.sf.saxon.functions.NormalizeSpace;
3   import net.sf.saxon.trans.StaticError;
4   
5   import java.util.ArrayList  ;
6   import java.util.List  ;
7   
8   /**
9    * Tokenizer for expressions and inputs.
10   *
11   * This code was originally derived from James Clark's xt, though it has been greatly modified since.
12   * See copyright notice at end of file.
13   */
14  
15  
16  public final class Tokenizer {
17  
18      public int getState() {
19          return state;
20      }
21  
22      public void setState(int state) {
23          this.state = state;
24          if (state==DEFAULT_STATE) {
25              // force the followsOperator() test to return true
26              precedingToken = Token.UNKNOWN;
27              currentToken = Token.UNKNOWN;
28          } else if (state==OPERATOR_STATE) {
29              precedingToken = Token.RPAR;
30              currentToken = Token.RPAR;
31          }
32      }
33  
34      private int state = DEFAULT_STATE;
35          // we may need to make this a stack at some time
36  
37      /**
38       * Initial default state of the Tokenizer
39       */
40      public static final int DEFAULT_STATE = 0;
41  
42      /**
43       * State in which a name is NOT to be merged with what comes next, for example "("
44       */
45      public static final int BARE_NAME_STATE = 1;
46  
47      /**
48       * State in which the next thing to be read is a SequenceType
49       */
50      public static final int SEQUENCE_TYPE_STATE = 2;
51      /**
52       * State in which the next thing to be read is an operator
53       */
54  
55      public static final int OPERATOR_STATE = 3;
56  
57      /**
58       * The starting line number (for XPath in XSLT, the line number in the stylesheet)
59       */
60      public int startLineNumber;
61      /**
62       * The number identifying the most recently read token
63       */
64      public int currentToken = Token.EOF;
65      /**
66       * The string value of the most recently read token
67       */
68      public String   currentTokenValue = null;
69      /**
70       * The position in the input expression where the current token starts
71       */
72      public int currentTokenStartOffset = 0;
73      /**
74       * The number of the next token to be returned
75       */
76      private int nextToken = Token.EOF;
77      /**
78       * The string value of the next token to be returned
79       */
80      private String   nextTokenValue = null;
81      /**
82       * The position in the expression of the start of the next token
83       */
84      private int nextTokenStartOffset = 0;
85      /**
86       * The string being parsed
87       */
88      public String   input;
89      /**
90       * The current position within the input string
91       */
92      public int inputOffset = 0;
93      /**
94       * The length of the input string
95       */
96      private int inputLength;
97      /**
98       * The line number (within the expression) of the current token
99       */
100     private int lineNumber = 1;
101     /**
102      * The line number (within the expression) of the next token
103      */
104     private int nextLineNumber = 1;
105 
106     /**
107      * List containing the positions (offsets in the input string) at which newline characters
108      * occur
109      */
110 
111     private List   newlineOffsets = null;
112 
113     /**
114      * The token number of the token that preceded the current token
115      */
116     private int precedingToken = Token.UNKNOWN;
117 
118 
119     //public boolean recognizePragmas = false;
120     //public String lastPragma = null;
121 
122     //
123     // Lexical analyser for expressions, queries, and XSLT patterns
124     //
125 
126     /**
127      * Prepare a string for tokenization.
128      * The actual tokens are obtained by calls on next()
129      *
130      * @param input the string to be tokenized
131      * @param start start point within the string
132      * @param end end point within the string (last character not read):
133      * -1 means end of string
134      * @exception net.sf.saxon.trans.StaticError if a lexical error occurs, e.g. unmatched
135      *     string quotes
136      */
137     public void tokenize(String   input, int start, int end, int lineNumber) throws StaticError {
138         nextToken = Token.EOF;
139         nextTokenValue = null;
140         nextTokenStartOffset = 0;
141         inputOffset = start;
142         this.input = input;
143         this.startLineNumber = lineNumber;
144         this.lineNumber = lineNumber;
145         this.nextLineNumber = lineNumber;
146         if (end==-1) {
147             this.inputLength = input.length();
148         } else {
149             this.inputLength = end;
150         }
151 
152         // The tokenizer actually reads one token ahead. The raw lexical analysis performed by
153         // the lookAhead() method does not (in general) distinguish names used as QNames from names
154         // used for operators, axes, and functions. The next() routine further refines names into the
155         // correct category, by looking at the following token. In addition, it combines compound tokens
156         // such as "instance of" and "cast as".
157 
158         lookAhead();
159         next();
160     }
161 
162     //diagnostic version of next(): change real version to realnext()
163     //
164     //public void next() throws XPathException {
165     //    realnext();
166     //    System.err.println("Token: " + currentToken + "[" + tokens[currentToken] + "]");
167     //}
168 
169     /**
170      * Get the next token from the input expression. The type of token is returned in the
171      * currentToken variable, the string value of the token in currentTokenValue.
172      *
173      * @exception net.sf.saxon.trans.StaticError if a lexical error is detected
174      */
175 
176     public void next() throws StaticError {
177         precedingToken = currentToken;
178         currentToken = nextToken;
179         currentTokenValue = nextTokenValue;
180         if (currentTokenValue==null) {
181             currentTokenValue="";
182         }
183         currentTokenStartOffset = nextTokenStartOffset;
184         lineNumber = nextLineNumber;
185 
186         // disambiguate the current token based on the tokenizer state
187 
188         switch (currentToken) {
189             case Token.NAME:
190                 int optype = getBinaryOp(currentTokenValue);
191                 if (optype!=Token.UNKNOWN && !followsOperator()) {
192                     currentToken = optype;
193                 }
194                 break;
195             case Token.LT:
196                 if (followsOperator()) {
197                     currentToken = Token.TAG;
198                 }
199                 break;
200             case Token.STAR:
201                 if (!followsOperator()) {
202                     currentToken = Token.MULT;
203                 }
204                 break;
205         }
206 
207         if (currentToken == Token.TAG || currentToken == Token.RCURLY) {
208             // No lookahead after encountering "<" at the start of an XML-like tag.
209             // After an RCURLY, the parser must do an explicit lookahead() to continue
210             // tokenizing; otherwise it can continue with direct character reading
211             return;
212         }
213 
214         lookAhead();
215 
216         if (currentToken == Token.NAME) {
217             if (state == BARE_NAME_STATE) {
218                 return;
219             }
220             switch (nextToken) {
221                 case Token.LPAR:
222                     int op = getBinaryOp(currentTokenValue);
223                     if (op == Token.UNKNOWN) {
224                         currentToken = getFunctionType(currentTokenValue);
225                         lookAhead();    // swallow the "("
226                     } else {
227                         currentToken = op;
228                     }
229                     break;
230 
231                 case Token.LCURLY:
232                     if (!(state == SEQUENCE_TYPE_STATE)) {
233                         currentToken = Token.KEYWORD_CURLY;
234                         lookAhead();        // swallow the "{"
235                     }
236                     break;
237 
238                 case Token.COLONCOLON:
239                     lookAhead();
240                     currentToken = Token.AXIS;
241                     break;
242 
243                 case Token.COLONSTAR:
244                     lookAhead();
245                     currentToken = Token.PREFIX;
246                     break;
247 
248                 case Token.DOLLAR:
249                     if (currentTokenValue=="for") {
250                         currentToken = Token.FOR;
251                     } else if (currentTokenValue=="some") {
252                         currentToken = Token.SOME;
253                     } else if (currentTokenValue=="every") {
254                         currentToken = Token.EVERY;
255                     } else if (currentTokenValue=="let") {
256                         currentToken = Token.LET;
257                     }
258                     break;
259 
260                 case Token.NAME:
261                     int candidate = -1;
262                     if (currentTokenValue.equals("element")) {
263                         candidate = Token.ELEMENT_QNAME;
264                     } else if (currentTokenValue.equals("attribute")) {
265                         candidate = Token.ATTRIBUTE_QNAME;
266                     } else if (currentTokenValue.equals("processing-instruction")) {
267                         candidate = Token.PI_QNAME;
268                     }
269                     if (candidate != -1) {
270                         // <'element' QName '{'> constructor
271                         // <'attribute' QName '{'> constructor
272                         // <'processing-instruction' QName '{'> constructor
273 
274                         String   qname = nextTokenValue;
275                         String   saveTokenValue = currentTokenValue;
276                         int savePosition = inputOffset;
277                         lookAhead();
278                         if (nextToken == Token.LCURLY) {
279                             currentToken = candidate;
280                             currentTokenValue = qname;
281                             lookAhead();
282                             return;
283                         } else {
284                             // backtrack (we don't have 2-token lookahead; this is the
285                             // only case where it's needed. So we backtrack instead.)
286                             currentToken = Token.NAME;
287                             currentTokenValue = saveTokenValue;
288                             inputOffset = savePosition;
289                             nextToken = Token.NAME;
290                             nextTokenValue = qname;
291                         }
292 
293                     }
294                     String   composite = currentTokenValue + ' ' + nextTokenValue;
295                     Integer   val = (Integer  )Token.doubleKeywords.get(composite);
296                     if (val==null) {
297                         break;
298                     } else {
299                         currentToken = val.intValue();
300                         currentTokenValue = composite;
301                         lookAhead();
302                         return;
303                     }
304                 default:
305                     // no action needed
306             }
307         }
308     }
309 
310     /**
311      * Force the current token to be treated as an operator if possible
312      */
313 
314     public void treatCurrentAsOperator() {
315         switch (currentToken) {
316             case Token.NAME:
317                 int optype = getBinaryOp(currentTokenValue);
318                 if (optype!=Token.UNKNOWN) {
319                     currentToken = optype;
320                 }
321                 break;
322             case Token.STAR:
323                 currentToken = Token.MULT;
324                 break;
325         }
326     }
327 
328     /**
329      * Look ahead by one token. This method does the real tokenization work.
330      * The method is normally called internally, but the XQuery parser also
331      * calls it to resume normal tokenization after dealing with pseudo-XML
332      * syntax.
333      * @exception net.sf.saxon.trans.StaticError if a lexical error occurs
334      */
335     public void lookAhead() throws StaticError {
336         precedingToken = nextToken;
337         nextTokenValue = null;
338         nextTokenStartOffset = inputOffset;
339         for (;;) {
340             if (inputOffset >= inputLength) {
341                 nextToken = Token.EOF;
342                 return;
343             }
344             char c = input.charAt(inputOffset++);
345             switch (c) {
346             case '/':
347                 if (inputOffset < inputLength
348                         && input.charAt(inputOffset) == '/') {
349                     inputOffset++;
350                     nextToken = Token.SLSL;
351                     return;
352                 }
353                 nextToken = Token.SLASH;
354                 return;
355             case ':':
356                 if (inputOffset < inputLength) {
357                     if (input.charAt(inputOffset) == ':') {
358                         inputOffset++;
359                         nextToken = Token.COLONCOLON;
360                         return;
361                     } else if (input.charAt(inputOffset) == '=') {
362                         nextToken = Token.ASSIGN;
363                         inputOffset++;
364                         return;
365                     }
366                 }
367                 throw new StaticError("Unexpected colon at start of token");
368             case '@':
369                 nextToken = Token.AT;
370                 return;
371             case '?':
372                 nextToken = Token.QMARK;
373                 return;
374             case '[':
375                 nextToken = Token.LSQB;
376                 return;
377             case ']':
378                 nextToken = Token.RSQB;
379                 return;
380             case '{':
381                 nextToken = Token.LCURLY;
382                 return;
383             case '}':
384                 nextToken = Token.RCURLY;
385                 return;
386             case ';':
387                 nextToken = Token.SEMICOLON;
388                 state = DEFAULT_STATE;
389                 return;
390             case '(':
391                 if (inputOffset < inputLength && input.charAt(inputOffset) == '#') {
392                     inputOffset++;
393                     int pragmaStart = inputOffset;
394                     int nestingDepth = 1;
395                     while (nestingDepth > 0 && inputOffset < (inputLength-1)) {
396                         if (input.charAt(inputOffset) == '\n') {
397                             incrementLineNumber();
398                         } else if (input.charAt(inputOffset) == '#' &&
399                                input.charAt(inputOffset+1) == ')') {
400                             nestingDepth--;
401                             inputOffset++;
402                         } else if (input.charAt(inputOffset) == '(' &&
403                                input.charAt(inputOffset+1) == '#') {
404                             nestingDepth++;
405                             inputOffset++;
406                         }
407                         inputOffset++;
408                     }
409                     if (nestingDepth > 0) {
410                         throw new StaticError("Unclosed XPath comment");
411                     }
412                     nextToken = Token.PRAGMA;
413                     nextTokenValue = input.substring(pragmaStart, inputOffset-2 );
414                     return;
415                 }
416                 if (inputOffset < inputLength && input.charAt(inputOffset) == ':') {
417                     // XPath comment syntax is (: .... :)
418                     // Comments may be nested, and must not be empty
419                     inputOffset++;
420                     int nestingDepth = 1;
421                     while (nestingDepth > 0 && inputOffset < (inputLength-1)) {
422                         if (input.charAt(inputOffset) == '\n') {
423                             incrementLineNumber();
424                         } else if (input.charAt(inputOffset) == ':' &&
425                                 input.charAt(inputOffset+1) == ')') {
426                             if (input.charAt(inputOffset-2) == '(' &&
427                                     input.charAt(inputOffset-1) == ':') {
428                                 throw new StaticError("Empty XPath comments are not allowed");
429                             }
430                             nestingDepth--;
431                             inputOffset++;
432                         } else if (input.charAt(inputOffset) == '(' &&
433                                input.charAt(inputOffset+1) == ':') {
434                             nestingDepth++;
435                             inputOffset++;
436                         }
437                         inputOffset++;
438                     }
439                     if (nestingDepth > 0) {
440                         throw new StaticError("Unclosed XPath comment");
441                     }
442                     lookAhead();
443                 } else {
444                     nextToken = Token.LPAR;
445                 }
446                 return;
447             case ')':
448                 nextToken = Token.RPAR;
449                 return;
450             case '+':
451                 nextToken = Token.PLUS;
452                 return;
453             case '-':
454                 nextToken = Token.MINUS;   // not detected if part of a name
455                 return;
456             case '=':
457                 nextToken = Token.EQUALS;
458                 return;
459             case '!':
460                 if (inputOffset < inputLength
461                         && input.charAt(inputOffset) == '=') {
462                     inputOffset++;
463                     nextToken = Token.NE;
464                     return;
465                 }
466                 throw new StaticError("'!' without '='");
467             case '*':
468                 // disambiguation of MULT and STAR is now done later
469                 //if (followsOperator()) {
470                     if (inputOffset < inputLength
471                             && input.charAt(inputOffset) == ':') {
472                         inputOffset++;
473                         nextToken = Token.SUFFIX;
474                         // we leave the parser to get the following name as a separate
475                         // token, but first check there's no intervening white space
476                         if (inputOffset < inputLength) {
477                             char ahead = input.charAt(inputOffset);
478                             if (" \r\t\n".indexOf(ahead) >= 0) {
479                                 throw new StaticError("Whitespace is not allowed after '*:'");
480                             }
481                         }
482                         return;
483                     }
484                     nextToken = Token.STAR;
485                 //} else {
486                 //    nextToken = MULT;
487                 //}
488                 return;
489             case ',':
490                 nextToken = Token.COMMA;
491                 return;
492             case '$':
493                 nextToken = Token.DOLLAR;
494                 return;
495             case '|':
496                 nextToken = Token.UNION;
497                 return;
498             case '<':
499                 if (inputOffset < inputLength
500                         && input.charAt(inputOffset) == '=') {
501                     inputOffset++;
502                     nextToken = Token.LE;
503                     return;
504                 }
505                 if (inputOffset < inputLength
506                         && input.charAt(inputOffset) == '<') {
507                     inputOffset++;
508                     nextToken = Token.PRECEDES;
509                     return;
510                 }
511                 nextToken = Token.LT;
512                 return;
513             case '>':
514                 if (inputOffset < inputLength
515                         && input.charAt(inputOffset) == '=') {
516                     inputOffset++;
517                     nextToken = Token.GE;
518                     return;
519                 }
520                 if (inputOffset < inputLength
521                         && input.charAt(inputOffset) == '>') {
522                     inputOffset++;
523                     nextToken = Token.FOLLOWS;
524                     return;
525                 }
526                 nextToken = Token.GT;
527                 return;
528             case '.':
529                 if (inputOffset < inputLength
530                         && input.charAt(inputOffset) == '.') {
531                     inputOffset++;
532                     nextToken = Token.DOTDOT;
533                     return;
534                 }
535                 if (inputOffset == inputLength
536                         || input.charAt(inputOffset) < '0'
537                         || input.charAt(inputOffset) > '9') {
538                     nextToken = Token.DOT;
539                     return;
540                 }
541                 // otherwise drop through: we have a number starting with a decimal point
542             case '0':
543             case '1':
544             case '2':
545             case '3':
546             case '4':
547             case '5':
548             case '6':
549             case '7':
550             case '8':
551             case '9':
552                 // The logic here can return some tokens that are not legitimate numbers,
553                 // for example "23e" or "1.0e+". However, this will only happen if the XPath
554                 // expression as a whole is syntactically incorrect.
555                 // These errors will be caught by the numeric constructor.
556                 boolean allowE = true;
557                 boolean allowSign = false;
558                 boolean allowDot = true;
559                 boolean endOfNum = false;
560             numloop:
561                 while (!endOfNum) {
562                     switch (c) {
563                         case '0': case '1': case '2': case '3': case '4':
564                         case '5': case '6': case '7': case '8': case '9':
565                             allowSign = false;
566                             break;
567                         case '.':
568                             if (allowDot) {
569                                 allowDot = false;
570                                 allowSign = false;
571                             } else {
572                                 inputOffset--;
573                                 break numloop;
574                             }
575                             break;
576                         case 'E': case 'e':
577                             if (allowE) {
578                                 allowSign = true;
579                                 allowE = false;
580                             } else {
581                                 inputOffset--;
582                                 break numloop;
583                             }
584                             break;
585                         case '+': case '-':
586                             if (allowSign) {
587                                 allowSign = false;
588                             } else {
589                                 inputOffset--;
590                                 break numloop;
591                             }
592                             break;
593                         default:
594                             if (('a' <= c && c <= 'z') || c>127) {
595                                 // this prevents the famous "10div 3"
596                                 throw new StaticError("Separator needed after numeric literal");
597                             }
598                             inputOffset--;
599                             break numloop;
600                     }
601                     if (inputOffset >= inputLength) break;
602                     c = input.charAt(inputOffset++);
603                 }
604                 nextTokenValue = input.substring(nextTokenStartOffset, inputOffset);
605                 nextToken = Token.NUMBER;
606                 return;
607             case '"':
608             case '\'':
609                 nextTokenValue = "";
610                 while (true) {
611                     inputOffset = input.indexOf(c, inputOffset);
612                     if (inputOffset < 0) {
613                         inputOffset = nextTokenStartOffset + 1;
614                         throw new StaticError("Unmatched quote in expression");
615                     }
616                     nextTokenValue += input.substring(nextTokenStartOffset + 1, inputOffset++);
617                     // look for doubled delimiters
618                     if (inputOffset < inputLength && input.charAt(inputOffset) == c) {
619                         nextTokenValue += c;
620                         nextTokenStartOffset = inputOffset;
621                         inputOffset++;
622                     } else {
623                         break;
624                     }
625                 }
626 
627                 // maintain line number if there are newlines in the string
628                 if (nextTokenValue.indexOf('\n') >= 0) {
629                     for (int i = 0; i<nextTokenValue.length(); i++) {
630                         if (nextTokenValue.charAt(i) == '\n') {
631                             lineNumber++;
632                             if (newlineOffsets==null) {
633                                 newlineOffsets = new ArrayList  (20);
634                             }
635                             newlineOffsets.add(new Integer  (nextTokenStartOffset+i));
636                         }
637                     }
638                 }
639                 nextTokenValue = nextTokenValue.intern();
640                 nextToken = Token.STRING_LITERAL;
641                 return;
642             case '\n':
643                 incrementLineNumber();
644                 // drop through
645             case ' ':
646             case '\t':
647             case '\r':
648                 nextTokenStartOffset = inputOffset;
649                 break;
650             default:
651                 if (c < 0x80 && !Character.isLetter(c)) {
652                     throw new StaticError("Invalid character '" + c + "' in expression");
653                 }
654                 /* fall through */
655             case '_':
656             loop:
657                 for (;inputOffset < inputLength; inputOffset++) {
658                     c = input.charAt(inputOffset);
659                     switch (c) {
660                     case ':':
661                         if (inputOffset+1 < inputLength) {
662                             char nc = input.charAt(inputOffset+1);
663                             if (nc == ':') {
664                                 nextTokenValue = input.substring(nextTokenStartOffset,
665                                                                 inputOffset).intern();
666                                 nextToken = Token.AXIS;
667                                 inputOffset+=2;
668                                 return;
669                             } else if (nc == '*') {
670                                 nextTokenValue = input.substring(nextTokenStartOffset,
671                                                                 inputOffset).intern();
672                                 nextToken = Token.PREFIX;
673                                 inputOffset+=2;
674                                 return;
675                             } else if (nc == '=') {
676                                 // as in "let $x:=2"
677                                 nextTokenValue = input.substring(nextTokenStartOffset,
678                                                                 inputOffset).intern();
679                                 nextToken = Token.NAME;
680                                 return;
681                             }
682                         }
683                         break;
684                     case '.':
685                     case '-':
686                     case '_':
687                         break;
688 
689                     default:
690                         if (c < 0x80 && !Character.isLetterOrDigit(c))
691                             break loop;
692                         break;
693                     }
694                 }
695                 nextTokenValue = input.substring(nextTokenStartOffset,
696                                                         inputOffset).intern();
697                 nextToken = Token.NAME;
698                 return;
699             }
700         }
701     }
702 
703     /**
704      * Identify a binary operator
705      *
706      * @param s String representation of the operator - must be interned
707      * @return the token number of the operator, or UNKNOWN if it is not a
708      *     known operator
709      */
710 
711     private static int getBinaryOp(String   s) {
712         switch(s.length()) {
713             case 2:
714                 if (s=="or") return Token.OR;
715                 if (s=="is") return Token.IS;
716                 if (s=="to") return Token.TO;
717                 if (s=="in") return Token.IN;
718                 if (s=="eq") return Token.FEQ;
719                 if (s=="ne") return Token.FNE;
720                 if (s=="gt") return Token.FGT;
721                 if (s=="ge") return Token.FGE;
722                 if (s=="lt") return Token.FLT;
723                 if (s=="le") return Token.FLE;
724                 break;
725             case 3:
726                 if (s=="and") return Token.AND;
727                 if (s=="div") return Token.DIV;
728                 if (s=="mod") return Token.MOD;
729                 break;
730             case 4:
731                 if (s=="idiv") return Token.IDIV;
732                 if (s=="then") return Token.THEN;
733                 if (s=="else") return Token.ELSE;
734                 if (s=="case") return Token.CASE;
735                 break;
736             case 5:
737                 if (s=="where") return Token.WHERE;
738                 if (s=="union") return Token.UNION;
739                 break;
740             case 6:
741                 if (s=="except") return Token.EXCEPT;
742                 if (s=="return") return Token.RETURN;
743                 break;
744             case 7:
745                 if (s=="default") return Token.DEFAULT;
746             case 9:
747                 if (s=="intersect") return Token.INTERSECT;
748                 if (s=="satisfies") return Token.SATISFIES;
749                 break;
750         }
751         return Token.UNKNOWN;
752     }
753 
754     /**
755      * Distinguish nodekind names, "if", and function names, which are all
756      * followed by a "("
757      *
758      * @param s the name - must be interned
759      * @return the token number
760      */
761 
762     private static int getFunctionType(String   s) {
763         switch(s.length()) {
764             case 2:
765                 if (s=="if") return Token.IF;
766                 break;
767             case 4:
768                 if (s=="node") return Token.NODEKIND;
769                 if (s=="item") return Token.NODEKIND;
770                 if (s=="text") return Token.NODEKIND;
771                 if (s=="void") return Token.NODEKIND;   // TODO: removed from latest draft
772                 break;
773             case 7:
774                 if (s=="element") return Token.NODEKIND;
775                 if (s=="comment") return Token.NODEKIND;
776                 break;
777             case 9:
778                 if (s=="attribute") return Token.NODEKIND;
779                 if (s=="namespace") return Token.NODEKIND;
780                 break;
781             case 10:
782                 if (s=="typeswitch") return Token.TYPESWITCH;
783                 break;
784             default:
785                 if (s=="document-node") return Token.NODEKIND;
786                 if (s=="empty-sequence") return Token.NODEKIND;
787                 if (s=="schema-element") return Token.NODEKIND;
788                 if (s=="schema-attribute") return Token.NODEKIND;
789                 if (s=="processing-instruction") return Token.NODEKIND;
790 
791                 break;
792         }
793         return Token.FUNCTION;
794     }
795 
796     /**
797      * Test whether the previous token is an operator
798      * @return true if the previous token is an operator token
799      */
800 
801     private boolean followsOperator() {
802         return precedingToken <= Token.LAST_OPERATOR;
803     }
804 
805     /**
806      * Read next character directly. Used by the XQuery parser when parsing pseudo-XML syntax
807      * @return the next character from the input
808      * @throws StringIndexOutOfBoundsException if an attempt is made to read beyond
809      * the end of the string. This will only occur in the event of a syntax error in the
810      * input.
811      */
812 
813     public char nextChar() throws StringIndexOutOfBoundsException   {
814         char c = input.charAt(inputOffset++);
815         //c = normalizeLineEnding(c);
816         if (c=='\n') {
817             incrementLineNumber();
818             lineNumber++;
819         }
820         return c;
821     }
822 
823     /**
824      * Normalize line endings according to the rules in XML 1.1.
825      * @param c the most recently read character. The value of inputOffset must be the immediately following
826      * character
827      * @return c the current character after newline normalization
828      */
829 
830 //    private char normalizeLineEnding(char c) throws StringIndexOutOfBoundsException {
831 //        switch (c)  {
832 //            case '\r':
833 //                if (input.charAt(inputOffset) == '\n' || input.charAt(inputOffset) == 0x85) {
834 //                    inputOffset++;
835 //                    return '\n';
836 //                } else {
837 //                    return '\n';
838 //                }
839 //            case 0x85:
840 //                return '\n';
841 //            case 0x2028:
842 //                return '\n';
843 //            default:
844 //                return c;
845 //        }
846 //    }
847 
848     /**
849      * Increment the line number, making a record of where in the input string the newline character occurred.
850      */
851 
852     private void incrementLineNumber() {
853         nextLineNumber++;
854         if (newlineOffsets==null) {
855             newlineOffsets = new ArrayList  (20);
856         }
857         newlineOffsets.add(new Integer  (inputOffset-1));
858     }
859 
860     /**
861      * Step back one character. If this steps back to a previous line, adjust the line number.
862      */
863 
864     public void unreadChar() {
865         if (input.charAt(--inputOffset) == '\n') {
866             nextLineNumber--;
867             lineNumber--;
868             if (newlineOffsets != null) {
869                 newlineOffsets.remove(newlineOffsets.size()-1);
870             }
871         }
872     }
873 
874     /**
875      * Get the most recently read text (for use in an error message)
876      */
877 
878     public String   recentText() {
879         if (inputOffset > inputLength) {
880             inputOffset = inputLength;
881         }
882         if (inputOffset < 34) {
883             return input.substring(0, inputOffset);
884         } else {
885             return NormalizeSpace.normalize(
886                     "..." + input.substring(inputOffset-30, inputOffset)).toString();
887         }
888     }
889 
890     /**
891      * Get the line number of the current token
892      */
893 
894     public int getLineNumber() {
895         return lineNumber;
896     }
897 
898     /**
899      * Get the column number of the current token
900      */
901 
902     public int getColumnNumber() {
903         return (int)(getLineAndColumn(currentTokenStartOffset)&0x7fffffff);
904     }
905 
906 // --Commented out by Inspection START (16/12/04 14:40):
907 //    /**
908 //     * Get the line and column number of the current token,
909 //     * as a long value with the line number in the top half
910 //     * and the column number in the lower half
911 //     * @return the line and column number, packed together
912 //     */
913 //
914 //    public long getLineAndColumn() {
915 //        return ((long)getLineNumber()) << 32 | ((long)getColumnNumber());
916 //    }
917 // --Commented out by Inspection STOP (16/12/04 14:40)
918 
919 
920     /**
921      * Get the line and column number corresponding to a given offset in the input expression,
922      * as a long value with the line number in the top half
923      * and the column number in the lower half
924      * @return the line and column number, packed together
925      */
926 
927     public long getLineAndColumn(int offset) {
928         if (newlineOffsets==null) {
929             return ((long)startLineNumber) << 32 | (long)offset;
930         }
931         for (int line=newlineOffsets.size()-1; line>=0; line--) {
932             int nloffset = ((Integer  )newlineOffsets.get(line)).intValue();
933             if (offset > nloffset) {
934                 return ((long)(line+startLineNumber+1)<<32) | ((long)(offset - nloffset));
935             }
936         }
937         return ((long)startLineNumber) << 32 | (long)(offset+1);
938     }
939 
940     public int getLineNumber(int offset) {
941         return (int)((getLineAndColumn(offset))>>32);
942     }
943 
944     public int getColumnNumber(int offset) {
945         return (int)((getLineAndColumn(offset))&0x7fffffff);
946     }
947 
948 }
949 
950 /*
951 
952 The following copyright notice is copied from the licence for xt, from which the
953 original version of this module was derived:
954 --------------------------------------------------------------------------------
955 Copyright (c) 1998, 1999 James Clark
956 
957 Permission is hereby granted, free of charge, to any person obtaining
958 a copy of this software and associated documentation files (the
959 "Software"), to deal in the Software without restriction, including
960 without limitation the rights to use, copy, modify, merge, publish,
961 distribute, sublicense, and/or sell copies of the Software, and to
962 permit persons to whom the Software is furnished to do so, subject to
963 the following conditions:
964 
965 The above copyright notice and this permission notice shall be included
966 in all copies or substantial portions of the Software.
967 
968 THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, EXPRESS
969 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
970 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
971 IN NO EVENT SHALL JAMES CLARK BE LIABLE FOR ANY CLAIM, DAMAGES OR
972 OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
973 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
974 OTHER DEALINGS IN THE SOFTWARE.
975 
976 Except as contained in this notice, the name of James Clark shall
977 not be used in advertising or otherwise to promote the sale, use or
978 other dealings in this Software without prior written authorization
979 from James Clark.
980 ---------------------------------------------------------------------------
981 */
982 
983 //
984 // The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License");
985 // you may not use this file except in compliance with the License. You may obtain a copy of the
986 // License at http://www.mozilla.org/MPL/
987 //
988 // Software distributed under the License is distributed on an "AS IS" basis,
989 // WITHOUT WARRANTY OF ANY KIND, either express or implied.
990 // See the License for the specific language governing rights and limitations under the License.
991 //
992 // The Original Code is: all this file, other than the parts developed by James Clark as part of xt.
993 //
994 // The Initial Developer of the Original Code is Michael H. Kay.
995 //
996 // Portions created by (your name) are Copyright (C) (your legal entity). All Rights Reserved.
997 //
998 // Contributor(s): none.
999 //
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags