KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > netbeans > lib > xml > lexer > XMLLexer


1 /*
2  * Sun Public License Notice
3  *
4  * The contents of this file are subject to the Sun Public License
5  * Version 1.0 (the "License"). You may not use this file except in
6  * compliance with the License. A copy of the License is available at
7  * http://www.sun.com/
8  *
9  * The Original Code is NetBeans. The Initial Developer of the Original
10  * Code is Sun Microsystems, Inc. Portions Copyright 1997-2006 Sun
11  * Microsystems, Inc. All Rights Reserved.
12  */

13
14 package org.netbeans.lib.xml.lexer;
15
16 import org.netbeans.api.xml.lexer.XMLTokenId;
17 import org.netbeans.api.lexer.Token;
18 import org.netbeans.spi.lexer.Lexer;
19 import org.netbeans.spi.lexer.LexerInput;
20 import org.netbeans.spi.lexer.LexerRestartInfo;
21 import org.netbeans.spi.lexer.TokenFactory;
22
23 /**
24  * Lexical analyzer for XML. Based on original XML lexer from xml/editor module.
25  *
26  * @author Petr Nejedly
27  * @author Miloslav Metelka
28  * @author Jan Lahoda
29  * @author Marek Fukala
30  * @author Tomasz Slota
31  * @version 1.00
32  */

33
34 public class XMLLexer implements Lexer<XMLTokenId> {
35     private LexerInput input;
36     
37     private TokenFactory<XMLTokenId> tokenFactory;
38     
39     public Object JavaDoc state() {
40         Integer JavaDoc encoded = (subState << 020) + (this.state << 010) + (subInternalDTD ? 1 : 0);
41         return encoded;
42     }
43     
44     private void loadState(final Object JavaDoc state) {
45         if (state == null) {
46             subState = INIT;
47             this.state = INIT;
48             subInternalDTD = false;
49         } else {
50             int encoded = ((Integer JavaDoc) state).intValue();
51             
52             subState = (encoded & 0xff0000) >> 020;
53             this.state = (encoded & 0xff00) >> 010;
54             subInternalDTD = encoded % 2 == 1;
55         }
56     }
57     
58     /**
59      * Internal state of the lexical analyzer before entering subanalyzer of
60      * character references. It is initially set to INIT, but before first
61      * usage, this will be overwritten with state, which originated
62      * ransition to charref subanalyzer.
63      */

64     protected int state = INIT;
65     
66     /**
67      * Internal state of the lexical analyzer before entering subanalyzer of
68      * character references. It is initially set to INIT, but before first
69      * usage, this will be overwritten with state, which originated
70      * ransition to charref subanalyzer.
71      */

72     protected int subState = INIT;
73     
74     /**
75      * Identifies internal DTD layer. Most of functionality is same
76      * as at document layer, however there are minor exceptions.
77      * @see isInternalDTD checks in code
78      */

79     protected boolean subInternalDTD = false;
80     
81     /** Initial internal state of the analyzer */
82     public static final int INIT = 0;
83     
84     // Internal states I = in state
85
// P = expected (char probed but not consumed)
86
// A = after (char probed and consumed)
87

88     
89     private static final int ISI_TEXT = 1; // Plain text between tags
90
private static final int ISI_ERROR = 2; // Syntax error in XML syntax
91
private static final int ISA_LT = 3; // After start of tag delimiter - "<"
92
private static final int ISA_SLASH = 4; // After ETAGO - "</"
93
private static final int ISI_ENDTAG = 5; // Inside endtag - "</[a..Z]+"
94
private static final int ISP_ENDTAG_X = 6; // X-switch after ENDTAG's name
95
private static final int ISP_ENDTAG_WS = 7; // In WS in ENDTAG - "</A_ _>"
96
private static final int ISI_TAG = 8; // Inside tag - "<[a..Z]+"
97
private static final int ISP_TAG_X = 9; // X-switch after TAG's name
98
private static final int ISP_TAG_WS = 10; // In WS in TAG - "<A_ _...>"
99
private static final int ISI_ARG = 11; // Inside tag's argument - "<A h_r_...>"
100
private static final int ISP_ARG_X = 12; // X-switch after ARGUMENT's name
101
private static final int ISP_ARG_WS = 13; // Inside WS after argument awaiting '='
102
private static final int ISP_EQ = 14; // X-switch after '=' in TAG's ARGUMENT
103
private static final int ISP_EQ_WS = 15; // In WS after '='
104
private static final int ISI_VAL_APOS = 17; // Single-quoted value - may contain " chars
105
private static final int ISI_VAL_QUOT = 18; // Double-quoted value - may contain ' chars
106
private static final int ISA_SGML_ESCAPE = 19; // After "<!"
107
private static final int ISA_SGML_DASH = 20; // After "<!-"
108
private static final int ISI_XML_COMMENT = 21; // Somewhere after "<!--"
109
private static final int ISA_XML_COMMENT_DASH = 22; // Dash in comment - maybe end of comment
110
private static final int ISI_XML_COMMENT_WS = 23; // After end of comment, awaiting end of comment declaration
111
private static final int ISI_SGML_DECL = 24;
112     private static final int ISA_SGML_DECL_DASH = 25;
113     // private static final int ISI_SGML_COMMENT = 26;
114
// private static final int ISA_SGML_COMMENT_DASH = 27;
115
private static final int ISA_REF = 28; // when comes to character reference, e.g. &amp;, after &
116
private static final int ISI_REF_NAME = 29; // if the reference is symbolic - by predefined name
117
private static final int ISA_REF_HASH = 30; // for numeric references - after &#
118
private static final int ISI_REF_DEC = 31; // decimal character reference, e.g. &#345;
119
private static final int ISA_REF_X = 32; //
120
private static final int ISI_REF_HEX = 33; // hexadecimal reference, in &#xa.. of &#X9..
121

122     
123     private static final int ISI_PI = 35; //after <?...
124
private static final int ISI_PI_TARGET = 36; //in <?..|..
125
private static final int ISP_PI_TARGET_WS = 37; //after <?...|
126
private static final int ISI_PI_CONTENT = 38; //in PI content
127
private static final int ISA_PI_CONTENT_QMARK = 39; //after ? in content
128
private static final int ISP_PI_CONTENT_QMARK = 40; //spotet ? in content
129

130     // CDATA section handler
131
private static final int ISA_LTEXBR = 41;
132     private static final int ISA_LTEXBRC = 42;
133     private static final int ISA_LTEXBRCD = 43;
134     private static final int ISA_LTEXBRCDA = 44;
135     private static final int ISA_LTEXBRCDAT = 45;
136     private static final int ISA_LTEXBRCDATA = 46;
137     private static final int ISI_CDATA = 47;
138     private static final int ISA_CDATA_BR = 48;
139     private static final int ISA_CDATA_BRBR = 49;
140     
141     // strings in declaration
142
private static final int ISI_DECL_CHARS = 50;
143     private static final int ISI_DECL_STRING = 51;
144     private static final int ISP_DECL_CHARS = 52;
145     private static final int ISP_DECL_STRING = 53;
146     
147     // internal DTD handling
148
private static final int ISA_INIT_BR = 54;
149     
/**
 * Creates a lexer bound to the input and token factory of the given restart
 * info and restores the analyzer state stored in it.
 *
 * @param info restart info supplying the input, the token factory and the
 *             saved state; a {@code null} saved state resets the analyzer
 *             to {@link #INIT}
 */
public XMLLexer(LexerRestartInfo<XMLTokenId> info) {
    input = info.input();
    tokenFactory = info.tokenFactory();
    loadState(info.state());
}
155     
156     private final boolean isAZ( int ch ) {
157         return( (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') );
158     }
159     
160     /**
161      * Resolves if given char is whitespace in terms of XML4.0 specs
162      * According to specs, following characters are treated as whitespace:
163      * Space - <CODE>' '</CODE>, Tab - <CODE>' '</CODE>,
164      * Formfeed - <CODE>' '</CODE>,Zero-width space - <CODE>'?'</CODE>,
165      * Carriage return - <CODE>'
166 '</CODE> and Line feed - <CODE>'
167 '</CODE>
168      * CR's are included for completenes only, they should never appear in document
169      */

170     
171     private final boolean isWS( int ch ) {
172         return Character.isWhitespace(ch);
173         // return ( ch == '\u0020' || ch == '\u0009' || ch == '\u000c'
174
// || ch == '\u200b' || ch == '\n' || ch == '\r' );
175
}
176     
177     private void enterInternalDTD() {
178         subInternalDTD = true;
179     }
180     
181     private void leaveInternalDTD() {
182         subInternalDTD = false;
183     }
184     
185     private boolean isInternalDTD() {
186         return subInternalDTD;
187     }
188     
189     public Token<XMLTokenId> nextToken() {
190         
191         int actChar;
192         while(true) {
193             actChar = input.read();
194             
195             if (actChar == LexerInput.EOF){
196                 
197                 if (input.readLength() == 0){
198                     return null;
199                 }
200                 
201                 input.backup(1);
202                 break;
203             }
204             
205             switch( state ) {
206                 case INIT: // DONE
207
switch( actChar ) {
208                         case '<':
209                             state = ISA_LT;
210                             break;
211                         case '&':
212                             if (isInternalDTD() == false) {
213                                 state = ISA_REF;
214                                 subState = ISI_TEXT;
215                             } else {
216                                 state = ISI_TEXT;
217                             }
218                             break;
219                         case '%':
220                             if (isInternalDTD()) {
221                                 state = ISA_REF;
222                                 subState = INIT;
223                             } else {
224                                 state = ISI_TEXT;
225                             }
226                             break;
227                         case ']':
228                             if (isInternalDTD()) {
229                                 state = ISA_INIT_BR;
230                             } else {
231                                 state = ISI_TEXT;
232                             }
233                             break;
234                         default:
235                             state = ISI_TEXT;
236                             break;
237                     }
238                     
239                     break;
240                     
241                 case ISI_TEXT: // DONE
242
switch( actChar ) {
243                         case '<':
244                             state = INIT;
245                             input.backup(1);
246                             return token(XMLTokenId.TEXT);
247                         case '&':
248                             if (isInternalDTD() == false) {
249                                 state = INIT;
250                                 input.backup(1);
251                                 return token(XMLTokenId.TEXT);
252                             }
253                             break;
254                         case '%':
255                             if (isInternalDTD()) {
256                                 state = INIT;
257                                 input.backup(1);
258                                 return token(XMLTokenId.TEXT);
259                             }
260                             break;
261                         case ']':
262                             if (isInternalDTD()) {
263                                 state = ISA_INIT_BR;
264                             }
265                             break;
266                     }
267                     break;
268                     
269                 case ISI_ERROR: // DONE
270
state = INIT;
271                     return token(XMLTokenId.ERROR);
272                     
273                 case ISA_LT: // DONE
274

275                     if( UnicodeClasses.isXMLNameStartChar( actChar ) && isInternalDTD() == false) {
276                         state = ISI_TAG;
277                         break;
278                     }
279                     switch( actChar ) {
280                         case '/': // ETAGO - </
281
state = ISA_SLASH;
282                             break;
283                         case '!':
284                             state = ISA_SGML_ESCAPE;
285                             break;
286                         case '?':
287                             state = ISI_PI;
288                             return token(XMLTokenId.PI_START);
289                         default:
290                             state = ISI_TEXT; //RELAXED to allow editing in the middle of document
291
continue; // don't eat the char, maybe its '&'
292
}
293                     break;
294                     
295                 case ISI_PI:
296                     if ( UnicodeClasses.isXMLNameStartChar( actChar )) {
297                         state = ISI_PI_TARGET;
298                         break;
299                     }
300                     state = ISI_ERROR;
301                     break;
302                     
303                 case ISI_PI_TARGET:
304                     if ( UnicodeClasses.isXMLNameChar( actChar )) break;
305                     if (isWS( actChar )) {
306                         state = ISP_PI_TARGET_WS;
307                         input.backup(1);
308                         return token(XMLTokenId.PI_TARGET);
309                     }
310                     state = ISI_ERROR;
311                     break;
312                     
313                 case ISP_PI_TARGET_WS:
314                     if (isWS( actChar)) break;
315                     state = ISI_PI_CONTENT;
316                     input.backup(1);
317                     return token(XMLTokenId.WS);
318                     
319                 case ISI_PI_CONTENT:
320                     if (actChar != '?') break; // eat content
321
state = ISP_PI_CONTENT_QMARK;
322                     input.backup(1);
323                     return token(XMLTokenId.PI_CONTENT); // may do extra break
324

325                 case ISP_PI_CONTENT_QMARK:
326                     if (actChar != '?') throw new IllegalStateException JavaDoc("'?' expected in ISP_PI_CONTENT_QMARK");
327                     state = ISA_PI_CONTENT_QMARK;
328                     break;
329                     
330                 case ISA_PI_CONTENT_QMARK:
331                     if (actChar != '>') {
332                         state = ISI_PI_CONTENT;
333                         break;
334                     }
335                     state = INIT;
336                     return token(XMLTokenId.PI_END);
337                     
338                 case ISA_SLASH: // DONE
339

340                     if( UnicodeClasses.isXMLNameStartChar( actChar )){
341                         state = ISI_ENDTAG;
342                         break;
343                     }
344                     switch( actChar ) {
345                         case ' ':
346                             state = ISI_TEXT;
347                             continue;
348                         case '\n':
349                             state = ISI_TEXT;
350                             continue;
351                         case '\r':
352                             state = ISI_TEXT;
353                             continue;
354                         default: // Part of text, e.g. </3, </'\n', RELAXED
355
state = ISI_TEXT;
356                             continue; // don'e eat the char
357
}
358                     //break;
359

360                 case ISI_ENDTAG: // DONE
361
if( UnicodeClasses.isXMLNameChar( actChar )){
362                         break; // Still in endtag identifier, eat next char
363
}
364                     
365                     state = ISP_ENDTAG_X;
366                     input.backup(1);
367                     return token(XMLTokenId.TAG);
368                     
369                     
370                 case ISP_ENDTAG_X: // DONE
371
if( isWS( actChar ) ) {
372                         state = ISP_ENDTAG_WS;
373                         break;
374                     }
375                     switch( actChar ) {
376                         case '>': // Closing of endtag, e.g. </H6 _>_
377
state = INIT;
378                             return token(XMLTokenId.TAG);
379                         default:
380                             state = ISI_ERROR;
381                             continue; //don't eat
382
}
383                     //break;
384

385                 case ISP_ENDTAG_WS: // DONE
386
if( isWS( actChar ) ) break; // eat all WS
387
state = ISP_ENDTAG_X;
388                     input.backup(1);
389                     return token(XMLTokenId.WS);
390                     
391                     
392                 case ISI_TAG: // DONE
393
if( UnicodeClasses.isXMLNameChar( actChar ) ) break; // Still in tag identifier, eat next char
394
state = ISP_TAG_X;
395                     input.backup(1);
396                     return token(XMLTokenId.TAG);
397                     
398                 case ISP_TAG_X: // DONE
399
if( isWS( actChar ) ) {
400                         state = ISP_TAG_WS;
401                         break;
402                     }
403                     if( UnicodeClasses.isXMLNameStartChar( actChar ) ) {
404                         state = ISI_ARG;
405                         break;
406                     }
407                     switch( actChar ) {
408                         case '/':
409                             break;
410                         case '?': //Prolog and PI's now similar to Tag
411
break;
412                         case '>':
413                             state = INIT;
414                             return token(XMLTokenId.TAG);
415                         default:
416                             state = ISI_ERROR;
417                             continue;
418                     }
419                     break;
420                     
421                     
422                 case ISP_TAG_WS: // DONE
423
//input.backup(1);
424
if( isWS( actChar ) ) break; // eat all WS
425
state = ISP_TAG_X;
426                     input.backup(1);
427                     return token(XMLTokenId.WS);
428                     
429                 case ISI_ARG: // DONE
430
if( UnicodeClasses.isXMLNameChar( actChar ) ) break; // eat next char
431
state = ISP_ARG_X;
432                     input.backup(1);
433                     return token(XMLTokenId.ARGUMENT);
434                     
435                 case ISP_ARG_X:
436                     if( isWS( actChar ) ) {
437                         state = ISP_ARG_WS;
438                         break;
439                     }
440                     switch( actChar ) {
441                         case '=':
442                             state = ISP_EQ;
443                             return token(XMLTokenId.OPERATOR);
444                         default:
445                             state = ISI_ERROR;
446                             continue;
447                     }
448                     //break;
449

450                 case ISP_ARG_WS:
451                     if( isWS( actChar ) ) break; // Eat all WhiteSpace
452
state = ISP_ARG_X;
453                     input.backup(1);
454                     return token(XMLTokenId.WS);
455                     
456                 case ISP_EQ:
457                     if( isWS( actChar ) ) {
458                         state = ISP_EQ_WS;
459                         break;
460                     }
461                     switch( actChar ) {
462                         case '\'':
463                             state = ISI_VAL_APOS;
464                             break;
465                         case '"':
466                             state = ISI_VAL_QUOT;
467                             break;
468                         default:
469                             state = ISI_ERROR;
470                             continue;
471                     }
472                     break;
473                     
474                 case ISP_EQ_WS:
475                     if( isWS( actChar ) ) break; // Consume all WS
476
state = ISP_EQ;
477                     input.backup(1);
478                     return token(XMLTokenId.WS);
479                     
480                 case ISI_VAL_APOS:
481                     switch( actChar ) {
482                         case '\'':
483                             state = ISP_TAG_X;
484                             return token(XMLTokenId.VALUE);
485                         case '&':
486                             if(input.readLength() == 1) {
487                                 subState = state;
488                                 state = ISA_REF;
489                                 break;
490                             } else {
491                                 input.backup(1);
492                                 return token(XMLTokenId.VALUE);
493                             }
494                     }
495                     break; // else simply consume next char of VALUE
496

497                 case ISI_VAL_QUOT:
498                     switch( actChar ) {
499                         case '"':
500                             state = ISP_TAG_X;
501                             return token(XMLTokenId.VALUE);
502                         case '&':
503                             if(input.readLength() == 1) {
504                                 subState = state;
505                                 state = ISA_REF;
506                                 break;
507                             } else {
508                                 input.backup(1);
509                                 return token(XMLTokenId.VALUE);
510                             }
511                     }
512                     break; // else simply consume next char of VALUE
513

514                     
515                 case ISA_SGML_ESCAPE: // DONE
516
if (actChar == '[') {
517                         state = ISA_LTEXBR;
518                         break;
519                     } else if( isAZ(actChar) ) {
520                         state = ISI_SGML_DECL;
521                         break;
522                     }
523                     switch( actChar ) {
524                         case '-':
525                             state = ISA_SGML_DASH;
526                             break;
527                         default:
528                             state = ISI_TEXT;
529                             continue;
530                     }
531                     break;
532                     
533                 case ISA_LTEXBR:
534                     if (actChar == 'C') {
535                         state = ISA_LTEXBRC;
536                         break;
537                     } else {
538                         state = ISI_TEXT;
539                         continue;
540                     }
541                     
542                 case ISA_LTEXBRC:
543                     if (actChar == 'D') {
544                         state = ISA_LTEXBRCD;
545                         break;
546                     } else {
547                         state = ISI_TEXT;
548                         continue;
549                     }
550                     
551                 case ISA_LTEXBRCD:
552                     if (actChar == 'A') {
553                         state = ISA_LTEXBRCDA;
554                         break;
555                     } else {
556                         state = ISI_TEXT;
557                         continue;
558                     }
559                     
560                 case ISA_LTEXBRCDA:
561                     if (actChar == 'T') {
562                         state = ISA_LTEXBRCDAT;
563                         break;
564                     } else {
565                         state = ISI_TEXT;
566                         continue;
567                     }
568                     
569                 case ISA_LTEXBRCDAT:
570                     if (actChar == 'A') {
571                         state = ISA_LTEXBRCDATA;
572                         break;
573                     } else {
574                         state = ISI_TEXT;
575                         continue;
576                     }
577                     
578                 case ISA_LTEXBRCDATA:
579                     if (actChar == '[') {
580                         state = ISI_CDATA;
581                         break;
582                     } else {
583                         state = ISI_TEXT;
584                         continue;
585                     }
586                     
587                 case ISI_CDATA:
588                     if (actChar == ']') {
589                         state = ISA_CDATA_BR;
590                         break;
591                     }
592                     
593                 case ISA_CDATA_BR:
594                     if (actChar == ']') {
595                         state = ISA_CDATA_BRBR;
596                         break;
597                     } else {
598                         state = ISI_CDATA;
599                         break;
600                     }
601                     
602                 case ISA_CDATA_BRBR:
603                     if (actChar == '>') {
604                         state = ISI_TEXT; //It s allowed only in content
605
return token(XMLTokenId.CDATA_SECTION);
606                     } else if (actChar == ']') {
607                         // stay in the same state
608
break;
609                     } else {
610                         state = ISI_CDATA;
611                         break;
612                     }
613                     
614                     
615                 case ISA_SGML_DASH: // DONE
616
switch( actChar ) {
617                         case '-':
618                             state = ISI_XML_COMMENT;
619                             break;
620                         default:
621                             state=ISI_ERROR;
622                             continue;
623                     }
624                     break;
625                     
626                 case ISI_XML_COMMENT: // DONE
627
switch( actChar ) {
628                         case '-':
629                             state = ISA_XML_COMMENT_DASH;
630                             break;
631                             //create an XML comment token for each line of the comment - a workaround fix for performance bug #39446
632
//this also causes a SyntaxtElement to be created for each line of the comment - see XMLSyntaxSupport.createElement:277
633
//PENDING - this code can be removed after editor solve it somehow in their code
634
case '\n':
635                             //leave the some state - we are still in an XML comment,
636
//we just need to create a token for each line.
637
return token(XMLTokenId.BLOCK_COMMENT);
638                     }
639                     break;
640                     
641                 case ISA_XML_COMMENT_DASH:
642                     switch( actChar ) {
643                         case '-':
644                             state = ISI_XML_COMMENT_WS;
645                             break;
646                         default:
647                             state = ISI_XML_COMMENT;
648                             continue;
649                     }
650                     break;
651                     
652                 case ISI_XML_COMMENT_WS: // DONE
653
if( isWS( actChar ) ) break; // Consume all WS
654
switch( actChar ) {
655                         case '>':
656                             state = INIT;
657                             return token(XMLTokenId.BLOCK_COMMENT);
658                         default:
659                             state = ISI_ERROR;
660                             input.backup(1);
661                             return token(XMLTokenId.BLOCK_COMMENT);
662                     }
663                     
664                 case ISP_DECL_STRING:
665                     if (actChar != '"') throw new IllegalStateException JavaDoc("Unexpected " + actChar);
666                     state = ISI_DECL_STRING;
667                     break;
668                     
669                 case ISI_DECL_STRING:
670                     if ( actChar == '"') {
671                         state = ISI_SGML_DECL;
672                         return token(XMLTokenId.VALUE);
673                     }
674                     break;
675                     
676                 case ISP_DECL_CHARS:
677                     if (actChar != '\'') throw new IllegalStateException JavaDoc("Unexpected " + actChar);
678                     state = ISI_DECL_CHARS;
679                     break;
680                     
681                 case ISI_DECL_CHARS:
682                     if ( actChar == '\'') {
683                         state = ISI_SGML_DECL;
684                         return token(XMLTokenId.VALUE);
685                     }
686                     break;
687                     
688                 case ISI_SGML_DECL:
689                     switch( actChar ) {
690                         case '"':
691                             state = ISP_DECL_STRING;
692                             input.backup(1);
693                             return token(XMLTokenId.DECLARATION);
694                         case '\'':
695                             state = ISP_DECL_CHARS;
696                             input.backup(1);
697                             return token(XMLTokenId.DECLARATION);
698                         case '[':
699                             state = INIT;
700                             enterInternalDTD();
701                             return token(XMLTokenId.DECLARATION);
702                         case '>':
703                             state = INIT;
704                             return token(XMLTokenId.DECLARATION);
705                     }
706                     break;
707                     
708                 case ISA_INIT_BR:
709                     if (isWS(actChar)) break;
710                     if (actChar == '>') {
711                         state = INIT;
712                         leaveInternalDTD();
713                         return token(XMLTokenId.DECLARATION);
714                     } else {
715                         state = INIT;
716                         input.backup(1);
717                         return token(XMLTokenId.ERROR);
718                     }
719                     
720                 case ISA_SGML_DECL_DASH:
721                     if( actChar == '-' ) {
722                         state = ISI_ERROR;
723                         break;
724                     } else {
725                         if(isWS(actChar)){
726                             state = ISI_ERROR;
727                             continue;
728                         } else {
729                             state = ISI_SGML_DECL;
730                             continue;
731                         }
732                     }
733                     
734                 case ISA_REF:
735                     if( UnicodeClasses.isXMLNameStartChar( actChar ) ) {
736                         state = ISI_REF_NAME;
737                         break;
738                     }
739                     if( actChar == '#') {
740                         state = ISA_REF_HASH;
741                         break;
742                     }
743                     state = subState;
744                     continue;
745                     
746                 case ISI_REF_NAME:
747                     if( UnicodeClasses.isXMLNameChar( actChar ) ) break;
748                     if( actChar != ';' ) input.backup(1);
749                     state = subState;
750                     return token(XMLTokenId.CHARACTER);
751                     
752                 case ISA_REF_HASH:
753                     if( actChar >= '0' && actChar <= '9' ) {
754                         state = ISI_REF_DEC;
755                         break;
756                     }
757                     if( actChar == 'x' || actChar == 'X' ) {
758                         state = ISA_REF_X;
759                         break;
760                     }
761                     if( isAZ( actChar ) ) {
762                         state = subState;
763                         return token(XMLTokenId.ERROR);
764                     }
765                     state = subState;
766                     continue;
767                     
768                 case ISI_REF_DEC:
769                     if( actChar >= '0' && actChar <= '9' ) break;
770                     if( actChar != ';' ) input.backup(1);
771                     state = subState;
772                     return token(XMLTokenId.CHARACTER);
773                     
774                 case ISA_REF_X:
775                     if (isHex(actChar)) {
776                         state = ISI_REF_HEX;
777                         break;
778                     }
779                     state = subState;
780                     input.backup(1);
781                     return token(XMLTokenId.ERROR); // error on previous "&#x" sequence
782

783                 case ISI_REF_HEX:
784                     if (isHex(actChar)) break;
785                     if (actChar != ';' ) input.backup(1);
786                     state = subState;
787                     return token(XMLTokenId.CHARACTER);
788             }
789         } // end of while(offset...)
790

791         switch( state ) {
792             case INIT:
793             case ISI_TEXT:
794             case ISA_LT:
795             case ISA_SLASH:
796             case ISA_SGML_ESCAPE:
797             case ISA_SGML_DASH:
798                 return token(XMLTokenId.TEXT);
799                 
800             case ISA_REF:
801             case ISA_REF_HASH:
802                 if( subState == ISI_TEXT ) return token(XMLTokenId.TEXT);
803                 else return token(XMLTokenId.VALUE);
804                 
805             case ISI_XML_COMMENT:
806             case ISA_XML_COMMENT_DASH:
807             case ISI_XML_COMMENT_WS:
808                 return token(XMLTokenId.BLOCK_COMMENT);
809                 
810             case ISI_TAG:
811             case ISI_ENDTAG:
812                 return token(XMLTokenId.TAG);
813                 
814             case ISI_ARG:
815                 return token(XMLTokenId.ARGUMENT);
816                 
817             case ISI_ERROR:
818                 return token(XMLTokenId.ERROR);
819                 
820             case ISP_ARG_WS:
821             case ISP_TAG_WS:
822             case ISP_ENDTAG_WS:
823             case ISP_EQ_WS:
824                 return token(XMLTokenId.WS);
825                 
826             case ISP_ARG_X:
827             case ISP_TAG_X:
828             case ISP_ENDTAG_X:
829             case ISP_EQ:
830                 return token(XMLTokenId.WS);
831                 
832             case ISI_VAL_APOS:
833             case ISI_VAL_QUOT:
834             case ISI_DECL_CHARS:
835             case ISI_DECL_STRING:
836                 return token(XMLTokenId.VALUE);
837                 
838             case ISI_SGML_DECL:
839             case ISA_SGML_DECL_DASH:
840             case ISP_DECL_STRING:
841             case ISP_DECL_CHARS:
842                 return token(XMLTokenId.DECLARATION);
843                 
844             case ISI_REF_NAME:
845             case ISI_REF_DEC:
846             case ISA_REF_X:
847             case ISI_REF_HEX:
848                 return token(XMLTokenId.CHARACTER);
849                 
850             case ISI_PI:
851                 return token(XMLTokenId.PI_START);
852             case ISI_PI_TARGET:
853                 return token(XMLTokenId.PI_TARGET);
854             case ISP_PI_TARGET_WS:
855                 return token(XMLTokenId.WS);
856             case ISI_PI_CONTENT:
857                 return token(XMLTokenId.PI_CONTENT);
858             case ISA_PI_CONTENT_QMARK:
859             case ISP_PI_CONTENT_QMARK:
860                 // we are at end of the last buffer and expect that next char will be '>'
861
return token(XMLTokenId.PI_END);
862                 
863             case ISA_LTEXBR:
864             case ISA_LTEXBRC:
865             case ISA_LTEXBRCD:
866             case ISA_LTEXBRCDA:
867             case ISA_LTEXBRCDAT:
868             case ISA_LTEXBRCDATA:
869                 return token(XMLTokenId.TEXT);
870                 
871             case ISI_CDATA:
872             case ISA_CDATA_BR:
873             case ISA_CDATA_BRBR:
874                 return token(XMLTokenId.CDATA_SECTION);
875                 
876             case ISA_INIT_BR:
877                 return token(XMLTokenId.TEXT);
878                 
879             default:
880                 throw new IllegalStateException JavaDoc("Last buffer does not handle state " + state + "!"); //NOI18N
881
}
882         
883     }
884     
885     private Token<XMLTokenId> token(XMLTokenId id) {
886 // System.out.print("--- token(" + id + "; '" + input.readText().toString() + "')");
887
// if(input.readLength() == 0) {
888
// System.out.println("XMLLexer error - zero length token!");
889
// }
890
Token<XMLTokenId> t = tokenFactory.createToken(id);
891 // System.out.println(t.id() + "; " + t.length());
892
return t;
893     }
894     
895     private boolean isHex(int ch) {
896         return (ch >= '0' && ch <= '9') || isAF(ch);
897     }
898     
899     private boolean isAF(int ch) {
900         return( (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F') );
901     }
902
903     public void release() {
904     }
905
906 }
907
Popular Tags