KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > netbeans > lib > html > lexer > HTMLLexer


/*
 * Sun Public License Notice
 *
 * The contents of this file are subject to the Sun Public License
 * Version 1.0 (the "License"). You may not use this file except in
 * compliance with the License. A copy of the License is available at
 * http://www.sun.com/
 *
 * The Original Code is NetBeans. The Initial Developer of the Original
 * Code is Sun Microsystems, Inc. Portions Copyright 1997-2006 Sun
 * Microsystems, Inc. All Rights Reserved.
 */


package org.netbeans.lib.html.lexer;

import java.util.logging.Level;
import java.util.logging.Logger;
import org.netbeans.api.html.lexer.HTMLTokenId;
import org.netbeans.api.lexer.Token;
import org.netbeans.spi.lexer.Lexer;
import org.netbeans.spi.lexer.LexerInput;
import org.netbeans.spi.lexer.LexerRestartInfo;
import org.netbeans.spi.lexer.TokenFactory;

25 /**
26  * Lexical analyzer for HTML. Based on original HTML lexer from html/editor module.
27  *
28  * @author Petr Nejedly
29  * @author Miloslav Metelka
30  * @author Jan Lahoda
31  * @author Marek Fukala
32  * @version 1.00
33  */

34
35 public final class HTMLLexer implements Lexer<HTMLTokenId> {
36     
37     private static final Logger JavaDoc LOGGER = Logger.getLogger(HTMLLexer.class.getName());
38     private static final boolean LOG = Boolean.getBoolean("j2ee_lexer_debug"); //NOI18N
39

40     private static final int EOF = LexerInput.EOF;
41     
42     private final LexerInput input;
43     
44     private final TokenFactory<HTMLTokenId> tokenFactory;
45     
46     public Object JavaDoc state() {
47         return lexerSubState * 1000000 + lexerState * 1000 + lexerScriptState;
48     }
49     
50     
51     /** Internal state of the lexical analyzer before entering subanalyzer of
52      * character references. It is initially set to INIT, but before first usage,
53      * this will be overwritten with state, which originated transition to
54      * charref subanalyzer.
55      */

56     private int lexerSubState = INIT;
57     private int lexerState = INIT;
58     
59     /** indicated whether we are in a script */
60     private int lexerScriptState = INIT;
61     
62     // internal 'in script' state. 'scriptState' internal state is set to it when the
63
// analyzer goes into a script tag body
64
private static final int ISI_SCRIPT = 1;
65     
66     // Internal states
67
private static final int INIT = 0;
68     private static final int ISI_TEXT = 1; // Plain text between tags
69
private static final int ISI_ERROR = 2; // Syntax error in HTML syntax
70
private static final int ISA_LT = 3; // After start of tag delimiter - "<"
71
private static final int ISA_SLASH = 4; // After ETAGO - "</"
72
private static final int ISI_ENDTAG = 5; // Inside endtag - "</[a..Z]+"
73
private static final int ISP_ENDTAG_X = 6; // X-switch after ENDTAG's name
74
private static final int ISP_ENDTAG_WS = 7; // In WS in ENDTAG - "</A_ _>"
75
private static final int ISI_TAG = 8; // Inside tag - "<[a..Z]+"
76
private static final int ISP_TAG_X = 9; // X-switch after TAG's name
77
private static final int ISP_TAG_WS = 10; // In WS in TAG - "<A_ _...>"
78
private static final int ISI_ARG = 11; // Inside tag's argument - "<A h_r_...>"
79
private static final int ISP_ARG_X = 12; // X-switch after ARGUMENT's name
80
private static final int ISP_ARG_WS = 13; // Inside WS after argument awaiting '='
81
private static final int ISP_EQ = 14; // X-switch after '=' in TAG's ARGUMENT
82
private static final int ISP_EQ_WS = 15; // In WS after '='
83
private static final int ISI_VAL = 16; // Non-quoted value
84
private static final int ISI_VAL_QUOT = 17; // Single-quoted value - may contain " chars
85
private static final int ISI_VAL_DQUOT = 18; // Double-quoted value - may contain ' chars
86
private static final int ISA_SGML_ESCAPE = 19; // After "<!"
87
private static final int ISA_SGML_DASH = 20; // After "<!-"
88
private static final int ISI_HTML_COMMENT = 21; // Somewhere after "<!--"
89
private static final int ISA_HTML_COMMENT_DASH = 22; // Dash in comment - maybe end of comment
90
private static final int ISI_HTML_COMMENT_WS = 23; // After end of comment, awaiting end of comment declaration
91
private static final int ISI_SGML_DECL = 24;
92     private static final int ISA_SGML_DECL_DASH = 25;
93     private static final int ISI_SGML_COMMENT = 26;
94     private static final int ISA_SGML_COMMENT_DASH = 27;
95     private static final int ISA_REF = 28; // when comes to character reference, e.g. &amp;, after &
96
private static final int ISI_REF_NAME = 29; // if the reference is symbolic - by predefined name
97
private static final int ISA_REF_HASH = 30; // for numeric references - after &#
98
private static final int ISI_REF_DEC = 31; // decimal character reference, e.g. &#345;
99
private static final int ISA_REF_X = 32; //
100
private static final int ISI_REF_HEX = 33; // hexadecimal reference, in &#xa.. of &#X9..
101
private static final int ISI_TAG_SLASH = 34; //after slash in html tag
102

103     public HTMLLexer(LexerRestartInfo<HTMLTokenId> info) {
104         this.input = info.input();
105         this.tokenFactory = info.tokenFactory();
106         if (info.state() == null) {
107             this.lexerSubState = INIT;
108             this.lexerState = INIT;
109             this.lexerScriptState = INIT;
110         } else {
111             int encoded = ((Integer JavaDoc) info.state()).intValue();
112             this.lexerSubState = encoded / 1000000;
113             int remainder = encoded % 1000000;
114             this.lexerState = remainder / 1000;
115             this.lexerScriptState = remainder % 1000;
116         }
117     }
118     
119     private final boolean isAZ( int character ) {
120         return( (character >= 'a' && character <= 'z') || (character >= 'A' && character <= 'Z') );
121     }
122     
123     private final boolean isName( int character ) {
124         return Character.isLetterOrDigit(character) ||
125                 character == '-' || character == '_' || character == '.' || character == ':';
126         // return( (ch >= 'a' && ch <= 'z') ||
127
// (ch >= 'A' && ch <= 'Z') ||
128
// (ch >= '0' && ch <= '9') ||
129
// ch == '-' || ch == '_' || ch == '.' || ch == ':' );
130

131     }
132     
133     /**
134      * Resolves if given char is whitespace in terms of HTML4.0 specs
135      * According to specs, following characters are treated as whitespace:
136      * Space - <CODE>' '</CODE>, Tab - <CODE>' '</CODE>,
137      * Formfeed - <CODE>' '</CODE>,Zero-width space - <CODE>'?'</CODE>,
138      * Carriage return - <CODE>'
139 '</CODE> and Line feed - <CODE>'
140 '</CODE>
141      * CR's are included for completenes only, they should never appear in document
142      */

143     
144     private final boolean isWS( int character ) {
145         return Character.isWhitespace(character);
146         // return ( ch == '\u0020' || ch == '\u0009' || ch == '\u000c'
147
// || ch == '\u200b' || ch == '\n' || ch == '\r' );
148
}
149     
150     public Token<HTMLTokenId> nextToken() {
151         int actChar;
152         
153         while (true) {
154             actChar = input.read();
155             
156             if (actChar == EOF) {
157                 if(input.readLengthEOF() == 1) {
158                     return null; //just EOL is read
159
} else {
160                     //there is something else in the buffer except EOL
161
//we will return last token now
162
input.backup(1); //backup the EOL, we will return null in next nextToken() call
163
break;
164                 }
165             }
166             
167             //System.out.println("HTMLSyntax: parseToken tokenOffset=" + tokenOffset + ", actChar='" + actChar + "', offset=" + offset + ", state=" + getStateName(state) +
168
// ", stopOffset=" + stopOffset + ", lastBuffer=" + lastBuffer);
169
switch( lexerState ) {
170                 case INIT: // DONE
171
switch( actChar ) {
172                         case '<':
173                             lexerState = ISA_LT;
174                             break;
175                         case '&':
176                             lexerState = ISA_REF;
177                             lexerSubState = ISI_TEXT;
178                             break;
179                         default:
180                             lexerState = ISI_TEXT;
181                             break;
182                     }
183                     break;
184                     
185                 case ISI_TEXT: // DONE
186
switch( actChar ) {
187                         case '<':
188                         case '&':
189                             lexerState = INIT;
190                             input.backup(1);
191                             if(input.readLength() > 0) { //is there any text before & or < ???
192
return token(lexerScriptState == INIT ? HTMLTokenId.TEXT : HTMLTokenId.SCRIPT);
193                             }
194                             break;
195                     }
196                     break;
197                     
198                 case ISI_ERROR: // DONE
199
lexerState = INIT;
200                     return token(HTMLTokenId.ERROR);
201                     
202                 case ISA_LT: // PENDING other transitions - e.g '<?'
203
if( isAZ( actChar ) ) { // <'a..Z'
204
lexerState = ISI_TAG;
205                         input.backup(1);
206                         return token(HTMLTokenId.TAG_OPEN_SYMBOL);
207                     }
208                     switch( actChar ) {
209                         case '/': // ETAGO - </
210
lexerState = ISA_SLASH;
211                             return token(HTMLTokenId.TAG_OPEN_SYMBOL);
212                         case '>': // Empty start tag <>, RELAXED
213
lexerState = INIT;
214                             return token(HTMLTokenId.TAG_CLOSE_SYMBOL);
215                         case '!':
216                             lexerState = ISA_SGML_ESCAPE;
217                             break;
218                         default: // Part of text, RELAXED
219
lexerState = ISI_TEXT;
220                             break;
221                     }
222                     break;
223                     
224                 case ISA_SLASH: // DONE
225
if( isAZ( actChar ) ) { // </'a..Z'
226
lexerState = ISI_ENDTAG;
227                         break;
228                     }
229                     switch( actChar ) {
230                         case '>': // Empty end tag </>, RELAXED
231
lexerState = INIT;
232                             return token(HTMLTokenId.TAG_CLOSE_SYMBOL);
233                         default: // Part of text, e.g. </3, </'\n', RELAXED
234
lexerState = ISI_TEXT;
235                             input.backup(1);
236                             break;
237                     }
238                     break;
239                     
240                 case ISI_ENDTAG: // DONE
241
if( isName( actChar ) ) break; // Still in endtag identifier, eat next char
242
lexerState = ISP_ENDTAG_X;
243                     input.backup(1);
244                     //test if the tagname is SCRIPT
245
//fixme: remove the 'script tag support' from the lexer completely, so far, just partially commented = disabled
246
// if("script".equalsIgnoreCase(input.readText().toString())) { //NOI18N
247
// lexerScriptState = INIT;
248
// //System.out.println("---end of script");
249
// }
250

251                     return token(HTMLTokenId.TAG_CLOSE);
252                     
253                     
254                 case ISP_ENDTAG_X: // DONE
255
if( isWS( actChar ) ) {
256                         lexerState = ISP_ENDTAG_WS;
257                         break;
258                     }
259                     switch( actChar ) {
260                         case '>': // Closing of endtag, e.g. </H6 _>_
261
lexerState = INIT;
262                             return token(HTMLTokenId.TAG_CLOSE_SYMBOL);
263                         case '<': // next tag, e.g. </H6 _<_, RELAXED
264
lexerState = INIT;
265                             input.backup(1);
266                             break;
267                         default:
268                             lexerState = ISI_ERROR;
269                             input.backup(1);
270                             break;
271                     }
272                     break;
273                     
274                 case ISP_ENDTAG_WS: // DONE
275
if( isWS( actChar ) ) break; // eat all WS
276
lexerState = ISP_ENDTAG_X;
277                     input.backup(1);
278                     return token(HTMLTokenId.WS);
279                     
280                     
281                 case ISI_TAG: // DONE
282
if( isName( actChar ) ) break; // Still in tag identifier, eat next char
283
lexerState = ISP_TAG_X;
284                     input.backup(1);
285                     //test if the tagname is SCRIPT
286
// if("script".equalsIgnoreCase(input.readText().toString())) { //NOI18N
287
// lexerScriptState = ISI_SCRIPT;
288
// //System.out.println("+++start of script");
289
// }
290
return token(HTMLTokenId.TAG_OPEN);
291                     
292                 case ISP_TAG_X: // DONE
293
if( isWS( actChar ) ) {
294                         lexerState = ISP_TAG_WS;
295                         break;
296                     }
297                     if( isAZ( actChar ) ) {
298                         lexerState = ISI_ARG;
299                         break;
300                     }
301                     switch( actChar ) {
302                         case '/':
303                             lexerState = ISI_TAG_SLASH;
304                             break;
305                         case '>':
306                             lexerState = INIT;
307                             return token(HTMLTokenId.TAG_CLOSE_SYMBOL);
308                         case '<':
309                             lexerState = INIT;
310                             input.backup(1);
311                             break;
312                         default:
313                             lexerState = ISI_ERROR;
314                             input.backup(1);
315                             break;
316                     }
317                     break;
318                     
319                 case ISP_TAG_WS: // DONE
320
if( isWS( actChar ) ) break; // eat all WS
321
lexerState = ISP_TAG_X;
322                     input.backup(1);
323                     return token(HTMLTokenId.WS);
324                     
325                 case ISI_TAG_SLASH:
326                     switch( actChar ) {
327                         case '>':
328                             lexerState = INIT;
329                             return token(HTMLTokenId.TAG_CLOSE_SYMBOL);
330                         default:
331                             lexerState = ISI_ERROR;
332                             input.backup(1);
333                             break;
334                     }
335                     break;
336                     
337                 case ISI_ARG: // DONE
338
if( isName( actChar ) ) break; // eat next char
339
lexerState = ISP_ARG_X;
340                     input.backup(1);
341                     return token(HTMLTokenId.ARGUMENT);
342                     
343                 case ISP_ARG_X:
344                     if( isWS( actChar ) ) {
345                         lexerState = ISP_ARG_WS;
346                         break;
347                     }
348                     if( isAZ( actChar ) ) {
349                         lexerState = ISI_ARG;
350                         break;
351                     }
352                     switch( actChar ) {
353                         case '/':
354                         case '>':
355                             input.backup(1);
356                             lexerState = ISP_TAG_X;
357                             break;
358                         case '<':
359                             lexerState = INIT;
360                             input.backup(1);
361                             break;
362                         case '=':
363                             lexerState = ISP_EQ;
364                             return token(HTMLTokenId.OPERATOR);
365                         default:
366                             lexerState = ISI_ERROR;
367                             input.backup(1);
368                             break;
369                     }
370                     break;
371                     
372                 case ISP_ARG_WS:
373                     if( isWS( actChar ) ) break; // Eat all WhiteSpace
374
lexerState = ISP_ARG_X;
375                     input.backup(1);
376                     return token(HTMLTokenId.WS);
377                     
378                 case ISP_EQ:
379                     if( isWS( actChar ) ) {
380                         lexerState = ISP_EQ_WS;
381                         break;
382                     }
383                     switch( actChar ) {
384                         case '\'':
385                             lexerState = ISI_VAL_QUOT;
386                             break;
387                         case '"':
388                             lexerState = ISI_VAL_DQUOT;
389                             break;
390                         case '/':
391                         case '>':
392                             input.backup(1);
393                             lexerState = ISP_TAG_X;
394                             break;
395                         default:
396                             lexerState = ISI_VAL; //everything else if attribute value
397
break;
398                     }
399                     break;
400                     
401                 case ISP_EQ_WS:
402                     if( isWS( actChar ) ) break; // Consume all WS
403
lexerState = ISP_EQ;
404                     input.backup(1);
405                     return token(HTMLTokenId.WS);
406                     
407                     
408                 case ISI_VAL:
409                     if( !isWS( actChar )
410                     && !(actChar == '/' || actChar == '>' || actChar == '<')) break; // Consume whole value
411
lexerState = ISP_TAG_X;
412                     input.backup(1);
413                     return token(HTMLTokenId.VALUE);
414                     
415                 case ISI_VAL_QUOT:
416                     switch( actChar ) {
417                         case '\'':
418                             lexerState = ISP_TAG_X;
419                             return token(HTMLTokenId.VALUE);
420                         case '&':
421                             if( input.readLength() == 1 ) {
422                                 lexerSubState = lexerState;
423                                 lexerState = ISA_REF;
424                                 break;
425                             } else {
426                                 input.backup(1);
427                                 return token(HTMLTokenId.VALUE);
428                             }
429                     }
430                     break; // else simply consume next char of VALUE
431

432                 case ISI_VAL_DQUOT:
433                     switch( actChar ) {
434                         case '"':
435                             lexerState = ISP_TAG_X;
436                             return token(HTMLTokenId.VALUE);
437                         case '&':
438                             if( input.readLength() == 1 ) {
439                                 lexerSubState = lexerState;
440                                 lexerState = ISA_REF;
441                                 break;
442                             } else {
443                                 input.backup(1);
444                                 return token(HTMLTokenId.VALUE);
445                             }
446                     }
447                     break; // else simply consume next char of VALUE
448

449                     
450                     
451                 case ISA_SGML_ESCAPE: // DONE
452
if( isAZ(actChar) ) {
453                         lexerState = ISI_SGML_DECL;
454                         break;
455                     }
456                     switch( actChar ) {
457                         case '-':
458                             lexerState = ISA_SGML_DASH;
459                             break;
460                         default:
461                             lexerState = ISI_TEXT;
462                             input.backup(1);
463                             continue;
464                     }
465                     break;
466                     
467                 case ISA_SGML_DASH: // DONE
468
switch( actChar ) {
469                         case '-':
470                             lexerState = ISI_HTML_COMMENT;
471                             break;
472                         default:
473                             lexerState = ISI_TEXT;
474                             input.backup(1);
475                             continue;
476                     }
477                     break;
478                     
479                 case ISI_HTML_COMMENT: // DONE
480
switch( actChar ) {
481                         case '-':
482                             lexerState = ISA_HTML_COMMENT_DASH;
483                             break;
484                             //create an HTML comment token for each line of the comment - a performance fix for #43532
485
case '\n':
486                             //leave the some state - we are still in an HTML comment,
487
//we just need to create a token for each line.
488
return token(HTMLTokenId.BLOCK_COMMENT);
489                     }
490                     break;
491                     
492                 case ISA_HTML_COMMENT_DASH:
493                     switch( actChar ) {
494                         case '-':
495                             lexerState = ISI_HTML_COMMENT_WS;
496                             break;
497                         default:
498                             lexerState = ISI_HTML_COMMENT;
499                             continue;
500                     }
501                     break;
502                     
503                 case ISI_HTML_COMMENT_WS: // DONE
504
if( isWS( actChar ) ) break; // Consume all WS
505
switch( actChar ) {
506                         case '>':
507                             lexerState = INIT;
508                             return token(HTMLTokenId.BLOCK_COMMENT);
509                         default:
510                             lexerState = ISI_HTML_COMMENT;
511                             input.backup(1);
512                             break;
513                     }
514                     break;
515                     
516                 case ISI_SGML_DECL:
517                     switch( actChar ) {
518                         case '>':
519                             lexerState = INIT;
520                             return token(HTMLTokenId.DECLARATION);
521                         case '-':
522                             if( input.readLength() == 1 ) {
523                                 lexerState = ISA_SGML_DECL_DASH;
524                                 break;
525                             } else {
526                                 input.backup(1);
527                                 return token(HTMLTokenId.DECLARATION);
528                             }
529                     }
530                     break;
531                     
532                 case ISA_SGML_DECL_DASH:
533                     if( actChar == '-' ) {
534                         lexerState = ISI_SGML_COMMENT;
535                         break;
536                     } else {
537                         lexerState = ISI_SGML_DECL;
538                         input.backup(1);
539                         continue;
540                     }
541                     
542                 case ISI_SGML_COMMENT:
543                     switch( actChar ) {
544                         case '-':
545                             lexerState = ISA_SGML_COMMENT_DASH;
546                             break;
547                     }
548                     break;
549                     
550                 case ISA_SGML_COMMENT_DASH:
551                     if( actChar == '-' ) {
552                         lexerState = ISI_SGML_DECL;
553                         return token(HTMLTokenId.SGML_COMMENT);
554                     } else {
555                         lexerState = ISI_SGML_COMMENT;
556                         input.backup(1);
557                         continue;
558                     }
559                     
560                     
561                 case ISA_REF:
562                     if( isAZ( actChar ) ) {
563                         lexerState = ISI_REF_NAME;
564                         break;
565                     }
566                     if( actChar == '#' ) {
567                         lexerState = ISA_REF_HASH;
568                         break;
569                     }
570                     lexerState = lexerSubState;
571                     input.backup(1);
572                     continue;
573                     
574                 case ISI_REF_NAME:
575                     if( isName( actChar ) ) break;
576                     if( actChar != ';' )
577                         input.backup(1);
578                     lexerState = lexerSubState;
579                     return token(HTMLTokenId.CHARACTER);
580                     
581                 case ISA_REF_HASH:
582                     if( actChar >= '0' && actChar <= '9' ) {
583                         lexerState = ISI_REF_DEC;
584                         break;
585                     }
586                     if( actChar == 'x' || actChar == 'X' ) {
587                         lexerState = ISA_REF_X;
588                         break;
589                     }
590                     if( isAZ( actChar ) ) {
591                         lexerState = lexerSubState;
592                         return token(HTMLTokenId.ERROR);
593                     }
594                     lexerState = lexerSubState;
595                     input.backup(1);
596                     continue;
597                     
598                 case ISI_REF_DEC:
599                     if( actChar >= '0' && actChar <= '9' ) break;
600                     if( actChar != ';' )
601                         input.backup(1);
602                     lexerState = lexerSubState;
603                     return token(HTMLTokenId.CHARACTER);
604                     
605                 case ISA_REF_X:
606                     if( (actChar >= '0' && actChar <= '9') ||
607                             (actChar >= 'a' && actChar <= 'f') ||
608                             (actChar >= 'A' && actChar <= 'F')
609                             ) {
610                         lexerState = ISI_REF_HEX;
611                         break;
612                     }
613                     lexerState = lexerSubState;
614                     input.backup(1);
615                     return token(HTMLTokenId.ERROR); // error on previous "&#x" sequence
616

617                 case ISI_REF_HEX:
618                     if( (actChar >= '0' && actChar <= '9') ||
619                             (actChar >= 'a' && actChar <= 'f') ||
620                             (actChar >= 'A' && actChar <= 'F')
621                             ) break;
622                     if( actChar != ';' )
623                         input.backup(1);
624                     lexerState = lexerSubState;
625                     return token(HTMLTokenId.CHARACTER);
626             }
627         } // end of while(offset...)
628

629         /** At this stage there's no more text in the scanned buffer.
630          * Scanner first checks whether this is completely the last
631          * available buffer.
632          */

633         switch( lexerState ) {
634             case INIT:
635                 if (input.readLength() == 0) {
636                     return null;
637                 }
638                 break;
639             case ISI_TEXT:
640             case ISA_LT:
641             case ISA_SLASH:
642             case ISA_SGML_ESCAPE:
643             case ISA_SGML_DASH:
644                 lexerState = INIT;
645                 return token(lexerScriptState == INIT ? HTMLTokenId.TEXT : HTMLTokenId.SCRIPT);
646                 
647             case ISA_REF:
648             case ISA_REF_HASH:
649                 lexerState = INIT;
650                 if( lexerSubState == ISI_TEXT ) return token(lexerScriptState == INIT ? HTMLTokenId.TEXT : HTMLTokenId.SCRIPT);
651                 else return token(HTMLTokenId.VALUE);
652                 
653             case ISI_HTML_COMMENT:
654             case ISA_HTML_COMMENT_DASH:
655             case ISI_HTML_COMMENT_WS:
656                 lexerState = INIT;
657                 return token(HTMLTokenId.BLOCK_COMMENT);
658                 
659             case ISI_TAG:
660                 lexerState = INIT;
661                 return token(HTMLTokenId.TAG_OPEN);
662             case ISI_ENDTAG:
663                 lexerState = INIT;
664                 return token(HTMLTokenId.TAG_CLOSE);
665                 
666             case ISI_ARG:
667                 lexerState = INIT;
668                 return token(HTMLTokenId.ARGUMENT);
669                 
670             case ISI_ERROR:
671                 lexerState = INIT;
672                 return token(HTMLTokenId.ERROR);
673                 
674             case ISP_ARG_WS:
675             case ISP_TAG_WS:
676             case ISP_ENDTAG_WS:
677             case ISP_EQ_WS:
678                 lexerState = INIT;
679                 return token(HTMLTokenId.WS);
680                 
681             case ISP_ARG_X:
682             case ISP_TAG_X:
683             case ISP_ENDTAG_X:
684             case ISP_EQ:
685                 lexerState = INIT;
686                 return token(HTMLTokenId.WS);
687                 
688             case ISI_VAL:
689             case ISI_VAL_QUOT:
690             case ISI_VAL_DQUOT:
691                 lexerState = INIT;
692                 return token(HTMLTokenId.VALUE);
693                 
694             case ISI_SGML_DECL:
695             case ISA_SGML_DECL_DASH:
696                 lexerState = INIT;
697                 return token(HTMLTokenId.DECLARATION);
698                 
699             case ISI_SGML_COMMENT:
700             case ISA_SGML_COMMENT_DASH:
701                 lexerState = INIT;
702                 return token(HTMLTokenId.SGML_COMMENT);
703                 
704             case ISI_REF_NAME:
705             case ISI_REF_DEC:
706             case ISA_REF_X:
707             case ISI_REF_HEX:
708                 lexerState = INIT;
709                 return token(HTMLTokenId.CHARACTER);
710         }
711         
712         return null;
713     }
714     
715     private Token<HTMLTokenId> token(HTMLTokenId tokenId) {
716         if(LOG) {
717             if(input.readLength() == 0) {
718                 LOGGER.log(Level.INFO, "Found zero length token: ");
719             }
720             LOGGER.log(Level.INFO, "[" + this.getClass().getSimpleName() + "] token ('" + input.readText().toString() + "'; id=" + tokenId + "; state=" + state() + ")\n");
721         }
722         return tokenFactory.createToken(tokenId);
723     }
724     
725     public void release() {
726     }
727
728 }
729
Popular Tags