1 37 38 package org.htmlcleaner; 39 40 import java.io.*; 41 import java.util.*; 42 43 54 public class HtmlTokenizer { 55 56 private final static int WORKING_BUFFER_SIZE = 1024; 57 58 private BufferedReader _reader; 59 private char[] _working = new char[WORKING_BUFFER_SIZE]; 60 61 private transient int _pos = 0; 62 private transient int _len = -1; 63 64 private transient StringBuffer _saved = new StringBuffer (512); 65 66 private transient boolean _isLateForDoctype = false; 67 private transient TagToken _currentTagToken = null; 68 private transient List _tokenList = new ArrayList(); 69 70 private boolean _asExpected = true; 71 72 private boolean _isScriptContext = false; 73 private boolean _isStyleContext = false; 74 75 private HtmlCleaner cleaner; 76 77 82 public HtmlTokenizer(HtmlCleaner cleaner) throws IOException { 83 this._reader = new BufferedReader( cleaner.getReader() ); 84 this.cleaner = cleaner; 85 } 86 87 private void addToken(BaseToken token) { 88 _tokenList.add(token); 89 cleaner.makeTree( _tokenList, _tokenList.listIterator(_tokenList.size() - 1) ); 90 } 91 92 private void readIfNeeded(int neededChars) throws IOException { 93 if (_len == -1 && _pos + neededChars >= WORKING_BUFFER_SIZE) { 94 int numToCopy = WORKING_BUFFER_SIZE - _pos; 95 System.arraycopy(_working, _pos, _working, 0, numToCopy); 96 _pos = 0; 97 int size = _reader.read(_working, numToCopy, WORKING_BUFFER_SIZE - numToCopy); 98 99 if (size < WORKING_BUFFER_SIZE - numToCopy) { 100 _len = (size == -1 && numToCopy == 0) ? 0 : size + numToCopy; 101 } else if (size == -1) { 102 _len = numToCopy; 103 } 104 } 105 } 106 107 List getTokenList() { 108 return this._tokenList; 109 } 110 111 private void go() throws IOException { 112 _pos++; 113 readIfNeeded(0); 114 } 115 116 private void go(int step) throws IOException { 117 _pos += step; 118 readIfNeeded(step - 1); 119 } 120 121 127 private boolean startsWith(String value) throws IOException { 128 int valueLen = value.length(); 129 readIfNeeded(valueLen); 130 if (_len >= 0 && _pos + valueLen > _len) { 131 return false; 132 } 133 134 for (int i = 0; i < valueLen; i++) { 135 char ch1 = Character.toLowerCase( value.charAt(i) ); 136 char ch2 = Character.toLowerCase( _working[_pos + i] ); 137 if (ch1 != ch2) { 138 return false; 139 } 140 } 141 142 return true; 143 } 144 145 150 private boolean isWhitespace(int position) { 151 if (_len >= 0 && position >= _len) { 152 return false; 153 } 154 155 return Character.isWhitespace( _working[position] ); 156 } 157 158 162 private boolean isWhitespace() { 163 return isWhitespace(_pos); 164 } 165 166 172 private boolean isChar(int position, char ch) { 173 if (_len >= 0 && position >= _len) { 174 return false; 175 } 176 177 return Character.toLowerCase(ch) == Character.toLowerCase(_working[position]); 178 } 179 180 185 private boolean isChar(char ch) { 186 return isChar(_pos, ch); 187 } 188 189 194 private boolean isIdentifierStartChar(int position) { 195 if (_len >= 0 && position >= _len) { 196 return false; 197 } 198 199 char ch = _working[position]; 200 return Character.isUnicodeIdentifierStart(ch) || (':' == ch); 201 } 202 203 207 private boolean isIdentifierStartChar() { 208 return isIdentifierStartChar(_pos); 209 } 210 211 215 private boolean isIdentifierChar() { 216 if (_len >= 0 && _pos >= _len) { 217 return false; 218 } 219 220 char ch = _working[_pos]; 221 return Character.isUnicodeIdentifierStart(ch) || Character.isDigit(ch) || (':' == ch) || ('.' == ch) || ('-' == ch); 222 } 223 224 227 private boolean isAllRead() { 228 return _len >= 0 && _pos >= _len; 229 } 230 231 235 private void save(char ch) { 236 _saved.append(ch); 237 } 238 239 242 private void saveCurrent() { 243 if (!isAllRead()) { 244 save( _working[_pos] ); 245 } 246 } 247 248 252 private void saveCurrent(int size) throws IOException { 253 readIfNeeded(size); 254 int pos = _pos; 255 while ( !isAllRead() && (size > 0) ) { 256 save( _working[pos] ); 257 pos++; 258 size--; 259 } 260 } 261 262 267 private void skipWhitespaces() throws IOException { 268 while ( !isAllRead() && isWhitespace() ) { 269 saveCurrent(); 270 go(); 271 } 272 } 273 274 private void addSavedAsContent() { 275 if (_saved.length() > 0) { 276 addToken( new ContentToken(_saved.toString()) ); 277 _saved.delete(0, _saved.length()); 278 } 279 } 280 281 285 void start() throws IOException { 286 _currentTagToken = null; 288 _tokenList.clear(); 289 _asExpected = true; 290 _isScriptContext = false; 291 _isStyleContext = false; 292 _isLateForDoctype = false; 293 294 this._pos = WORKING_BUFFER_SIZE; 295 readIfNeeded(0); 296 297 while ( !isAllRead() ) { 298 _saved.delete(0, _saved.length()); 300 _currentTagToken = null; 301 _asExpected = true; 302 303 readIfNeeded(10); 305 306 if (_isScriptContext) { 307 if ( startsWith("</script") && (isWhitespace(_pos + 8) || isChar(_pos + 8, '>')) ) { 308 tagEnd(); 309 } else { 310 content(); 311 } 312 } else if (_isStyleContext) { 313 if ( startsWith("</style") && (isWhitespace(_pos + 7) || isChar(_pos + 7, '>')) ) { 314 tagEnd(); 315 } else { 316 content(); 317 } 318 } else { 319 if ( startsWith("<!doctype") ) { 320 if ( !_isLateForDoctype ) { 321 doctype(); 322 _isLateForDoctype = true; 323 } else { 324 ignore(); 325 } 326 } else if ( startsWith("</") && isIdentifierStartChar(_pos + 2) ) { 327 _isLateForDoctype = true; 328 tagEnd(); 329 } else if ( startsWith("<!--") ) { 330 comment(); 331 } else if ( startsWith("<") && isIdentifierStartChar(_pos + 1) ) { 332 _isLateForDoctype = true; 333 tagStart(); 334 } else { 335 content(); 336 } 337 } 338 } 339 340 _reader.close(); 341 } 342 343 349 private void tagStart() throws IOException { 350 saveCurrent(); 351 go(); 352 353 if ( isAllRead() ) { 354 return; 355 } 356 357 String tagName = identifier(); 358 _currentTagToken = new TagNode(tagName); 359 360 if (_asExpected) { 361 skipWhitespaces(); 362 tagAttributes(); 363 364 String originalSource = _saved.toString(); 365 addToken(_currentTagToken); 366 if ( isChar('>') ) { 367 go(); 368 if ( "script".equalsIgnoreCase(tagName) ) { 369 _isScriptContext = true; 370 } else if ( "style".equalsIgnoreCase(tagName) ) { 371 _isStyleContext = true; 372 } 373 originalSource += ">"; 374 } else if ( startsWith("/>") ) { 375 go(2); 376 addToken( new EndTagToken(tagName) ); 377 originalSource += "/>"; 378 } 379 380 _currentTagToken.setOriginalSource(originalSource); 381 _currentTagToken = null; 382 } else { 383 addSavedAsContent(); 384 } 385 } 386 387 388 394 private void tagEnd() throws IOException { 395 saveCurrent(2); 396 go(2); 397 398 if ( isAllRead() ) { 399 return; 400 } 401 402 String tagName = identifier(); 403 _currentTagToken = new EndTagToken(tagName); 404 405 if (_asExpected) { 406 skipWhitespaces(); 407 tagAttributes(); 408 409 String originalSource = _saved.toString(); 410 addToken(_currentTagToken); 411 412 if ( isChar('>') ) { 413 go(); 414 originalSource += ">"; 415 } 416 417 if ( "script".equalsIgnoreCase(tagName) ) { 418 _isScriptContext = false; 419 } else if ( "style".equalsIgnoreCase(tagName) ) { 420 _isStyleContext = false; 421 } 422 423 _currentTagToken.setOriginalSource(originalSource); 424 _currentTagToken = null; 425 } else { 426 addSavedAsContent(); 427 } 428 } 429 430 434 private String identifier() throws IOException { 435 _asExpected = true; 436 437 if ( !isIdentifierStartChar() ) { 438 _asExpected = false; 439 return null; 440 } 441 442 StringBuffer tagName = new StringBuffer (16); 443 444 while ( !isAllRead() && isIdentifierChar() ) { 445 saveCurrent(); 446 tagName.append( _working[_pos] ); 447 go(); 448 } 449 450 return tagName.toString(); 451 } 452 453 457 private void tagAttributes() throws IOException { 458 while( !isAllRead() && _asExpected && !isChar('>') && !startsWith("/>") ) { 459 skipWhitespaces(); 460 String attName = identifier(); 461 462 if (!_asExpected) { 463 if ( !isChar('<') && !isChar('>') && !startsWith("/>") ) { 464 saveCurrent(); 465 go(); 466 } 467 468 if (!isChar('<')) { 469 _asExpected = true; 470 } 471 472 continue; 473 } 474 475 String attValue = attName; 476 477 skipWhitespaces(); 478 if ( isChar('=') ) { 479 saveCurrent(); 480 go(); 481 attValue = attributeValue(); 482 } 483 484 if (_asExpected) { 485 _currentTagToken.addAttribute(attName, attValue); 486 } 487 } 488 } 489 490 498 private String attributeValue() throws IOException { 499 skipWhitespaces(); 500 501 if ( isChar('<') || isChar('>') || startsWith("/>") ) { 502 return ""; 503 } 504 505 boolean isQuoteMode = false; 506 boolean isAposMode = false; 507 508 StringBuffer result = new StringBuffer (); 509 510 if ( isChar('\'') ) { 511 isAposMode = true; 512 saveCurrent(); 513 go(); 514 } else if ( isChar('\"') ) { 515 isQuoteMode = true; 516 saveCurrent(); 517 go(); 518 } 519 520 while ( !isAllRead() && 521 ( (isAposMode && !isChar('\'')) || 522 (isQuoteMode && !isChar('\"')) || 523 (!isAposMode && !isQuoteMode && !isWhitespace() && !isChar('>') && !startsWith("/>")) 524 ) 525 ) { 526 result.append( _working[_pos] ); 527 saveCurrent(); 528 go(); 529 } 530 531 if ( isChar('\'') && isAposMode ) { 532 saveCurrent(); 533 go(); 534 } else if ( isChar('\"') && isQuoteMode ) { 535 saveCurrent(); 536 go(); 537 } 538 539 540 return result.toString(); 541 } 542 543 private void content() throws IOException { 544 while ( !isAllRead() ) { 545 saveCurrent(); 546 go(); 547 548 if ( isChar('<') ) { 549 break; 550 } 551 } 552 553 addSavedAsContent(); 554 } 555 556 private void ignore() throws IOException { 557 while ( !isAllRead() ) { 558 go(); 559 if ( isChar('<') ) { 560 break; 561 } 562 } 563 } 564 565 private void comment() throws IOException { 566 go(4); 567 while ( !isAllRead() && !startsWith("-->") ) { 568 saveCurrent(); 569 go(); 570 } 571 572 if (startsWith("-->")) { 573 go(3); 574 } 575 576 if (_saved.length() > 0) { 577 if ( !cleaner.isOmitComments() ) { 578 String hyphenRepl = cleaner.getHyphenReplacementInComment(); 579 String comment = _saved.toString().replaceAll("--", hyphenRepl + hyphenRepl); 580 581 if ( comment.length() > 0 && comment.charAt(0) == '-' ) { 582 comment = hyphenRepl + comment.substring(1); 583 } 584 int len = comment.length(); 585 if ( len > 0 && comment.charAt(len - 1) == '-' ) { 586 comment = comment.substring(0, len - 1) + hyphenRepl; 587 } 588 589 addToken( new CommentToken(comment) ); 590 } 591 _saved.delete(0, _saved.length()); 592 } 593 } 594 595 private void doctype() throws IOException { 596 go(9); 597 598 skipWhitespaces(); 599 String part1 = identifier(); 600 skipWhitespaces(); 601 String part2 = identifier(); 602 skipWhitespaces(); 603 String part3 = attributeValue(); 604 skipWhitespaces(); 605 String part4 = attributeValue(); 606 607 ignore(); 608 609 DoctypeToken _docType = new DoctypeToken(part1, part2, part3, part4); 610 611 if ( _docType.isValid() ) { 612 cleaner.setDoctype(_docType); 613 } 614 } 615 616 } | Popular Tags |