HtmlTokenizer


1   /*  Copyright (c) 2006-2007, Vladimir Nikic
2       All rights reserved.
3       
4       Redistribution and use of this software in source and binary forms, 
5       with or without modification, are permitted provided that the following 
6       conditions are met:
7       
8       * Redistributions of source code must retain the above
9         copyright notice, this list of conditions and the
10        following disclaimer.
11      
12      * Redistributions in binary form must reproduce the above
13        copyright notice, this list of conditions and the
14        following disclaimer in the documentation and/or other
15        materials provided with the distribution.
16      
17      * The name of HtmlCleaner may not be used to endorse or promote 
18        products derived from this software without specific prior
19        written permission.
20  
21      THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
22      AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
23      IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
24      ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
25      LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
26      CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
27      SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
28      INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
29      CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
30      ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
31      POSSIBILITY OF SUCH DAMAGE.
32      
33      You can contact Vladimir Nikic by sending e-mail to
34      nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
35      subject line.
36  */
37  
38  package org.htmlcleaner;
39  
40  import java.io.*;
41  import java.util.*;
42  
43  /**
44   * Main HTML tokenizer.
45   * <p>Its taks is to parse HTML and produce list of valid tokens:
46   * open tag tokens, end tag tokens, contents (text) and comments.
47   * As soon as new item is added to token list, cleaner is invoked
48   * to clean current list at the end.</p>
49   *
50   * Created by: Vladimir Nikic.<br>
51   * Date: November, 2006
52  
53   */
54  public class HtmlTokenizer {
55      
56      private final static int WORKING_BUFFER_SIZE = 1024;
57  
58      private BufferedReader _reader;
59      private char[] _working = new char[WORKING_BUFFER_SIZE];
60      
61      private transient int _pos = 0;
62      private transient int _len = -1;
63  
64      private transient StringBuffer   _saved = new StringBuffer  (512);
65  
66      private transient boolean _isLateForDoctype = false;
67      private transient TagToken _currentTagToken = null;
68      private transient List _tokenList = new ArrayList();
69  
70      private boolean _asExpected = true;
71  
72      private boolean _isScriptContext = false;
73      private boolean _isStyleContext = false;
74  
75      private HtmlCleaner cleaner;
76  
77      /**
78       * Constructor - cretes instance of the parser with specified content.
79       * @param cleaner
80       * @throws IOException
81       */
82      public HtmlTokenizer(HtmlCleaner cleaner) throws IOException {
83          this._reader = new BufferedReader( cleaner.getReader() );
84          this.cleaner = cleaner;
85      }
86  
87      private void addToken(BaseToken token) {
88          _tokenList.add(token);
89          cleaner.makeTree( _tokenList, _tokenList.listIterator(_tokenList.size() - 1) );
90      }
91  
92      private void readIfNeeded(int neededChars) throws IOException {
93          if (_len == -1 && _pos + neededChars >= WORKING_BUFFER_SIZE) {
94              int numToCopy = WORKING_BUFFER_SIZE - _pos;
95              System.arraycopy(_working, _pos, _working, 0, numToCopy);
96              _pos = 0;
97              int size = _reader.read(_working, numToCopy, WORKING_BUFFER_SIZE - numToCopy);
98  
99              if (size < WORKING_BUFFER_SIZE - numToCopy) {
100                 _len = (size == -1 && numToCopy == 0) ? 0 : size + numToCopy;
101             } else if (size == -1) {
102                 _len = numToCopy;
103             }
104         }
105     }
106 
107     List getTokenList() {
108         return this._tokenList;
109     }
110 
111     private void go() throws IOException {
112         _pos++;
113         readIfNeeded(0);
114     }
115 
116     private void go(int step) throws IOException {
117         _pos += step;
118         readIfNeeded(step - 1);
119     }
120 
121     /**
122      * Checks if content starts with specified value at the current position.
123      * @param value
124      * @return true if starts with specified value, false otherwise.
125      * @throws IOException
126      */
127     private boolean startsWith(String   value) throws IOException {
128         int valueLen = value.length();
129         readIfNeeded(valueLen);
130         if (_len >= 0 && _pos + valueLen  > _len) {
131             return false;
132         }
133 
134         for (int i = 0; i < valueLen; i++) {
135             char ch1 = Character.toLowerCase( value.charAt(i) );
136             char ch2 = Character.toLowerCase( _working[_pos + i] );
137             if (ch1 != ch2) {
138                 return false;
139             }
140         }
141 
142         return true;
143     }
144 
145     /**
146      * Checks if character at specified position is whitespace.
147      * @param position
148      * @return true is whitespace, false otherwise.
149      */
150     private boolean isWhitespace(int position) {
151         if (_len >= 0 && position >= _len) {
152             return false;
153         }
154 
155         return Character.isWhitespace( _working[position] );
156     }
157 
158     /**
159      * Checks if character at current runtime position is whitespace.
160      * @return true is whitespace, false otherwise.
161      */
162     private boolean isWhitespace() {
163         return isWhitespace(_pos);
164     }
165 
166     /**
167      * Checks if character at specified position is equal to specified char.
168      * @param position
169      * @param ch
170      * @return true is equals, false otherwise.
171      */
172     private boolean isChar(int position, char ch) {
173         if (_len >= 0 && position >= _len) {
174             return false;
175         }
176 
177         return Character.toLowerCase(ch) == Character.toLowerCase(_working[position]);
178     }
179 
180     /**
181      * Checks if character at current runtime position is equal to specified char.
182      * @param ch
183      * @return true is equal, false otherwise.
184      */
185     private boolean isChar(char ch) {
186         return isChar(_pos, ch);
187     }
188 
189     /**
190      * Checks if character at specified position can be identifier start.
191      * @param position
192      * @return true is may be identifier start, false otherwise.
193      */
194     private boolean isIdentifierStartChar(int position) {
195         if (_len >= 0 && position >= _len) {
196             return false;
197         }
198 
199         char ch = _working[position];
200         return Character.isUnicodeIdentifierStart(ch) || (':' == ch);
201     }
202 
203     /**
204      * Checks if character at current runtime position can be identifier start.
205      * @return true is may be identifier start, false otherwise.
206      */
207     private boolean isIdentifierStartChar() {
208         return isIdentifierStartChar(_pos);
209     }
210 
211     /**
212      * Checks if character at current runtime position can be identifier part.
213      * @return true is may be identifier part, false otherwise.
214      */
215     private boolean isIdentifierChar() {
216         if (_len >= 0 && _pos >= _len) {
217             return false;
218         }
219 
220         char ch = _working[_pos];
221         return Character.isUnicodeIdentifierStart(ch) || Character.isDigit(ch) || (':' == ch) || ('.' == ch) || ('-' == ch);
222     }
223 
224     /**
225      * Checks if end of the content is reached.
226      */
227     private boolean isAllRead() {
228         return _len >= 0 && _pos >= _len;
229     }
230 
231     /**
232      * Saves specified character to the temporary buffer.
233      * @param ch
234      */
235     private void save(char ch) {
236         _saved.append(ch);
237     }
238 
239     /**
240      * Saves character at current runtime position to the temporary buffer.
241      */
242     private void saveCurrent() {
243         if (!isAllRead()) {
244             save( _working[_pos] );
245         }
246     }
247 
248     /**
249      * Saves specified number of characters at current runtime position to the temporary buffer.
250      * @throws IOException
251      */
252     private void saveCurrent(int size) throws IOException {
253         readIfNeeded(size);
254         int pos = _pos;
255         while ( !isAllRead() && (size > 0) ) {
256             save( _working[pos] );
257             pos++;
258             size--;
259         }
260     }
261 
262     /**
263      * Skips whitespaces at current position and moves foreward until
264      * non-whitespace character is found or the end of content is reached.
265      * @throws IOException
266      */
267     private void skipWhitespaces() throws IOException {
268         while ( !isAllRead() && isWhitespace() ) {
269             saveCurrent();
270             go();
271         }
272     }
273 
274     private void addSavedAsContent() {
275         if (_saved.length() > 0) {
276             addToken( new ContentToken(_saved.toString()) );
277             _saved.delete(0, _saved.length());
278         }
279     }
280 
281     /**
282      * Starts parsing HTML.
283      * @throws IOException
284      */
285     void start() throws IOException {
286         // initialize runtime values
287         _currentTagToken = null;
288         _tokenList.clear();
289         _asExpected = true;
290         _isScriptContext = false;
291         _isStyleContext = false;
292         _isLateForDoctype = false;
293 
294         this._pos = WORKING_BUFFER_SIZE;
295         readIfNeeded(0);
296 
297         while ( !isAllRead() ) {
298             // resets all the runtime values
299             _saved.delete(0, _saved.length());
300             _currentTagToken = null;
301             _asExpected = true;
302 
303             // this is enough for making decision
304             readIfNeeded(10);
305 
306             if (_isScriptContext) {
307                 if ( startsWith("</script") && (isWhitespace(_pos + 8) || isChar(_pos + 8, '>')) ) {
308                     tagEnd();
309                 } else {
310                     content();
311                 }
312             } else if (_isStyleContext) {
313                 if ( startsWith("</style") && (isWhitespace(_pos + 7) || isChar(_pos + 7, '>')) ) {
314                     tagEnd();
315                 } else {
316                     content();
317                 }
318             } else {
319                 if ( startsWith("<!doctype") ) {
320                     if ( !_isLateForDoctype ) {
321                         doctype();
322                         _isLateForDoctype = true;
323                     } else {
324                         ignore();
325                     }
326                 } else if ( startsWith("</") && isIdentifierStartChar(_pos + 2) ) {
327                     _isLateForDoctype = true;
328                     tagEnd();
329                 } else if ( startsWith("<!--") ) {
330                     comment();
331                 } else if ( startsWith("<") && isIdentifierStartChar(_pos + 1) ) {
332                     _isLateForDoctype = true;
333                     tagStart();
334                 } else {
335                     content();
336                 }
337             }
338         }
339 
340         _reader.close();
341     }
342 
343     /**
344      * Parses start of the tag.
345      * It expects that current position is at the "<" after which
346      * the tag's name follows.
347      * @throws IOException
348      */
349     private void tagStart() throws IOException {
350         saveCurrent();
351         go();
352 
353         if ( isAllRead() ) {
354             return;
355         }
356 
357         String   tagName = identifier();
358         _currentTagToken = new TagNode(tagName);
359 
360         if (_asExpected) {
361             skipWhitespaces();
362             tagAttributes();
363 
364             String   originalSource = _saved.toString();
365             addToken(_currentTagToken);
366             if ( isChar('>') ) {
367                 go();
368                 if ( "script".equalsIgnoreCase(tagName) ) {
369                     _isScriptContext = true;
370                 } else if ( "style".equalsIgnoreCase(tagName) ) {
371                     _isStyleContext = true;
372                 }
373                 originalSource += ">";
374             } else if ( startsWith("/>") ) {
375                 go(2);
376                 addToken( new EndTagToken(tagName) );
377                 originalSource += "/>";
378             }
379 
380             _currentTagToken.setOriginalSource(originalSource);
381             _currentTagToken = null;
382         } else {
383             addSavedAsContent();
384         }
385     }
386 
387 
388     /**
389      * Parses end of the tag.
390      * It expects that current position is at the "<" after which
391      * "/" and the tag's name follows.
392      * @throws IOException
393      */
394     private void tagEnd() throws IOException {
395         saveCurrent(2);
396         go(2);
397 
398         if ( isAllRead() ) {
399             return;
400         }
401 
402         String   tagName = identifier();
403         _currentTagToken = new EndTagToken(tagName);
404 
405         if (_asExpected) {
406             skipWhitespaces();
407             tagAttributes();
408 
409             String   originalSource = _saved.toString();
410             addToken(_currentTagToken);
411 
412             if ( isChar('>') ) {
413                 go();
414                 originalSource += ">";
415             }
416 
417             if ( "script".equalsIgnoreCase(tagName) ) {
418                 _isScriptContext = false;
419             } else if ( "style".equalsIgnoreCase(tagName) ) {
420                 _isStyleContext = false;
421             }
422 
423             _currentTagToken.setOriginalSource(originalSource);
424             _currentTagToken = null;
425         } else {
426             addSavedAsContent();
427         }
428     }
429 
430     /**
431      * Parses an identifier from the current position.
432      * @throws IOException
433      */
434     private String   identifier() throws IOException {
435         _asExpected = true;
436 
437         if ( !isIdentifierStartChar() ) {
438             _asExpected = false;
439             return null;
440         }
441 
442         StringBuffer   tagName = new StringBuffer  (16);
443 
444         while ( !isAllRead() && isIdentifierChar() ) {
445             saveCurrent();
446             tagName.append( _working[_pos] );
447             go();
448         }
449 
450         return tagName.toString();
451     }
452 
453     /**
454      * Parses list tag attributes from the current position.
455      * @throws IOException
456      */
457     private void tagAttributes() throws IOException {
458         while( !isAllRead() && _asExpected && !isChar('>') && !startsWith("/>") ) {
459             skipWhitespaces();
460             String   attName = identifier();
461 
462             if (!_asExpected) {
463                 if ( !isChar('<') && !isChar('>') && !startsWith("/>") ) {
464                     saveCurrent();
465                     go();
466                 }
467 
468                 if (!isChar('<')) {
469                     _asExpected = true;
470                 }
471 
472                 continue;
473             }
474 
475             String   attValue = attName;
476 
477             skipWhitespaces();
478             if ( isChar('=') ) {
479                 saveCurrent();
480                 go();
481                 attValue = attributeValue();
482             }
483 
484             if (_asExpected) {
485                 _currentTagToken.addAttribute(attName, attValue);
486             }
487         }
488     }
489 
490     /**
491      * Parses a single tag attribute - it is expected to be in one of the forms:
492      *      name=value
493      *      name="value"
494      *      name='value'
495      *      name
496      * @throws IOException
497      */
498     private String   attributeValue() throws IOException {
499         skipWhitespaces();
500         
501         if ( isChar('<') || isChar('>') || startsWith("/>") ) {
502             return "";
503         }
504 
505         boolean isQuoteMode = false;
506         boolean isAposMode = false;
507 
508         StringBuffer   result = new StringBuffer  ();
509 
510         if ( isChar('\'') ) {
511             isAposMode = true;
512             saveCurrent();
513             go();
514         } else if ( isChar('\"') ) {
515             isQuoteMode = true;
516             saveCurrent();
517             go();
518         }
519 
520         while ( !isAllRead() &&
521                 ( (isAposMode && !isChar('\'')) ||
522                   (isQuoteMode && !isChar('\"')) ||
523                   (!isAposMode && !isQuoteMode && !isWhitespace() && !isChar('>') && !startsWith("/>"))
524                 )
525               ) {
526             result.append( _working[_pos] );
527             saveCurrent();
528             go();
529         }
530 
531         if ( isChar('\'') && isAposMode ) {
532             saveCurrent();
533             go();
534         } else if ( isChar('\"') && isQuoteMode ) {
535             saveCurrent();
536             go();
537         }
538 
539 
540         return result.toString();
541     }
542 
543     private void content() throws IOException {
544         while ( !isAllRead() ) {
545             saveCurrent();
546             go();
547 
548             if ( isChar('<') ) {
549                 break;
550             }
551         }
552 
553         addSavedAsContent();
554     }
555 
556     private void ignore() throws IOException {
557         while ( !isAllRead() ) {
558             go();
559             if ( isChar('<') ) {
560                 break;
561             }
562         }
563     }
564 
565     private void comment() throws IOException {
566         go(4);
567         while ( !isAllRead() && !startsWith("-->") ) {
568             saveCurrent();
569             go();
570         }
571 
572         if (startsWith("-->")) {
573             go(3);
574         }
575 
576         if (_saved.length() > 0) {
577             if ( !cleaner.isOmitComments() ) {
578                 String   hyphenRepl = cleaner.getHyphenReplacementInComment();
579                 String   comment = _saved.toString().replaceAll("--", hyphenRepl + hyphenRepl);
580 
581                 if ( comment.length() > 0 && comment.charAt(0) == '-' ) {
582                     comment = hyphenRepl + comment.substring(1);
583                 }
584                 int len = comment.length();
585                 if ( len > 0 && comment.charAt(len - 1) == '-' ) {
586                     comment = comment.substring(0, len - 1) + hyphenRepl;
587                 }
588 
589                 addToken( new CommentToken(comment) );
590             }
591             _saved.delete(0, _saved.length());
592         }
593     }
594     
595     private void doctype() throws IOException {
596         go(9);
597 
598         skipWhitespaces();
599         String   part1 = identifier();
600         skipWhitespaces();
601         String   part2 = identifier();
602         skipWhitespaces();
603         String   part3 = attributeValue();
604         skipWhitespaces();
605         String   part4 = attributeValue();
606         
607         ignore();
608         
609         DoctypeToken _docType = new DoctypeToken(part1, part2, part3, part4);
610         
611         if ( _docType.isValid() ) {
612             cleaner.setDoctype(_docType);
613         }
614     }
615 
616 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags