KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > htmlcleaner > HtmlTokenizer


1 /* Copyright (c) 2006-2007, Vladimir Nikic
2     All rights reserved.
3     
4     Redistribution and use of this software in source and binary forms,
5     with or without modification, are permitted provided that the following
6     conditions are met:
7     
8     * Redistributions of source code must retain the above
9       copyright notice, this list of conditions and the
10       following disclaimer.
11     
12     * Redistributions in binary form must reproduce the above
13       copyright notice, this list of conditions and the
14       following disclaimer in the documentation and/or other
15       materials provided with the distribution.
16     
17     * The name of HtmlCleaner may not be used to endorse or promote
18       products derived from this software without specific prior
19       written permission.
20
21     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
25     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31     POSSIBILITY OF SUCH DAMAGE.
32     
33     You can contact Vladimir Nikic by sending e-mail to
34     nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
35     subject line.
36 */

37
38 package org.htmlcleaner;
39
40 import java.io.*;
41 import java.util.*;
42
43 /**
44  * Main HTML tokenizer.
45  * <p>Its taks is to parse HTML and produce list of valid tokens:
46  * open tag tokens, end tag tokens, contents (text) and comments.
47  * As soon as new item is added to token list, cleaner is invoked
48  * to clean current list at the end.</p>
49  *
50  * Created by: Vladimir Nikic.<br>
51  * Date: November, 2006
52
53  */

54 public class HtmlTokenizer {
55     
56     private final static int WORKING_BUFFER_SIZE = 1024;
57
58     private BufferedReader _reader;
59     private char[] _working = new char[WORKING_BUFFER_SIZE];
60     
61     private transient int _pos = 0;
62     private transient int _len = -1;
63
64     private transient StringBuffer JavaDoc _saved = new StringBuffer JavaDoc(512);
65
66     private transient boolean _isLateForDoctype = false;
67     private transient TagToken _currentTagToken = null;
68     private transient List _tokenList = new ArrayList();
69
70     private boolean _asExpected = true;
71
72     private boolean _isScriptContext = false;
73     private boolean _isStyleContext = false;
74
75     private HtmlCleaner cleaner;
76
77     /**
78      * Constructor - cretes instance of the parser with specified content.
79      * @param cleaner
80      * @throws IOException
81      */

82     public HtmlTokenizer(HtmlCleaner cleaner) throws IOException {
83         this._reader = new BufferedReader( cleaner.getReader() );
84         this.cleaner = cleaner;
85     }
86
87     private void addToken(BaseToken token) {
88         _tokenList.add(token);
89         cleaner.makeTree( _tokenList, _tokenList.listIterator(_tokenList.size() - 1) );
90     }
91
92     private void readIfNeeded(int neededChars) throws IOException {
93         if (_len == -1 && _pos + neededChars >= WORKING_BUFFER_SIZE) {
94             int numToCopy = WORKING_BUFFER_SIZE - _pos;
95             System.arraycopy(_working, _pos, _working, 0, numToCopy);
96             _pos = 0;
97             int size = _reader.read(_working, numToCopy, WORKING_BUFFER_SIZE - numToCopy);
98
99             if (size < WORKING_BUFFER_SIZE - numToCopy) {
100                 _len = (size == -1 && numToCopy == 0) ? 0 : size + numToCopy;
101             } else if (size == -1) {
102                 _len = numToCopy;
103             }
104         }
105     }
106
107     List getTokenList() {
108         return this._tokenList;
109     }
110
111     private void go() throws IOException {
112         _pos++;
113         readIfNeeded(0);
114     }
115
116     private void go(int step) throws IOException {
117         _pos += step;
118         readIfNeeded(step - 1);
119     }
120
121     /**
122      * Checks if content starts with specified value at the current position.
123      * @param value
124      * @return true if starts with specified value, false otherwise.
125      * @throws IOException
126      */

127     private boolean startsWith(String JavaDoc value) throws IOException {
128         int valueLen = value.length();
129         readIfNeeded(valueLen);
130         if (_len >= 0 && _pos + valueLen > _len) {
131             return false;
132         }
133
134         for (int i = 0; i < valueLen; i++) {
135             char ch1 = Character.toLowerCase( value.charAt(i) );
136             char ch2 = Character.toLowerCase( _working[_pos + i] );
137             if (ch1 != ch2) {
138                 return false;
139             }
140         }
141
142         return true;
143     }
144
145     /**
146      * Checks if character at specified position is whitespace.
147      * @param position
148      * @return true is whitespace, false otherwise.
149      */

150     private boolean isWhitespace(int position) {
151         if (_len >= 0 && position >= _len) {
152             return false;
153         }
154
155         return Character.isWhitespace( _working[position] );
156     }
157
158     /**
159      * Checks if character at current runtime position is whitespace.
160      * @return true is whitespace, false otherwise.
161      */

162     private boolean isWhitespace() {
163         return isWhitespace(_pos);
164     }
165
166     /**
167      * Checks if character at specified position is equal to specified char.
168      * @param position
169      * @param ch
170      * @return true is equals, false otherwise.
171      */

172     private boolean isChar(int position, char ch) {
173         if (_len >= 0 && position >= _len) {
174             return false;
175         }
176
177         return Character.toLowerCase(ch) == Character.toLowerCase(_working[position]);
178     }
179
180     /**
181      * Checks if character at current runtime position is equal to specified char.
182      * @param ch
183      * @return true is equal, false otherwise.
184      */

185     private boolean isChar(char ch) {
186         return isChar(_pos, ch);
187     }
188
189     /**
190      * Checks if character at specified position can be identifier start.
191      * @param position
192      * @return true is may be identifier start, false otherwise.
193      */

194     private boolean isIdentifierStartChar(int position) {
195         if (_len >= 0 && position >= _len) {
196             return false;
197         }
198
199         char ch = _working[position];
200         return Character.isUnicodeIdentifierStart(ch) || (':' == ch);
201     }
202
203     /**
204      * Checks if character at current runtime position can be identifier start.
205      * @return true is may be identifier start, false otherwise.
206      */

207     private boolean isIdentifierStartChar() {
208         return isIdentifierStartChar(_pos);
209     }
210
211     /**
212      * Checks if character at current runtime position can be identifier part.
213      * @return true is may be identifier part, false otherwise.
214      */

215     private boolean isIdentifierChar() {
216         if (_len >= 0 && _pos >= _len) {
217             return false;
218         }
219
220         char ch = _working[_pos];
221         return Character.isUnicodeIdentifierStart(ch) || Character.isDigit(ch) || (':' == ch) || ('.' == ch) || ('-' == ch);
222     }
223
224     /**
225      * Checks if end of the content is reached.
226      */

227     private boolean isAllRead() {
228         return _len >= 0 && _pos >= _len;
229     }
230
231     /**
232      * Saves specified character to the temporary buffer.
233      * @param ch
234      */

235     private void save(char ch) {
236         _saved.append(ch);
237     }
238
239     /**
240      * Saves character at current runtime position to the temporary buffer.
241      */

242     private void saveCurrent() {
243         if (!isAllRead()) {
244             save( _working[_pos] );
245         }
246     }
247
248     /**
249      * Saves specified number of characters at current runtime position to the temporary buffer.
250      * @throws IOException
251      */

252     private void saveCurrent(int size) throws IOException {
253         readIfNeeded(size);
254         int pos = _pos;
255         while ( !isAllRead() && (size > 0) ) {
256             save( _working[pos] );
257             pos++;
258             size--;
259         }
260     }
261
262     /**
263      * Skips whitespaces at current position and moves foreward until
264      * non-whitespace character is found or the end of content is reached.
265      * @throws IOException
266      */

267     private void skipWhitespaces() throws IOException {
268         while ( !isAllRead() && isWhitespace() ) {
269             saveCurrent();
270             go();
271         }
272     }
273
274     private void addSavedAsContent() {
275         if (_saved.length() > 0) {
276             addToken( new ContentToken(_saved.toString()) );
277             _saved.delete(0, _saved.length());
278         }
279     }
280
281     /**
282      * Starts parsing HTML.
283      * @throws IOException
284      */

285     void start() throws IOException {
286         // initialize runtime values
287
_currentTagToken = null;
288         _tokenList.clear();
289         _asExpected = true;
290         _isScriptContext = false;
291         _isStyleContext = false;
292         _isLateForDoctype = false;
293
294         this._pos = WORKING_BUFFER_SIZE;
295         readIfNeeded(0);
296
297         while ( !isAllRead() ) {
298             // resets all the runtime values
299
_saved.delete(0, _saved.length());
300             _currentTagToken = null;
301             _asExpected = true;
302
303             // this is enough for making decision
304
readIfNeeded(10);
305
306             if (_isScriptContext) {
307                 if ( startsWith("</script") && (isWhitespace(_pos + 8) || isChar(_pos + 8, '>')) ) {
308                     tagEnd();
309                 } else {
310                     content();
311                 }
312             } else if (_isStyleContext) {
313                 if ( startsWith("</style") && (isWhitespace(_pos + 7) || isChar(_pos + 7, '>')) ) {
314                     tagEnd();
315                 } else {
316                     content();
317                 }
318             } else {
319                 if ( startsWith("<!doctype") ) {
320                     if ( !_isLateForDoctype ) {
321                         doctype();
322                         _isLateForDoctype = true;
323                     } else {
324                         ignore();
325                     }
326                 } else if ( startsWith("</") && isIdentifierStartChar(_pos + 2) ) {
327                     _isLateForDoctype = true;
328                     tagEnd();
329                 } else if ( startsWith("<!--") ) {
330                     comment();
331                 } else if ( startsWith("<") && isIdentifierStartChar(_pos + 1) ) {
332                     _isLateForDoctype = true;
333                     tagStart();
334                 } else {
335                     content();
336                 }
337             }
338         }
339
340         _reader.close();
341     }
342
343     /**
344      * Parses start of the tag.
345      * It expects that current position is at the "<" after which
346      * the tag's name follows.
347      * @throws IOException
348      */

349     private void tagStart() throws IOException {
350         saveCurrent();
351         go();
352
353         if ( isAllRead() ) {
354             return;
355         }
356
357         String JavaDoc tagName = identifier();
358         _currentTagToken = new TagNode(tagName);
359
360         if (_asExpected) {
361             skipWhitespaces();
362             tagAttributes();
363
364             String JavaDoc originalSource = _saved.toString();
365             addToken(_currentTagToken);
366             if ( isChar('>') ) {
367                 go();
368                 if ( "script".equalsIgnoreCase(tagName) ) {
369                     _isScriptContext = true;
370                 } else if ( "style".equalsIgnoreCase(tagName) ) {
371                     _isStyleContext = true;
372                 }
373                 originalSource += ">";
374             } else if ( startsWith("/>") ) {
375                 go(2);
376                 addToken( new EndTagToken(tagName) );
377                 originalSource += "/>";
378             }
379
380             _currentTagToken.setOriginalSource(originalSource);
381             _currentTagToken = null;
382         } else {
383             addSavedAsContent();
384         }
385     }
386
387
388     /**
389      * Parses end of the tag.
390      * It expects that current position is at the "<" after which
391      * "/" and the tag's name follows.
392      * @throws IOException
393      */

394     private void tagEnd() throws IOException {
395         saveCurrent(2);
396         go(2);
397
398         if ( isAllRead() ) {
399             return;
400         }
401
402         String JavaDoc tagName = identifier();
403         _currentTagToken = new EndTagToken(tagName);
404
405         if (_asExpected) {
406             skipWhitespaces();
407             tagAttributes();
408
409             String JavaDoc originalSource = _saved.toString();
410             addToken(_currentTagToken);
411
412             if ( isChar('>') ) {
413                 go();
414                 originalSource += ">";
415             }
416
417             if ( "script".equalsIgnoreCase(tagName) ) {
418                 _isScriptContext = false;
419             } else if ( "style".equalsIgnoreCase(tagName) ) {
420                 _isStyleContext = false;
421             }
422
423             _currentTagToken.setOriginalSource(originalSource);
424             _currentTagToken = null;
425         } else {
426             addSavedAsContent();
427         }
428     }
429
430     /**
431      * Parses an identifier from the current position.
432      * @throws IOException
433      */

434     private String JavaDoc identifier() throws IOException {
435         _asExpected = true;
436
437         if ( !isIdentifierStartChar() ) {
438             _asExpected = false;
439             return null;
440         }
441
442         StringBuffer JavaDoc tagName = new StringBuffer JavaDoc(16);
443
444         while ( !isAllRead() && isIdentifierChar() ) {
445             saveCurrent();
446             tagName.append( _working[_pos] );
447             go();
448         }
449
450         return tagName.toString();
451     }
452
453     /**
454      * Parses list tag attributes from the current position.
455      * @throws IOException
456      */

457     private void tagAttributes() throws IOException {
458         while( !isAllRead() && _asExpected && !isChar('>') && !startsWith("/>") ) {
459             skipWhitespaces();
460             String JavaDoc attName = identifier();
461
462             if (!_asExpected) {
463                 if ( !isChar('<') && !isChar('>') && !startsWith("/>") ) {
464                     saveCurrent();
465                     go();
466                 }
467
468                 if (!isChar('<')) {
469                     _asExpected = true;
470                 }
471
472                 continue;
473             }
474
475             String JavaDoc attValue = attName;
476
477             skipWhitespaces();
478             if ( isChar('=') ) {
479                 saveCurrent();
480                 go();
481                 attValue = attributeValue();
482             }
483
484             if (_asExpected) {
485                 _currentTagToken.addAttribute(attName, attValue);
486             }
487         }
488     }
489
490     /**
491      * Parses a single tag attribute - it is expected to be in one of the forms:
492      * name=value
493      * name="value"
494      * name='value'
495      * name
496      * @throws IOException
497      */

498     private String JavaDoc attributeValue() throws IOException {
499         skipWhitespaces();
500         
501         if ( isChar('<') || isChar('>') || startsWith("/>") ) {
502             return "";
503         }
504
505         boolean isQuoteMode = false;
506         boolean isAposMode = false;
507
508         StringBuffer JavaDoc result = new StringBuffer JavaDoc();
509
510         if ( isChar('\'') ) {
511             isAposMode = true;
512             saveCurrent();
513             go();
514         } else if ( isChar('\"') ) {
515             isQuoteMode = true;
516             saveCurrent();
517             go();
518         }
519
520         while ( !isAllRead() &&
521                 ( (isAposMode && !isChar('\'')) ||
522                   (isQuoteMode && !isChar('\"')) ||
523                   (!isAposMode && !isQuoteMode && !isWhitespace() && !isChar('>') && !startsWith("/>"))
524                 )
525               ) {
526             result.append( _working[_pos] );
527             saveCurrent();
528             go();
529         }
530
531         if ( isChar('\'') && isAposMode ) {
532             saveCurrent();
533             go();
534         } else if ( isChar('\"') && isQuoteMode ) {
535             saveCurrent();
536             go();
537         }
538
539
540         return result.toString();
541     }
542
543     private void content() throws IOException {
544         while ( !isAllRead() ) {
545             saveCurrent();
546             go();
547
548             if ( isChar('<') ) {
549                 break;
550             }
551         }
552
553         addSavedAsContent();
554     }
555
556     private void ignore() throws IOException {
557         while ( !isAllRead() ) {
558             go();
559             if ( isChar('<') ) {
560                 break;
561             }
562         }
563     }
564
565     private void comment() throws IOException {
566         go(4);
567         while ( !isAllRead() && !startsWith("-->") ) {
568             saveCurrent();
569             go();
570         }
571
572         if (startsWith("-->")) {
573             go(3);
574         }
575
576         if (_saved.length() > 0) {
577             if ( !cleaner.isOmitComments() ) {
578                 String JavaDoc hyphenRepl = cleaner.getHyphenReplacementInComment();
579                 String JavaDoc comment = _saved.toString().replaceAll("--", hyphenRepl + hyphenRepl);
580
581                 if ( comment.length() > 0 && comment.charAt(0) == '-' ) {
582                     comment = hyphenRepl + comment.substring(1);
583                 }
584                 int len = comment.length();
585                 if ( len > 0 && comment.charAt(len - 1) == '-' ) {
586                     comment = comment.substring(0, len - 1) + hyphenRepl;
587                 }
588
589                 addToken( new CommentToken(comment) );
590             }
591             _saved.delete(0, _saved.length());
592         }
593     }
594     
595     private void doctype() throws IOException {
596         go(9);
597
598         skipWhitespaces();
599         String JavaDoc part1 = identifier();
600         skipWhitespaces();
601         String JavaDoc part2 = identifier();
602         skipWhitespaces();
603         String JavaDoc part3 = attributeValue();
604         skipWhitespaces();
605         String JavaDoc part4 = attributeValue();
606         
607         ignore();
608         
609         DoctypeToken _docType = new DoctypeToken(part1, part2, part3, part4);
610         
611         if ( _docType.isValid() ) {
612             cleaner.setDoctype(_docType);
613         }
614     }
615
616 }
Popular Tags