KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > javolution > xml > pull > XmlPullParserImpl


1 /*
2  * Javolution - Java(TM) Solution for Real-Time and Embedded Systems
3  * Copyright (C) 2005 - Javolution (http://javolution.org/)
4  * All rights reserved.
5  *
6  * Permission to use, copy, modify, and distribute this software is
7  * freely granted, provided that this notice is preserved.
8  */

9 package javolution.xml.pull;
10
11 import java.io.IOException;
12 import java.io.InputStream;
13 import java.io.InputStreamReader;
14 import java.io.Reader;
15 import java.io.UnsupportedEncodingException;
16
17 import j2me.lang.CharSequence;
18 import j2me.lang.IllegalStateException;
19 import j2me.nio.ByteBuffer;
20
21 import javolution.io.Utf8ByteBufferReader;
22 import javolution.io.Utf8StreamReader;
23 import javolution.lang.PersistentReference;
24 import javolution.lang.Reusable;
25 import javolution.lang.Text;
26 import javolution.lang.TypeFormat;
27 import javolution.realtime.ObjectFactory;
28 import javolution.util.FastComparator;
29 import javolution.util.FastTable;
30 import javolution.xml.sax.Attributes;
31 import javolution.xml.sax.AttributesImpl;
32
33 /**
34  * <p> This class provides a real-time XPP-like XML parser; this parser is
35  * <i>extremely</i> fast and <b>does not create temporary objects</b>
36  * (no garbage generated and no GC interruption).</p>
37  *
38  * <p> The parser input source can be either a {@link #setInput(Reader) Reader},
39  * an {@link #setInput(InputStream) InputStream} or even a {@link
40  * #setInput(ByteBuffer) ByteBuffer} (e.g. <code>MappedByteBuffer</code>).</p>
41  *
42  * <p> This parser is light (less than 15Kbytes compressed) and maintains
43  * a very small memory footprint while parsing (e.g. less than 10Kbytes
44  * while parsing 32Mbytes files). Typical applications include SOAP
45  * messaging, embedded/realtime systems, web servers (possibly thousands
46  * instances running concurrently), etc.</p>
47  *
48  * <p> Namespaces (SAX2 feature), comments, predefined entities
49  * (<code>&amp;amp;, &amp;lt;, &amp;gt;, &amp;apos;, &amp;quot;</code>)
50  * numeric character references (e.g. <code>&amp;#10;</code> for
51  * linefeed) and <code>CDATA</code> are recognized. Processing instructions,
52  * comments and entities declarations are ignored.</p>
53  *
54  * <p> The {@link CharSequence CharSequence} generated by this parser have
55  * the following characteristics:
56  * <ul>
57  * <li> They are immutable within their definition scope. The <code>
58  * CharSequence</code> created while parsing an XML element are
59  * reused only after the element is out-of-scope (idem for the
60  * {@link #getSaxAttributes attributes lists}).</li>
61  * <li> They support equality or lexical comparison with any
62  * <code>CharSequence</code> (e.g. <code>String</code>).</li>
63  * <li> They have the same hash code as <code>String</code> and can be
64  * used to retrieve data from a <code>Map</code> (e.g.
65  * {@link javolution.util.FastMap FastMap}) for which
66  * the keys are <code>String</code> instances.</li>
67  * <li> Like any <code>CharSequence</code>, they can be parsed
68  * to primitive types (e.g. int, double) using the utility class
69  * {@link TypeFormat}.</li>
70  * </ul></p>
71  *
72  * <p> Finally, this parser does not break up character data during call back
73  * (the whole character data between markups is always being returned).</p>
74  *
75  * @author <a HREF="mailto:javolution@arakelian.com">Gregory Arakelian</a>
76  * @author <a HREF="mailto:jean-marie@dautelle.com">Jean-Marie Dautelle</a>
77  * @version 3.3, May 10, 2005
78  */

79 public final class XmlPullParserImpl implements XmlPullParser, Reusable {
80
81     /**
82      * Holds a factory producing AttributesImpl instances.
83      */

84     private static final ObjectFactory ATTRIBUTES_IMPL_FACTORY = new ObjectFactory() {
85         protected Object create() {
86             return new AttributesImpl();
87         }
88     };
89
90     /**
91      * Holds the reader buffer capacity.
92      */

93     private static final int READER_BUFFER_CAPACITY = 2048;
94
95     /**
96      * Holds the configurable nominal length for the data array length (must be
97      * larger than the reader buffer capacity to avoid overflow).
98      */

99     private static final PersistentReference DATA_SIZE = new PersistentReference(
100             "javolution.xml.pull.XmlPullParserImpl#DATA_SIZE", new Integer(
101                     READER_BUFFER_CAPACITY * 2));
102
103     /**
104      * Holds the configurable nominal length for the CharSequenceImpl stack.
105      */

106     private static final PersistentReference SEQ_SIZE = new PersistentReference(
107             "javolution.xml.pull.XmlPullParserImpl#SEQ_SIZE", new Integer(256));
108
109     /**
110      * Holds the parsing line.
111      */

112     private int _lineNumber;
113
114     /**
115      * Holds the column offset (column = _columnOffset + _index).
116      */

117     private int _columnOffset;
118
119     /**
120      * Holds the line length when line break occurs.
121      */

122     private int _lineLength;
123
124     /**
125      * Holds the current index in the character buffer.
126      */

127     private int _index;
128
129     /**
130      * Holds the data buffer for CharSequence produced by this parser.
131      */

132     private char[] _data = (char[]) new char[((Integer) DATA_SIZE.get())
133             .intValue()];
134
135     /**
136      * Holds the current length of the data buffer (_data).
137      */

138     private int _length;
139
140     /**
141      * Holds the current depth.
142      */

143     private int _depth;
144
145     /**
146      * Holds the namespace stack.
147      */

148     private final Namespaces _namespaces = new Namespaces();
149
150     /**
151      * Holds the current attributes (view over _attrPool.get(_depth)).
152      */

153     private AttributesImpl _attributes;
154
155     /**
156      * Holds a pool of AttributesImpl to avoid overwriting the current one.
157      */

158     private final FastTable _attrPool = new FastTable();
159
160     /**
161      * Holds working stack.
162      */

163     private final FastTable _elemStack = new FastTable();
164
165     /**
166      * Holds the character buffer used for reading.
167      */

168     private final char[] _chars = new char[READER_BUFFER_CAPACITY];
169
170     /**
171      * Holds the default stream reader (UTF-8).
172      */

173     private final Utf8StreamReader _inputStreamReader = new Utf8StreamReader(
174             READER_BUFFER_CAPACITY);
175
176     /**
177      * Holds the default ByteBuffer reader (UTF-8).
178      */

179     private final Utf8ByteBufferReader _byteBufferReader = new Utf8ByteBufferReader();
180
181     /**
182      * Number of characters read from reader
183      */

184     private int _charsRead;
185
186     /**
187      * Holds local name (i.e. does not include prefix, if any).
188      */

189     private CharSequenceImpl _elemLocalName;
190
191     /**
192      * Holds element namespace (lookup performed using prefix and namespace stack).
193      */

194     private CharSequenceImpl _elemNamespace;
195
196     /**
197      * Holds qualified name (include prefix).
198      */

199     private CharSequenceImpl _elemQName;
200
201     /**
202      * Holds prefix.
203      */

204     private CharSequenceImpl _elemPrefix;
205
206     /**
207      * Holds attribute qualified name.
208      */

209     private CharSequenceImpl _attrQName;
210
211     /**
212      * Holds attribute prefix.
213      */

214     private CharSequenceImpl _attrPrefix;
215
216     /**
217      * Holds attribute value.
218      */

219     private CharSequenceImpl _attrValue;
220
221     /**
222      * Holds character sequence when parsing numeric literal.
223      */

224     private final CharSequenceImpl _num = new CharSequenceImpl();
225
226     /**
227      * Holds current event type
228      */

229     private int _eventType = END_DOCUMENT;
230
231     /**
232      * Input encoding, if known
233      */

234     private String _inputEncoding;
235
236     /**
237      * Indicates if event type is START_TAG, and tag is empty, i.e. <sometag/>
238      */

239     private boolean _isEmpty;
240
241     /**
242      * Holds index of the last non-whitespace character (-1 = all whitespace).
243      */

244     private int _nonwhitespace;
245
246     /**
247      * Holds reader used to get data
248      */

249     private Reader _reader;
250
251     /**
252      * Holds start of escape sequence
253      */

254     private int _escStart;
255
256     /**
257      * Holds saved parser state when escape sequence encountered.
258      */

259     private int _savedState;
260
261     /**
262      * Holds the start of text within the _data array.
263      */

264     private int _start;
265
266     /**
267      * Holds the parser state.
268      */

269     private int _state = CHAR_DATA;
270
271     /**
272      * Holds the text associated with current event.
273      */

274     private CharSequenceImpl _text;
275
276     /**
277      * Indicates if text contains non whitespace characters (characters
278      * others than space, cr, lf, tab).
279      */

280     private boolean _hasNonWhitespace;
281
282     /**
283      * Holds character sequences instances.
284      */

285     private CharSequenceImpl[] _seqs = new CharSequenceImpl[((Integer) SEQ_SIZE
286             .get()).intValue()];
287
288     /**
289      * Holds character sequence index.
290      */

291     private int _seqsIndex;
292
293     /**
294      * Holds number of character sequence instances allocated.
295      */

296     private int _seqsCapacity;
297
298     /**
299      * Default constructor.
300      */

301     public XmlPullParserImpl() {
302         _attributes = new AttributesImpl();
303         _attrPool.addLast(_attributes);
304     }
305
306     /**
307      * Sets the byte buffer this parser is going to process
308      * (UTF-8 encoding).
309      *
310      * @param byteBuffer the byte buffer with UTF-8 encoding.
311      * @see Utf8ByteBufferReader
312      */

313     public void setInput(ByteBuffer byteBuffer) {
314         if (_reader != null)
315             throw new IllegalStateException("Parser not reset.");
316         _byteBufferReader.setByteBuffer(byteBuffer);
317         _inputEncoding = "UTF-8";
318         setInput(_byteBufferReader);
319     }
320
321     /**
322      * Sets the input stream this parser is going to process
323      * (UTF-8 encoding).
324      *
325      * @param in the input stream with UTF-8 encoding.
326      * @see Utf8StreamReader
327      */

328     public void setInput(InputStream in) {
329         if (_reader != null)
330             throw new IllegalStateException("Parser not reset.");
331         _inputStreamReader.setInputStream(in);
332         _inputEncoding = "UTF-8";
333         setInput(_inputStreamReader);
334     }
335
336     // Implements XmlPullParser interface.
337
public void setInput(InputStream inputStream, String inputEncoding)
338             throws XmlPullParserException {
339         if ((inputEncoding == null) || inputEncoding.equals("utf-8")
340                 || inputEncoding.equals("UTF-8")) {
341             setInput(inputStream);
342             return;
343         }
344         try {
345             _inputEncoding = inputEncoding;
346             setInput(new InputStreamReader(inputStream, inputEncoding));
347         } catch (UnsupportedEncodingException e) {
348             throw new XmlPullParserException(e.getMessage());
349         }
350     }
351
352     // Implements XmlPullParser interface.
353
public void setInput(Reader in) {
354         if (_reader != null)
355             throw new IllegalStateException("Parser not reset.");
356         _reader = in;
357         _eventType = START_DOCUMENT;
358     }
359
360     // Implements XmlPullParser interface.
361
public void defineEntityReplacementText(CharSequence entityName,
362             CharSequence replacementText) throws XmlPullParserException {
363     }
364
365     /**
366      * Returns SAX-2 like attributes for the current element.
367      *
368      * @return the attributes of the current element.
369      */

370     public Attributes getSaxAttributes() {
371         return _attributes;
372     }
373
374     // Implements XmlPullParser interface.
375
public int getAttributeCount() {
376         if (_eventType != START_TAG)
377             return -1;
378         return _attributes.getLength();
379     }
380
381     // Implements XmlPullParser interface.
382
public CharSequence getAttributeName(int index) {
383         return _attributes.getLocalName(index);
384     }
385
386     // Implements XmlPullParser interface.
387
public CharSequence getAttributeNamespace(int index) {
388         return _attributes.getURI(index);
389     }
390
391     // Implements XmlPullParser interface.
392
public CharSequence getAttributePrefix(int index) {
393         return _attributes.getPrefix(index);
394     }
395
396     // Implements XmlPullParser interface.
397
public String getAttributeType(int index) {
398         return _attributes.getType(index);
399     }
400
401     // Implements XmlPullParser interface.
402
public CharSequence getAttributeValue(CharSequence namespace,
403             CharSequence name) {
404         return _attributes.getValue(namespace, name);
405     }
406
407     // Implements XmlPullParser interface.
408
public CharSequence getAttributeValue(int index) {
409         return _attributes.getValue(index);
410     }
411
412     // Implements XmlPullParser interface.
413
public int getDepth() {
414         return _depth;
415     }
416
417     // Implements XmlPullParser interface.
418
public int getEventType() throws XmlPullParserException {
419         return _eventType;
420     }
421
422     // Implements XmlPullParser interface.
423
public boolean getFeature(CharSequence name) {
424         return false;
425     }
426
427     // Implements XmlPullParser interface.
428
public String getInputEncoding() {
429         return _inputEncoding;
430     }
431
432     // Implements XmlPullParser interface.
433
public int getLineNumber() {
434         int column = _columnOffset + _index;
435         return (column != 0) ? _lineNumber : _lineNumber - 1;
436     }
437
438     // Implements XmlPullParser interface.
439
public int getColumnNumber() {
440         int column = _columnOffset + _index;
441         return (column != 0) ? column : _lineLength;
442     }
443
444     // Implements XmlPullParser interface.
445
public CharSequence getName() {
446         return _elemLocalName;
447     }
448
449     // Implements XmlPullParser interface.
450
public CharSequence getNamespace() {
451         return _elemNamespace;
452     }
453
454     // Implements XmlPullParser interface.
455
public CharSequence getPrefix() {
456         return _elemPrefix;
457     }
458
459     /**
460      * Returns the current element qualified name.
461      *
462      * @return the qualified name of the current element (prefix:localName).
463      */

464     public CharSequence getQName() {
465         return _elemQName;
466     }
467
468     // Implements XmlPullParser interface.
469
public CharSequence getNamespace(CharSequence prefix) {
470         return _namespaces.getNamespaceUri(prefix);
471     }
472
473     // Implements XmlPullParser interface.
474
public int getNamespaceCount(int depth) {
475         return _namespaces.getNamespaceCount(depth);
476     }
477
478     // Implements XmlPullParser interface.
479
public CharSequence getNamespacePrefix(int pos) {
480         return _namespaces.getNamespacePrefix(pos);
481     }
482
483     // Implements XmlPullParser interface.
484
public CharSequence getNamespaceUri(int pos) {
485         return _namespaces.getNamespaceUri(pos);
486     }
487
488     // Implements XmlPullParser interface.
489
public CharSequence getPositionDescription() {
490         return Text.valueOf("line ").concat(Text.valueOf(getLineNumber()))
491                 .concat(Text.valueOf(", column ")).concat(
492                         Text.valueOf(getColumnNumber()));
493     }
494
495     // Implements XmlPullParser interface.
496
public Object getProperty(CharSequence name) {
497         return null;
498     }
499
500     // Implements XmlPullParser interface.
501
public CharSequence getText() {
502         if (_eventType == START_DOCUMENT || _eventType == END_DOCUMENT) {
503             return null;
504         }
505         return _text;
506     }
507
508     // Implements XmlPullParser interface.
509
public char[] getTextCharacters(int[] holderForStartAndLength) {
510         if (_eventType == START_DOCUMENT || _eventType == END_DOCUMENT) {
511             holderForStartAndLength[0] = holderForStartAndLength[1] = -1;
512             return null;
513         }
514         holderForStartAndLength[0] = _text.offset;
515         holderForStartAndLength[1] = _text.length;
516         return _text.data;
517     }
518
519     // Implements XmlPullParser interface.
520
public int indexOf(CharSequence namespace, CharSequence name) {
521         if (_eventType != START_TAG)
522             throw new IndexOutOfBoundsException();
523         return _attributes.getIndex(namespace, name);
524     }
525
526     // Implements XmlPullParser interface.
527
public boolean isAttribute(CharSequence namespace, CharSequence name) {
528         return indexOf(namespace, name) >= 0;
529     }
530
531     // Implements XmlPullParser interface.
532
public boolean isAttributeDefault(int index) {
533         return false;
534     }
535
536     // Implements XmlPullParser interface.
537
public boolean isEmptyElementTag() throws XmlPullParserException {
538         return _isEmpty;
539     }
540
541     // Implements XmlPullParser interface.
542
public boolean isWhitespace() throws XmlPullParserException {
543         if (_eventType == TEXT || _eventType == CDSECT)
544             return !_hasNonWhitespace;
545         throw new IllegalStateException();
546     }
547
548     // Implements XmlPullParser interface.
549
public int next() throws XmlPullParserException, IOException {
550         return (_eventType = parse(false));
551     }
552
553     // Implements XmlPullParser interface.
554
public int nextTag() throws XmlPullParserException, IOException {
555         int eventType = next();
556         if (eventType == TEXT && isWhitespace()) { // skip whitespace
557
eventType = next();
558         }
559         if (eventType != START_TAG && eventType != END_TAG) {
560             throw error("expected start or end tag");
561         }
562         return eventType;
563     }
564
565     // Implements XmlPullParser interface.
566
public CharSequence nextText() throws XmlPullParserException, IOException {
567         if (getEventType() != START_TAG)
568             throw error("parser must be on START_TAG to read next text");
569         int eventType = next();
570         if (eventType == TEXT) {
571             CharSequence result = getText();
572             eventType = next();
573             if (eventType != END_TAG)
574                 throw error("event TEXT must be immediately followed by END_TAG");
575
576             return result;
577         } else if (eventType == END_TAG) {
578             return CharSequenceImpl.EMPTY;
579         } else {
580             throw error("parser must be on START_TAG or TEXT to read text");
581         }
582     }
583
584     // Implements XmlPullParser interface.
585
public int nextToken() throws XmlPullParserException, IOException {
586         return (_eventType = parse(true));
587     }
588
589     // Implements XmlPullParser interface.
590
public void setFeature(String name, boolean state)
591             throws XmlPullParserException {
592     }
593
594     // Implements XmlPullParser interface.
595
public boolean getFeature(String name) {
596         return false;
597     }
598
599     // Implements XmlPullParser interface.
600
public void setProperty(String name, Object value)
601             throws XmlPullParserException {
602     }
603
604     // Implements XmlPullParser interface.
605
public Object getProperty(String name) {
606         return null;
607     }
608
609     // Implements XmlPullParser interface.
610
public void require(int type, CharSequence namespace, CharSequence name)
611             throws XmlPullParserException, IOException {
612         if (type != getEventType()
613                 || (namespace != null && !FastComparator.LEXICAL.areEqual(
614                         namespace, getNamespace()))
615                 || (name != null && !FastComparator.LEXICAL.areEqual(name,
616                         getName())))
617             throw error("Require " + TYPES[type] + " failed");
618     }
619
620     /**
621      * Parses the document.
622      *
623      * @param tokenize indicates if tokenization is performed.
624      * @return the event type.
625      */

626     private int parse(boolean tokenize) throws XmlPullParserException,
627             IOException {
628         switch (_eventType) { // Previous event.
629
case START_DOCUMENT:
630             _charsRead = _reader.read(_chars, 0, _chars.length);
631             break;
632         case END_DOCUMENT:
633             throw error("End of document reached.");
634         case START_TAG:
635             if (_isEmpty) { // Previous empty tag, generates END_TAG automatically.
636
_isEmpty = false;
637                 return END_TAG;
638             }
639             _elemPrefix = null;
640             _elemLocalName = null;
641             _elemNamespace = null;
642             _elemQName = null;
643             break;
644         case END_TAG:
645             _attributes.reset();
646             _depth--;
647             _attributes = (AttributesImpl) _attrPool.get(_depth);
648             _length = _elemQName.offset;
649             _start = _length;
650             while (_seqs[--_seqsIndex] != _elemQName) {
651             }
652             _elemPrefix = null;
653             _elemLocalName = null;
654             _elemNamespace = null;
655             _elemQName = null;
656             break;
657         default:
658             _text = null;
659             _hasNonWhitespace = false;
660         }
661
662         while (_index < _charsRead) {
663
664             // Preprocessing.
665
//
666
char c = _chars[_index];
667             if (++_index == _charsRead) { // Reloads buffer.
668
_columnOffset += _index;
669                 _index = 0;
670                 _charsRead = _reader.read(_chars, 0, _chars.length);
671                 while ((_length + _charsRead) >= _data.length) {
672                     // Potential overflow, resizes.
673
char[] tmp = new char[_data.length * 2];
674                     System.arraycopy(_data, 0, tmp, 0, _data.length);
675                     _data = tmp;
676                     DATA_SIZE.set(new Integer(_data.length));
677                 }
678             }
679             // Replaces #xD and #xD#xA with #xA as per XML 1.0
680
// recommendations (&2.11).
681
if (c < 0x20) {
682                 if (c == 0xD) { // Replaces #xD with #xA
683
if ((_index < _charsRead) && (_chars[_index] == 0xA)) {
684                         // Unless next char is #xA, then continue,
685
// #xD#xA will be replaced by #xA
686
continue;
687                     }
688                     c = 0xA;
689                 }
690                 if (c == 0xA) {
691                     _lineNumber++; // Do it now, locator will readjust.
692
_lineLength = _columnOffset + _index;
693                     _columnOffset = -_index; // column = 0
694
} else if (c != 0x9) { // Not a tab.
695
throw error("Illegal XML character U+"
696                             + Integer.toHexString(c));
697                 }
698             }
699             // Appends to buffer.
700
_data[_length++] = c;
701             // Detects escape sequence (e.g. character reference).
702
if ((c == '&') && (_state != STATE_COMMENT) && (_state != PI)
703                     && (_state != CDATA) && (_state != ESCAPE)) { // (&2.4)
704
_savedState = _state;
705                 _escStart = _length;
706                 _state = ESCAPE;
707             }
708
709             // Main processing.
710
//
711
switch (_state) {
712             case CHAR_DATA:
713                 if (c == '<') {
714                     _state = MARKUP;
715                     if (_hasNonWhitespace) {
716                         int nbrChar = _length - _start - 1;
717                         setText(_start, nbrChar);
718                         _length = _start; // Do not keep character data.
719
return TEXT;
720                     }
721                     _length = _start; // Do not keep character data.
722
} else if (!_hasNonWhitespace && c > ' ') {
723                     _hasNonWhitespace = true;
724                 }
725                 break;
726
727             case MARKUP:
728                 if (_length - _start == 1) {
729                     if (c == '/') {
730                         _state = CLOSE_TAG + READ_ELEM_NAME;
731                         _length = _start;
732                         _elemQName = newSeq();
733                         _elemQName.offset = _start;
734                     } else if (c == '?') {
735                         _state = PI;
736                         _length = _start;
737                     } else if (c != '!') {
738                         _state = OPEN_TAG + READ_ELEM_NAME;
739                         // Sets the attributes for the current depth.
740
if (_depth >= _attrPool.size()) {
741                             _attrPool.addLast(ATTRIBUTES_IMPL_FACTORY.newObject());
742                         }
743                         _attributes = (AttributesImpl) _attrPool.get(_depth);
744                         _elemQName = newSeq();
745                         _elemQName.offset = _start;
746                     }
747                 } else if ((_length - _start == 3) && (_data[_start] == '!')
748                         && (_data[_start + 1] == '-')
749                         && (_data[_start + 2] == '-')) {
750                     _state = STATE_COMMENT;
751                     _length = _start;
752                 } else if ((_length - _start == 8) && (_data[_start] == '!')
753                         && (_data[_start + 1] == '[')
754                         && (_data[_start + 2] == 'C')
755                         && (_data[_start + 3] == 'D')
756                         && (_data[_start + 4] == 'A')
757                         && (_data[_start + 5] == 'T')
758                         && (_data[_start + 6] == 'A')
759                         && (_data[_start + 7] == '[')) {
760                     _state = CDATA;
761                     _nonwhitespace = -1;
762                     _length = _start;
763                 } else if (c == '>') {
764                     _state = CHAR_DATA;
765                     _length = _start;
766                 }
767                 break;
768
769             case STATE_COMMENT:
770                 if ((c == '>') && (_length - _start >= 3)
771                         && (_data[_length - 2] == '-')
772                         && (_data[_length - 3] == '-')) {
773                     _state = CHAR_DATA;
774                     int nbrChar = _length - _start - 3;
775                     _length = _start; // Do not keep comment.
776
if (tokenize && nbrChar > 0) {
777                         setText(_start, nbrChar);
778                         return COMMENT;
779                     }
780                 }
781                 break;
782
783             case PI: // Ignores Processing Instructions.
784
if ((c == '>') && (_length - _start >= 2)
785                         && (_data[_length - 2] == '?')) {
786                     _state = CHAR_DATA;
787                     int nbrChar = _length - _start - 2;
788                     _length = _start; // Do not keep Processing Instructions.
789
if (tokenize && nbrChar > 0) {
790                         setText(_start, nbrChar);
791                         return PROCESSING_INSTRUCTION;
792                     }
793                 }
794                 break;
795
796             case CDATA:
797                 if ((c == '>') && (_length - _start >= 3)
798                         && (_data[_length - 2] == ']')
799                         && (_data[_length - 3] == ']')) {
800                     _state = CHAR_DATA;
801                     int nbrChar = _length - _start - 3;
802                     _hasNonWhitespace = !(_nonwhitespace == _start + nbrChar
803                             + 1);
804                     setText(_start, nbrChar);
805                     _length = _start; // Do not keep CDATA
806
return CDSECT;
807                 }
808                 if ((_nonwhitespace == -1) && (c > ' '))
809                     _nonwhitespace = _length;
810                 break;
811
812             // OPEN_TAG:
813
case OPEN_TAG + READ_ELEM_NAME:
814                 if (c == '>') {
815                     _elemQName.length = _length - _elemQName.offset - 1;
816                     _elemQName.data = _data;
817                     _state = CHAR_DATA;
818                     _start = _length;
819                     return processElement(OPEN_TAG);
820                 } else if (c == '/') {
821                     _elemQName.length = _length - _elemQName.offset - 1;
822                     _elemQName.data = _data;
823                     _state = OPEN_TAG + EMPTY_TAG;
824                 } else if ((c == ':') && (_elemPrefix == null)) {
825                     _elemPrefix = newSeq();
826                     _elemPrefix.offset = _elemQName.offset;
827                     _elemPrefix.length = _length - _elemQName.offset - 1;
828                     _elemPrefix.data = _data;
829                 } else if (c <= ' ') {
830                     _elemQName.length = _length - _elemQName.offset - 1;
831                     _elemQName.data = _data;
832                     _state = OPEN_TAG + ELEM_NAME_READ;
833                 }
834                 break;
835             case OPEN_TAG + ELEM_NAME_READ:
836                 if (c == '>') {
837                     _state = CHAR_DATA;
838                     _start = _length;
839                     return processElement(OPEN_TAG);
840                 } else if (c == '/') {
841                     _state = OPEN_TAG + EMPTY_TAG;
842                 } else if (c > ' ') {
843                     _attrQName = newSeq();
844                     _attrQName.offset = _length - 1;
845                     _state = OPEN_TAG + READ_ATTR_NAME;
846                 }
847                 break;
848             case OPEN_TAG + READ_ATTR_NAME:
849                 if (c <= ' ') {
850                     _attrQName.length = _length - _attrQName.offset - 1;
851                     _attrQName.data = _data;
852                     _state = OPEN_TAG + ATTR_NAME_READ;
853                 } else if (c == '=') {
854                     _attrQName.length = _length - _attrQName.offset - 1;
855                     _attrQName.data = _data;
856                     _state = OPEN_TAG + EQUAL_READ;
857                 } else if ((c == ':') && (_attrPrefix == null)) {
858                     _attrPrefix = newSeq();
859                     _attrPrefix.offset = _attrQName.offset;
860                     _attrPrefix.length = _length - _attrQName.offset - 1;
861                     _attrPrefix.data = _data;
862                 }
863                 break;
864             case OPEN_TAG + ATTR_NAME_READ:
865                 if (c == '=') {
866                     _state = OPEN_TAG + EQUAL_READ;
867                 } else if (c > ' ') {
868                     throw error("'=' expected");
869                 }
870  &nb