XmlPullParserImpl


1   /*
2    * Javolution - Java(TM) Solution for Real-Time and Embedded Systems
3    * Copyright (C) 2005 - Javolution (http://javolution.org/)
4    * All rights reserved.
5    * 
6    * Permission to use, copy, modify, and distribute this software is
7    * freely granted, provided that this notice is preserved.
8    */
9   package javolution.xml.pull;
10  
11  import java.io.IOException;
12  import java.io.InputStream;
13  import java.io.InputStreamReader;
14  import java.io.Reader;
15  import java.io.UnsupportedEncodingException;
16  
17  import j2me.lang.CharSequence;
18  import j2me.lang.IllegalStateException;
19  import j2me.nio.ByteBuffer;
20  
21  import javolution.io.Utf8ByteBufferReader;
22  import javolution.io.Utf8StreamReader;
23  import javolution.lang.PersistentReference;
24  import javolution.lang.Reusable;
25  import javolution.lang.Text;
26  import javolution.lang.TypeFormat;
27  import javolution.realtime.ObjectFactory;
28  import javolution.util.FastComparator;
29  import javolution.util.FastTable;
30  import javolution.xml.sax.Attributes;
31  import javolution.xml.sax.AttributesImpl;
32  
33  /**
34   * <p> This class provides a real-time XPP-like XML parser; this parser is
35   *     <i>extremely</i> fast and <b>does not create temporary objects</b>
36   *     (no garbage generated and no GC interruption).</p>
37   *     
38   * <p> The parser input source can be either a {@link #setInput(Reader) Reader},
39   *     an {@link #setInput(InputStream) InputStream} or even a {@link 
40   *     #setInput(ByteBuffer) ByteBuffer} (e.g. <code>MappedByteBuffer</code>).</p>
41   *     
42   * <p> This parser is light (less than 15Kbytes compressed) and maintains
43   *     a very small memory footprint while parsing (e.g. less than 10Kbytes
44   *     while parsing 32Mbytes files). Typical applications include SOAP
45   *     messaging, embedded/realtime systems, web servers (possibly thousands
46   *     instances running concurrently), etc.</p>
47   *     
48   * <p> Namespaces (SAX2 feature), comments, predefined entities
49   *     (<code>&amp;amp;, &amp;lt;, &amp;gt;, &amp;apos;, &amp;quot;</code>)
50   *     numeric character references (e.g. <code>&amp;#10;</code> for
51   *     linefeed) and <code>CDATA</code> are recognized. Processing instructions,
52   *     comments and entity declarations are ignored.</p>
53   *     
54   * <p> The {@link CharSequence CharSequence} generated by this parser have
55   *     the following characteristics:
56   *     <ul>
57   *         <li> They are immutable within their definition scope. The <code>
58   *              CharSequence</code> created while parsing an XML element are
59   *              reused only after the element is out-of-scope (idem for the 
60   *              {@link #getSaxAttributes attributes lists}).</li>
61   *         <li> They support equality or lexical comparison with any
62   *              <code>CharSequence</code> (e.g. <code>String</code>).</li>
63   *         <li> They have the same hashcode as <code>String</code> and can be
64   *              used to retrieve data from a <code>Map</code> (e.g.
65   *              {@link javolution.util.FastMap FastMap}) for which 
66   *              the keys are <code>String</code> instances.</li>
67   *         <li> Like any <code>CharSequence</code>, they can be parsed
68   *              to primitive types (e.g. int, double) using the utility class
69   *              {@link TypeFormat}.</li>
70   *     </ul></p>
71   *     
72   * <p> Finally, this parser does not break up character data during call back
73   *     (the whole character data between markups is always being returned).</p>
74   *
75   * @author  <a HREF="mailto:javolution@arakelian.com">Gregory Arakelian</a>
76   * @author  <a HREF="mailto:jean-marie@dautelle.com">Jean-Marie Dautelle</a>
77   * @version 3.3, May 10, 2005
78   */
79  public final class XmlPullParserImpl implements XmlPullParser, Reusable {
80  
81      /**
82       * Holds a factory producing AttributesImpl instances.
83       */
84      private static final ObjectFactory ATTRIBUTES_IMPL_FACTORY = new ObjectFactory() {
85          protected Object create() {
86              return new AttributesImpl();
87          }
88      };
89  
90      /**
91       * Holds the reader buffer capacity.
92       */
93      private static final int READER_BUFFER_CAPACITY = 2048;
94  
95      /**
96       * Holds the configurable nominal length for the data array length (must be 
97       * larger than the reader buffer capacity to avoid overflow).
98       */
99      private static final PersistentReference DATA_SIZE = new PersistentReference(
100             "javolution.xml.pull.XmlPullParserImpl#DATA_SIZE", new Integer(
101                     READER_BUFFER_CAPACITY * 2));
102 
103     /**
104      * Holds the configurable nominal length for the CharSequenceImpl stack.
105      */
106     private static final PersistentReference SEQ_SIZE = new PersistentReference(
107             "javolution.xml.pull.XmlPullParserImpl#SEQ_SIZE", new Integer(256));
108 
109     /**
110      * Holds the parsing line.
111      */
112     private int _lineNumber;
113 
114     /**
115      * Holds the column offset (column = _columnOffset + _index).
116      */
117     private int _columnOffset;
118 
119     /**
120      * Holds the line length when line break occurs.
121      */
122     private int _lineLength;
123 
124     /**
125      * Holds the current index in the character buffer.
126      */
127     private int _index;
128 
129     /**
130      * Holds the data buffer for CharSequence produced by this parser.
131      */
132     private char[] _data = (char[]) new char[((Integer) DATA_SIZE.get())
133             .intValue()];
134 
135     /**
136      * Holds the current length of the data buffer (_data).
137      */
138     private int _length;
139 
140     /**
141      * Holds the current depth.
142      */
143     private int _depth;
144 
145     /**
146      * Holds the namespace  stack.
147      */
148     private final Namespaces _namespaces = new Namespaces();
149 
150     /**
151      * Holds the current attributes (view over _attrPool.get(_depth)).
152      */
153     private AttributesImpl _attributes;
154 
155     /**
156      * Holds a pool of AttributesImpl to avoid overwriting the current one.
157      */
158     private final FastTable _attrPool = new FastTable();
159 
160     /**
161      * Holds working stack.
162      */
163     private final FastTable _elemStack = new FastTable();
164 
165     /**
166      * Holds the character buffer used for reading.
167      */
168     private final char[] _chars = new char[READER_BUFFER_CAPACITY];
169 
170     /**
171      * Holds the default stream reader (UTF-8).
172      */
173     private final Utf8StreamReader _inputStreamReader = new Utf8StreamReader(
174             READER_BUFFER_CAPACITY);
175 
176     /**
177      * Holds the default ByteBuffer reader (UTF-8).
178      */
179     private final Utf8ByteBufferReader _byteBufferReader = new Utf8ByteBufferReader();
180 
181     /**
182      * Number of characters read from reader
183      */
184     private int _charsRead;
185 
186     /**
187      * Holds local name (i.e. does not include prefix, if any).
188      */
189     private CharSequenceImpl _elemLocalName;
190 
191     /**
192      * Holds element namespace (lookup performed using prefix and namespace stack).
193      */
194     private CharSequenceImpl _elemNamespace;
195 
196     /**
197      * Holds qualified name (include prefix).
198      */
199     private CharSequenceImpl _elemQName;
200 
201     /**
202      * Holds prefix.
203      */
204     private CharSequenceImpl _elemPrefix;
205 
206     /**
207      * Holds attribute qualified name.
208      */
209     private CharSequenceImpl _attrQName;
210 
211     /**
212      * Holds attribute prefix.
213      */
214     private CharSequenceImpl _attrPrefix;
215 
216     /**
217      * Holds attribute value.
218      */
219     private CharSequenceImpl _attrValue;
220 
221     /**
222      * Holds character sequence when parsing numeric literal.
223      */
224     private final CharSequenceImpl _num = new CharSequenceImpl();
225 
226     /**
227      * Holds current event type
228      */
229     private int _eventType = END_DOCUMENT;
230 
231     /**
232      * Input encoding, if known
233      */
234     private String _inputEncoding;
235 
236     /**
237      * Indicates if event type is START_TAG, and tag is empty, i.e. <sometag/>
238      */
239     private boolean _isEmpty;
240 
241     /**
242      * Holds index of the last non-whitespace character (-1 = all whitespace).
243      */
244     private int _nonwhitespace;
245 
246     /**
247      * Holds reader used to get data
248      */
249     private Reader _reader;
250 
251     /**
252      * Holds start of escape sequence
253      */
254     private int _escStart;
255 
256     /**
257      * Holds saved parser state when escape sequence encountered.
258      */
259     private int _savedState;
260 
261     /**
262      * Holds the start of text within the _data array.
263      */
264     private int _start;
265 
266     /**
267      * Holds the parser state.
268      */
269     private int _state = CHAR_DATA;
270 
271     /**
272      * Holds the text associated with current event.
273      */
274     private CharSequenceImpl _text;
275 
276     /**
277      * Indicates if text contains non whitespace characters (characters 
278      * others than space, cr, lf, tab).
279      */
280     private boolean _hasNonWhitespace;
281 
282     /**
283      * Holds character sequences instances.
284      */
285     private CharSequenceImpl[] _seqs = new CharSequenceImpl[((Integer) SEQ_SIZE
286             .get()).intValue()];
287 
288     /**
289      * Holds character sequence index. 
290      */
291     private int _seqsIndex;
292 
293     /**
294      * Holds number of character sequence instances allocated. 
295      */
296     private int _seqsCapacity;
297 
298     /**
299      * Default constructor.
300      */
301     public XmlPullParserImpl() {
302         _attributes = new AttributesImpl();
303         _attrPool.addLast(_attributes);
304     }
305 
306     /**
307      * Sets the byte buffer this parser is going to process
308      * (UTF-8 encoding).
309      *
310      * @param  byteBuffer the byte buffer with UTF-8 encoding.
311      * @see    Utf8ByteBufferReader
312      */
313     public void setInput(ByteBuffer byteBuffer) {
314         if (_reader != null)
315             throw new IllegalStateException("Parser not reset.");
316         _byteBufferReader.setByteBuffer(byteBuffer);
317         _inputEncoding = "UTF-8";
318         setInput(_byteBufferReader);
319     }
320 
321     /**
322      * Sets the input stream this parser is going to process
323      * (UTF-8 encoding).
324      *
325      * @param in the input stream with UTF-8 encoding.
326      * @see    Utf8StreamReader
327      */
328     public void setInput(InputStream in) {
329         if (_reader != null)
330             throw new IllegalStateException("Parser not reset.");
331         _inputStreamReader.setInputStream(in);
332         _inputEncoding = "UTF-8";
333         setInput(_inputStreamReader);
334     }
335 
336     // Implements XmlPullParser interface.
337     public void setInput(InputStream inputStream, String inputEncoding)
338             throws XmlPullParserException {
339         if ((inputEncoding == null) || inputEncoding.equals("utf-8")
340                 || inputEncoding.equals("UTF-8")) {
341             setInput(inputStream);
342             return;
343         }
344         try {
345             _inputEncoding = inputEncoding;
346             setInput(new InputStreamReader(inputStream, inputEncoding));
347         } catch (UnsupportedEncodingException e) {
348             throw new XmlPullParserException(e.getMessage());
349         }
350     }
351 
352     // Implements XmlPullParser interface.
353     public void setInput(Reader in) {
354         if (_reader != null)
355             throw new IllegalStateException("Parser not reset.");
356         _reader = in;
357         _eventType = START_DOCUMENT;
358     }
359 
360     // Implements XmlPullParser interface. The body is empty: both arguments are ignored and no replacement text is recorded.
361     public void defineEntityReplacementText(CharSequence entityName,
362             CharSequence replacementText) throws XmlPullParserException {
363     }
364 
365     /**
366      * Returns SAX-2 like attributes for the current element.
367      * The parser's own <code>_attributes</code> instance is returned (no copy is made).
368      * @return the attributes of the current element.
369      */
370     public Attributes getSaxAttributes() {
371         return _attributes;
372     }
373 
374     // Implements XmlPullParser interface.
375     public int getAttributeCount() {
376         if (_eventType != START_TAG)
377             return -1;
378         return _attributes.getLength();
379     }
380 
381     // Implements XmlPullParser interface.
382     public CharSequence getAttributeName(int index) {
383         return _attributes.getLocalName(index);
384     }
385 
386     // Implements XmlPullParser interface.
387     public CharSequence getAttributeNamespace(int index) {
388         return _attributes.getURI(index);
389     }
390 
391     // Implements XmlPullParser interface.
392     public CharSequence getAttributePrefix(int index) {
393         return _attributes.getPrefix(index);
394     }
395 
396     // Implements XmlPullParser interface.
397     public String getAttributeType(int index) {
398         return _attributes.getType(index);
399     }
400 
401     // Implements XmlPullParser interface.
402     public CharSequence getAttributeValue(CharSequence namespace,
403             CharSequence name) {
404         return _attributes.getValue(namespace, name);
405     }
406 
407     // Implements XmlPullParser interface.
408     public CharSequence getAttributeValue(int index) {
409         return _attributes.getValue(index);
410     }
411 
412     // Implements XmlPullParser interface: returns the current value of _depth.
413     public int getDepth() {
414         return _depth;
415     }
416 
417     // Implements XmlPullParser interface: returns _eventType, which next() and nextToken() assign from parse().
418     public int getEventType() throws XmlPullParserException {
419         return _eventType;
420     }
421 
422     // Implements XmlPullParser interface: the name is ignored and false is returned for every feature.
423     public boolean getFeature(CharSequence name) {
424         return false;
425     }
426 
427     // Implements XmlPullParser interface: returns _inputEncoding as recorded by the setInput overloads ("UTF-8" or the caller-supplied name); setInput(Reader) does not assign it.
428     public String getInputEncoding() {
429         return _inputEncoding;
430     }
431 
432     // Implements XmlPullParser interface.
433     public int getLineNumber() {
434         int column = _columnOffset + _index;
435         return (column != 0) ? _lineNumber : _lineNumber - 1;
436     }
437 
438     // Implements XmlPullParser interface.
439     public int getColumnNumber() {
440         int column = _columnOffset + _index;
441         return (column != 0) ? column : _lineLength;
442     }
443 
444     // Implements XmlPullParser interface: returns _elemLocalName (parse() resets that field to null when moving on from a tag event).
445     public CharSequence getName() {
446         return _elemLocalName;
447     }
448 
449     // Implements XmlPullParser interface: returns _elemNamespace (parse() resets that field to null when moving on from a tag event).
450     public CharSequence getNamespace() {
451         return _elemNamespace;
452     }
453 
454     // Implements XmlPullParser interface: returns _elemPrefix, which stays null unless a ':' was read in the element name.
455     public CharSequence getPrefix() {
456         return _elemPrefix;
457     }
458 
459     /**
460      * Returns the current element qualified name.
461      * The backing field is reset to null when parsing moves on from an END_TAG or a non-empty START_TAG event.
462      * @return the qualified name of the current element (prefix:localName).
463      */
464     public CharSequence getQName() {
465         return _elemQName;
466     }
467 
468     // Implements XmlPullParser interface.
469     public CharSequence getNamespace(CharSequence prefix) {
470         return _namespaces.getNamespaceUri(prefix);
471     }
472 
473     // Implements XmlPullParser interface.
474     public int getNamespaceCount(int depth) {
475         return _namespaces.getNamespaceCount(depth);
476     }
477 
478     // Implements XmlPullParser interface.
479     public CharSequence getNamespacePrefix(int pos) {
480         return _namespaces.getNamespacePrefix(pos);
481     }
482 
483     // Implements XmlPullParser interface.
484     public CharSequence getNamespaceUri(int pos) {
485         return _namespaces.getNamespaceUri(pos);
486     }
487 
488     // Implements XmlPullParser interface.
489     public CharSequence getPositionDescription() {
490         return Text.valueOf("line ").concat(Text.valueOf(getLineNumber()))
491                 .concat(Text.valueOf(", column ")).concat(
492                         Text.valueOf(getColumnNumber()));
493     }
494 
495     // Implements XmlPullParser interface: the name is ignored and null is returned for every property.
496     public Object getProperty(CharSequence name) {
497         return null;
498     }
499 
500     // Implements XmlPullParser interface.
501     public CharSequence getText() {
502         if (_eventType == START_DOCUMENT || _eventType == END_DOCUMENT) {
503             return null;
504         }
505         return _text;
506     }
507 
508     // Implements XmlPullParser interface.
509     public char[] getTextCharacters(int[] holderForStartAndLength) {
510         if (_eventType == START_DOCUMENT || _eventType == END_DOCUMENT) {
511             holderForStartAndLength[0] = holderForStartAndLength[1] = -1;
512             return null;
513         }
514         holderForStartAndLength[0] = _text.offset;
515         holderForStartAndLength[1] = _text.length;
516         return _text.data;
517     }
518 
519     // Implements XmlPullParser interface.
520     public int indexOf(CharSequence namespace, CharSequence name) {
521         if (_eventType != START_TAG)
522             throw new IndexOutOfBoundsException();
523         return _attributes.getIndex(namespace, name);
524     }
525 
526     // Implements XmlPullParser interface.
527     public boolean isAttribute(CharSequence namespace, CharSequence name) {
528         return indexOf(namespace, name) >= 0;
529     }
530 
531     // Implements XmlPullParser interface: the index is ignored and false is always returned.
532     public boolean isAttributeDefault(int index) {
533         return false;
534     }
535 
536     // Implements XmlPullParser interface: returns the _isEmpty flag without checking the current event type.
537     public boolean isEmptyElementTag() throws XmlPullParserException {
538         return _isEmpty;
539     }
540 
541     // Implements XmlPullParser interface.
542     public boolean isWhitespace() throws XmlPullParserException {
543         if (_eventType == TEXT || _eventType == CDSECT)
544             return !_hasNonWhitespace;
545         throw new IllegalStateException();
546     }
547 
548     // Implements XmlPullParser interface.
549     public int next() throws XmlPullParserException, IOException {
550         return (_eventType = parse(false));
551     }
552 
553     // Implements XmlPullParser interface.
554     public int nextTag() throws XmlPullParserException, IOException {
555         int eventType = next();
556         if (eventType == TEXT && isWhitespace()) { // skip whitespace
557             eventType = next();
558         }
559         if (eventType != START_TAG && eventType != END_TAG) {
560             throw error("expected start or end tag");
561         }
562         return eventType;
563     }
564 
565     // Implements XmlPullParser interface.
566     public CharSequence nextText() throws XmlPullParserException, IOException {
567         if (getEventType() != START_TAG)
568             throw error("parser must be on START_TAG to read next text");
569         int eventType = next();
570         if (eventType == TEXT) {
571             CharSequence result = getText();
572             eventType = next();
573             if (eventType != END_TAG)
574                 throw error("event TEXT must be immediately followed by END_TAG");
575 
576             return result;
577         } else if (eventType == END_TAG) {
578             return CharSequenceImpl.EMPTY;
579         } else {
580             throw error("parser must be on START_TAG or TEXT to read text");
581         }
582     }
583 
584     // Implements XmlPullParser interface.
585     public int nextToken() throws XmlPullParserException, IOException {
586         return (_eventType = parse(true));
587     }
588 
589     // Implements XmlPullParser interface. The body is empty: the request is ignored and nothing is thrown.
590     public void setFeature(String name, boolean state)
591             throws XmlPullParserException {
592     }
593 
594     // Implements XmlPullParser interface: the name is ignored and false is returned for every feature.
595     public boolean getFeature(String name) {
596         return false;
597     }
598 
599     // Implements XmlPullParser interface. The body is empty: the name and value are ignored and nothing is thrown.
600     public void setProperty(String name, Object value)
601             throws XmlPullParserException {
602     }
603 
604     // Implements XmlPullParser interface: the name is ignored and null is returned for every property.
605     public Object getProperty(String name) {
606         return null;
607     }
608 
609     // Implements XmlPullParser interface.
610     public void require(int type, CharSequence namespace, CharSequence name)
611             throws XmlPullParserException, IOException {
612         if (type != getEventType()
613                 || (namespace != null && !FastComparator.LEXICAL.areEqual(
614                         namespace, getNamespace()))
615                 || (name != null && !FastComparator.LEXICAL.areEqual(name,
616                         getName())))
617             throw error("Require " + TYPES[type] + " failed");
618     }
619 
620     /**
621      * Parses the document.
622      * 
623      * @param tokenize indicates if tokenization is performed.
624      * @return the event type.
625      */
626     private int parse(boolean tokenize) throws XmlPullParserException,
627             IOException {
628         switch (_eventType) { // Previous event.
629         case START_DOCUMENT:
630             _charsRead = _reader.read(_chars, 0, _chars.length);
631             break;
632         case END_DOCUMENT:
633             throw error("End of document reached.");
634         case START_TAG:
635             if (_isEmpty) { // Previous empty tag, generates END_TAG automatically.
636                 _isEmpty = false;
637                 return END_TAG;
638             }
639             _elemPrefix = null;
640             _elemLocalName = null;
641             _elemNamespace = null;
642             _elemQName = null;
643             break;
644         case END_TAG:
645             _attributes.reset();
646             _depth--;
647             _attributes = (AttributesImpl) _attrPool.get(_depth);
648             _length = _elemQName.offset;
649             _start = _length;
650             while (_seqs[--_seqsIndex] != _elemQName) {
651             }
652             _elemPrefix = null;
653             _elemLocalName = null;
654             _elemNamespace = null;
655             _elemQName = null;
656             break;
657         default:
658             _text = null;
659             _hasNonWhitespace = false;
660         }
661 
662         while (_index < _charsRead) {
663 
664             // Preprocessing.
665             //
666             char c = _chars[_index];
667             if (++_index == _charsRead) { // Reloads buffer.
668                 _columnOffset += _index;
669                 _index = 0;
670                 _charsRead = _reader.read(_chars, 0, _chars.length);
671                 while ((_length + _charsRead) >= _data.length) {
672                     // Potential overflow, resizes.
673                     char[] tmp = new char[_data.length * 2];
674                     System.arraycopy(_data, 0, tmp, 0, _data.length);
675                     _data = tmp;
676                     DATA_SIZE.set(new Integer(_data.length));
677                 }
678             }
679             // Replaces #xD and #xD#xA with #xA as per XML 1.0
680             // recommendations (&2.11).
681             if (c < 0x20) {
682                 if (c == 0xD) { // Replaces #xD with #xA
683                     if ((_index < _charsRead) && (_chars[_index] == 0xA)) {
684                         // Unless next char is #xA, then continue,
685                         // #xD#xA will be replaced by #xA
686                         continue;
687                     }
688                     c = 0xA;
689                 }
690                 if (c == 0xA) {
691                     _lineNumber++; // Do it now, locator will readjust.
692                     _lineLength = _columnOffset + _index;
693                     _columnOffset = -_index; // column = 0
694                 } else if (c != 0x9) { // Not a tab.
695                     throw error("Illegal XML character U+"
696                             + Integer.toHexString(c));
697                 }
698             }
699             // Appends to buffer.
700             _data[_length++] = c;
701             // Detects escape sequence (e.g. character reference).
702             if ((c == '&') && (_state != STATE_COMMENT) && (_state != PI)
703                     && (_state != CDATA) && (_state != ESCAPE)) { // (&2.4)
704                 _savedState = _state;
705                 _escStart = _length;
706                 _state = ESCAPE;
707             }
708 
709             // Main processing.
710             //
711             switch (_state) {
712             case CHAR_DATA:
713                 if (c == '<') {
714                     _state = MARKUP;
715                     if (_hasNonWhitespace) {
716                         int nbrChar = _length - _start - 1;
717                         setText(_start, nbrChar);
718                         _length = _start; // Do not keep character data.
719                         return TEXT;
720                     }
721                     _length = _start; // Do not keep character data.
722                 } else if (!_hasNonWhitespace && c > ' ') {
723                     _hasNonWhitespace = true;
724                 }
725                 break;
726 
727             case MARKUP:
728                 if (_length - _start == 1) {
729                     if (c == '/') {
730                         _state = CLOSE_TAG + READ_ELEM_NAME;
731                         _length = _start;
732                         _elemQName = newSeq();
733                         _elemQName.offset = _start;
734                     } else if (c == '?') {
735                         _state = PI;
736                         _length = _start;
737                     } else if (c != '!') {
738                         _state = OPEN_TAG + READ_ELEM_NAME;
739                         // Sets the attributes for the current depth.
740                         if (_depth >= _attrPool.size()) {
741                             _attrPool.addLast(ATTRIBUTES_IMPL_FACTORY.newObject());
742                         }
743                         _attributes = (AttributesImpl) _attrPool.get(_depth);
744                         _elemQName = newSeq();
745                         _elemQName.offset = _start;
746                     }
747                 } else if ((_length - _start == 3) && (_data[_start] == '!')
748                         && (_data[_start + 1] == '-')
749                         && (_data[_start + 2] == '-')) {
750                     _state = STATE_COMMENT;
751                     _length = _start;
752                 } else if ((_length - _start == 8) && (_data[_start] == '!')
753                         && (_data[_start + 1] == '[')
754                         && (_data[_start + 2] == 'C')
755                         && (_data[_start + 3] == 'D')
756                         && (_data[_start + 4] == 'A')
757                         && (_data[_start + 5] == 'T')
758                         && (_data[_start + 6] == 'A')
759                         && (_data[_start + 7] == '[')) {
760                     _state = CDATA;
761                     _nonwhitespace = -1;
762                     _length = _start;
763                 } else if (c == '>') {
764                     _state = CHAR_DATA;
765                     _length = _start;
766                 }
767                 break;
768 
769             case STATE_COMMENT:
770                 if ((c == '>') && (_length - _start >= 3)
771                         && (_data[_length - 2] == '-')
772                         && (_data[_length - 3] == '-')) {
773                     _state = CHAR_DATA;
774                     int nbrChar = _length - _start - 3;
775                     _length = _start; // Do not keep comment.
776                     if (tokenize && nbrChar > 0) {
777                         setText(_start, nbrChar);
778                         return COMMENT;
779                     }
780                 }
781                 break;
782 
783             case PI: // Ignores Processing Instructions.
784                 if ((c == '>') && (_length - _start >= 2)
785                         && (_data[_length - 2] == '?')) {
786                     _state = CHAR_DATA;
787                     int nbrChar = _length - _start - 2;
788                     _length = _start; // Do not keep Processing Instructions.
789                     if (tokenize && nbrChar > 0) {
790                         setText(_start, nbrChar);
791                         return PROCESSING_INSTRUCTION;
792                     }
793                 }
794                 break;
795 
796             case CDATA:
797                 if ((c == '>') && (_length - _start >= 3)
798                         && (_data[_length - 2] == ']')
799                         && (_data[_length - 3] == ']')) {
800                     _state = CHAR_DATA;
801                     int nbrChar = _length - _start - 3;
802                     _hasNonWhitespace = !(_nonwhitespace == _start + nbrChar
803                             + 1);
804                     setText(_start, nbrChar);
805                     _length = _start; // Do not keep CDATA
806                     return CDSECT;
807                 }
808                 if ((_nonwhitespace == -1) && (c > ' '))
809                     _nonwhitespace = _length;
810                 break;
811 
812             // OPEN_TAG:
813             case OPEN_TAG + READ_ELEM_NAME:
814                 if (c == '>') {
815                     _elemQName.length = _length - _elemQName.offset - 1;
816                     _elemQName.data = _data;
817                     _state = CHAR_DATA;
818                     _start = _length;
819                     return processElement(OPEN_TAG);
820                 } else if (c == '/') {
821                     _elemQName.length = _length - _elemQName.offset - 1;
822                     _elemQName.data = _data;
823                     _state = OPEN_TAG + EMPTY_TAG;
824                 } else if ((c == ':') && (_elemPrefix == null)) {
825                     _elemPrefix = newSeq();
826                     _elemPrefix.offset = _elemQName.offset;
827                     _elemPrefix.length = _length - _elemQName.offset - 1;
828                     _elemPrefix.data = _data;
829                 } else if (c <= ' ') {
830                     _elemQName.length = _length - _elemQName.offset - 1;
831                     _elemQName.data = _data;
832                     _state = OPEN_TAG + ELEM_NAME_READ;
833                 }
834                 break;
835             case OPEN_TAG + ELEM_NAME_READ:
836                 if (c == '>') {
837                     _state = CHAR_DATA;
838                     _start = _length;
839                     return processElement(OPEN_TAG);
840                 } else if (c == '/') {
841                     _state = OPEN_TAG + EMPTY_TAG;
842                 } else if (c > ' ') {
843                     _attrQName = newSeq();
844                     _attrQName.offset = _length - 1;
845                     _state = OPEN_TAG + READ_ATTR_NAME;
846                 }
847                 break;
848             case OPEN_TAG + READ_ATTR_NAME:
849                 if (c <= ' ') {
850                     _attrQName.length = _length - _attrQName.offset - 1;
851                     _attrQName.data = _data;
852                     _state = OPEN_TAG + ATTR_NAME_READ;
853                 } else if (c == '=') {
854                     _attrQName.length = _length - _attrQName.offset - 1;
855                     _attrQName.data = _data;
856                     _state = OPEN_TAG + EQUAL_READ;
857                 } else if ((c == ':') && (_attrPrefix == null)) {
858                     _attrPrefix = newSeq();
859                     _attrPrefix.offset = _attrQName.offset;
860                     _attrPrefix.length = _length - _attrQName.offset - 1;
861                     _attrPrefix.data = _data;
862                 }
863                 break;
864             case OPEN_TAG + ATTR_NAME_READ:
865                 if (c == '=') {
866                     _state = OPEN_TAG + EQUAL_READ;
867                 } else if (c > ' ') {
868                     throw error("'=' expected");
869                 }
870                 break;
871             case OPEN_TAG + EQUAL_READ:
872                 if (c == '\'') {
873                     _attrValue = newSeq();
874                     _attrValue.offset = _length;
875                     _state = OPEN_TAG + READ_ATTR_VALUE_SIMPLE_QUOTE;
876                 } else if (c == '\"') {
877                     _attrValue = newSeq();
878                     _attrValue.offset = _length;
879                     _state = OPEN_TAG + READ_ATTR_VALUE_DOUBLE_QUOTE;
880                 } else if (c > ' ') {
881                     throw error("Quotes expected");
882                 }
883                 break;
884             case OPEN_TAG + READ_ATTR_VALUE_SIMPLE_QUOTE:
885                 if (c == '\'') {
886                     _attrValue.length = _length - _attrValue.offset - 1;
887                     _attrValue.data = _data;
888                     processAttribute();
889                     _state = OPEN_TAG + ELEM_NAME_READ;
890                 }
891                 break;
892             case OPEN_TAG + READ_ATTR_VALUE_DOUBLE_QUOTE:
893                 if (c == '\"') {
894                     _attrValue.length = _length - _attrValue.offset - 1;
895                     _attrValue.data = _data;
896                     processAttribute();
897                     _state = OPEN_TAG + ELEM_NAME_READ;
898                 }
899                 break;
900             case OPEN_TAG + EMPTY_TAG:
901                 if (c == '>') {
902                     _state = CHAR_DATA;
903                     _start = _length;
904                     return processElement(OPEN_TAG + EMPTY_TAG);
905                 } else {
906                     throw error("'>' expected");
907                 }
908 
909             // CLOSE_TAG:
910             case CLOSE_TAG + READ_ELEM_NAME:
911                 if (c == '>') {
912                     _elemQName.length = _length - _elemQName.offset - 1;
913                     _elemQName.data = _data;
914                     _state = CHAR_DATA;
915                     _start = _length;
916                     return processElement(CLOSE_TAG);
917                 } else if ((c == ':') && (_elemPrefix == null)) {
918                     _elemPrefix = newSeq();
919                     _elemPrefix.offset = _elemQName.offset;
920                     _elemPrefix.length = _length - _elemQName.offset - 1;
921                     _elemPrefix.data = _data;
922                 } else if (c <= ' ') {
923                     _elemQName.length = _length - _elemQName.offset - 1;
924                     _elemQName.data = _data;
925                     _state = CLOSE_TAG + ELEM_NAME_READ;
926                 }
927                 break;
928             case CLOSE_TAG + ELEM_NAME_READ:
929                 if (c == '>') {
930                     _state = CHAR_DATA;
931                     _start = _length;
932                     return processElement(CLOSE_TAG);
933                 } else if (c > ' ') {
934                     throw error("'>' expected");
935                 }
936                 break;
937 
938             case ESCAPE:
939                 if (c == ';') { // Escape terminator.
940                     if ((_length - _escStart == 3)
941                             && (_data[_length - 3] == 'l')
942                             && (_data[_length - 2] == 't')) {
943                         _data[_escStart - 1] = '<';
944                     } else if ((_length - _escStart == 3)
945                             && (_data[_length - 3] == 'g')
946                             && (_data[_length - 2] == 't')) {
947                         _data[_escStart - 1] = '>';
948                     } else if ((_length - _escStart == 5)
949                             && (_data[_length - 5] == 'a')
950                             && (_data[_length - 4] == 'p')
951                             && (_data[_length - 3] == 'o')
952                             && (_data[_length - 2] == 's')) {
953                         _data[_escStart - 1] = '\'';
954                     } else if ((_length - _escStart == 5)
955                             && (_data[_length - 5] == 'q')
956                             && (_data[_length - 4] == 'u')
957                             && (_data[_length - 3] == 'o')
958                             && (_data[_length - 2] == 't')) {
959                         _data[_escStart - 1] = '\"';
960                     } else if ((_length - _escStart == 4)
961                             && (_data[_length - 4] == 'a')
962                             && (_data[_length - 3] == 'm')
963                             && (_data[_length - 2] == 'p')) {
964                         _data[_escStart - 1] = '&';
965                     } else { // Character reference (&4.1)
966                         if ((_length - _escStart > 1)
967                                 && (_data[_escStart] == '#')) {
968                             try {
969                                 if (_data[_escStart + 1] == 'x') { // Hexadecimal.
970                                     _num.offset = _escStart + 2;
971                                     _num.length = _length - _escStart - 3;
972                                     _num.data = _data;
973                                     _data[_escStart - 1] = (char) TypeFormat
974                                             .parseInt(_num, 16);
975                                 } else { // Decimal.
976                                     _num.offset = _escStart + 1;
977                                     _num.length = _length - _escStart - 2;
978                                     _num.data = _data;
979                                     _data[_escStart - 1] = (char) TypeFormat
980                                             .parseInt(_num);
981                                 }
982                             } catch (NumberFormatException e) {
983                                 throw error("Ill-formed character reference");
984                             }
985                         } else {
986                             throw error("'#' expected");
987                         }
988                     }
989                     _state = _savedState;
990                     _length = _escStart;
991                 } else if (c <= ' ') {
992                     throw error("';' expected");
993                 }
994                 break;
995 
996             default:
997                 throw error("State unknown: " + _state);
998             }
999         }
1000
1001        if (_depth != 0)
1002            throw error("Unexpected end of file");
1003        reset();
1004        return END_DOCUMENT;
1005    }
1006
1007    // Defines parsing states.
1008    private static final int CHAR_DATA = 0x10;
1009
1010    private static final int MARKUP = 0x20;
1011
1012    private static final int STATE_COMMENT = 0x30;
1013
1014    private static final int PI = 0x40;
1015
1016    private static final int CDATA = 0x50;
1017
1018    private static final int OPEN_TAG = 0x60;
1019
1020    private static final int CLOSE_TAG = 0x70;
1021
1022    private static final int ESCAPE = 0x90;
1023
1024    // Defines element parsing sub-states.
1025    private static final int READ_ELEM_NAME = 0x01;
1026
1027    private static final int ELEM_NAME_READ = 0x02;
1028
1029    private static final int READ_ATTR_NAME = 0x03;
1030
1031    private static final int ATTR_NAME_READ = 0x04;
1032
1033    private static final int EQUAL_READ = 0x05;
1034
1035    private static final int READ_ATTR_VALUE_SIMPLE_QUOTE = 0x06;
1036
1037    private static final int READ_ATTR_VALUE_DOUBLE_QUOTE = 0x07;
1038
1039    private static final int EMPTY_TAG = 0x08;
1040
1041    /**
1042     * Processes the attribute just read.
1043     *
1044     * @throws XmlPullParserException any SAX exception, possibly wrapping another
1045     *         exception.
1046     */
1047    private void processAttribute() throws XmlPullParserException {
1048        if (_attrPrefix == null) { // No prefix.
1049            if (isXmlns(_attrQName)) { // Sets default namespace.
1050                _namespaces.map(null, _attrValue);
1051            } else {
1052                _attributes
1053                        .addAttribute(CharSequenceImpl.EMPTY, _attrQName,
1054                                CharSequenceImpl.EMPTY, _attrQName, "CDATA",
1055                                _attrValue);
1056            }
1057        } else { // Prefix.
1058            CharSequenceImpl localName = newSeq();
1059            localName.offset = _attrQName.offset + _attrPrefix.length + 1;
1060            localName.length = _attrQName.length - _attrPrefix.length - 1;
1061            localName.data = _attrQName.data;
1062
1063            if (isXmlns(_attrPrefix)) { // Namespace association.
1064                _namespaces.map(localName, _attrValue);
1065            } else { // Searches URI
1066                CharSequenceImpl uri = _namespaces.getNamespaceUri(_attrPrefix);
1067                if (uri != null) {
1068                    _attributes.addAttribute(uri, localName, _attrPrefix,
1069                            _attrQName, "CDATA", _attrValue);
1070                } else {
1071                    error("Namespace " + _attrPrefix + " undefined");
1072                }
1073            }
1074            _attrPrefix = null; // Resets.
1075        }
1076    }
1077
1078    /**
1079     * Processes the element just read.
1080     *
1081     * @param  state the current parser state.
1082     * @throws XmlPullParserException any SAX exception, possibly wrapping another
1083     *         exception.
1084     */
1085    private int processElement(int state) throws XmlPullParserException {
1086        if (_elemPrefix != null) { // Prefix, sets uri.
1087            _elemLocalName = newSeq();
1088            _elemLocalName.offset = _elemQName.offset + _elemPrefix.length + 1;
1089            _elemLocalName.length = _elemQName.length - _elemPrefix.length - 1;
1090            _elemLocalName.data = _elemQName.data;
1091            _elemNamespace = _namespaces.getNamespaceUri(_elemPrefix);
1092            if (_elemNamespace == null)
1093                throw error("Namespace " + _elemPrefix + " undefined");
1094        } else { // No prefix.
1095            _elemLocalName = _elemQName;
1096            _elemNamespace = _namespaces.getNamespaceUri(null); // Default namespace.
1097        }
1098        if (state == OPEN_TAG + EMPTY_TAG) {
1099            _isEmpty = true;
1100            _depth++;
1101            _namespaces.flush();
1102            return START_TAG;
1103
1104        } else if (state == OPEN_TAG) {
1105            _depth++;
1106            _elemStack.addLast(_elemQName);
1107            _elemStack.addLast(_elemNamespace);
1108            _elemStack.addLast(_elemLocalName);
1109            _elemStack.addLast(_elemPrefix);
1110            _namespaces.push();
1111            return START_TAG;
1112
1113        } else if (state == CLOSE_TAG) {
1114            _elemPrefix = (CharSequenceImpl) _elemStack.removeLast();
1115            _elemLocalName = (CharSequenceImpl) _elemStack.removeLast();
1116            _elemNamespace = (CharSequenceImpl) _elemStack.removeLast();
1117            CharSequenceImpl qName = _elemQName; // Current.
1118            _elemQName = (CharSequenceImpl) _elemStack.removeLast();
1119            if (!_elemQName.equals(qName))
1120                throw error("Unexpected end tag for " + _elemQName);
1121            _namespaces.pop();
1122            return END_TAG;
1123
1124        } else {
1125            throw error("Unexpected state: " + state);
1126        }
1127    }
1128
1129    // Implements Reusable.
1130    public void reset() {
1131        try {
1132            if (_reader != null)
1133                _reader.close();
1134        } catch (IOException e) {
1135            // Ignores exceptions.
1136        }
1137
1138        // Resets all members (alphabetically ordered).
1139        _attributes = (AttributesImpl) _attrPool.get(0);
1140        for (int i = 0, n = _attrPool.size(); i < n;) {
1141            ((AttributesImpl) _attrPool.get(i++)).reset();
1142        }
1143        _attrPrefix = null;
1144        _attrQName = null;
1145        _attrValue = null;
1146        _attrQName = null;
1147        _charsRead = 0;
1148        _columnOffset = 0;
1149        _depth = 0;
1150        _elemLocalName = null;
1151        _elemNamespace = null;
1152        _elemPrefix = null;
1153        _elemQName = null;
1154        _elemStack.reset();
1155        _escStart = 0;
1156        _eventType = END_DOCUMENT;
1157        _hasNonWhitespace = false;
1158        _index = 0;
1159        _inputEncoding = null;
1160        _isEmpty = false;
1161        _length = 0;
1162        _lineLength = 0;
1163        _lineNumber = 0;
1164        _namespaces.reset();
1165        _nonwhitespace = 0;
1166        _reader = null;
1167        _savedState = 0;
1168        _seqsIndex = 0;
1169        _start = 0;
1170        _state = CHAR_DATA;
1171        _text = null;
1172    }
1173
1174    /**
1175     * Sets current text.
1176     *
1177     * @param  start the start index.
1178     * @param  end the end index.
1179     */
1180    private void setText(int start, int length) {
1181        _text = newSeq();
1182        _text.data = _data;
1183        _text.offset = start;
1184        _text.length = length;
1185    }
1186
1187    /**
1188     * Indicates if the specified character sequence is xmlns.
1189     *
1190     * @param  chars the characters sequence to be tested.
1191     * @return <code>chars.equals("xmlns")</code>
1192     */
1193    private static boolean isXmlns(CharSequenceImpl chars) {
1194        return (chars.length == 5) && (chars.data[chars.offset] == 'x')
1195                && (chars.data[chars.offset + 1] == 'm')
1196                && (chars.data[chars.offset + 2] == 'l')
1197                && (chars.data[chars.offset + 3] == 'n')
1198                && (chars.data[chars.offset + 4] == 's');
1199    }
1200
1201    private XmlPullParserException error(String message) {
1202        XmlPullParserException e = new XmlPullParserException(message, this,
1203                null);
1204        return e;
1205    }
1206
1207    // Returns a new character sequence from the pool.
1208    private CharSequenceImpl newSeq() {
1209        return (_seqsIndex < _seqsCapacity) ? _seqs[_seqsIndex++] : newSeq2();
1210    }
1211
1212    private CharSequenceImpl newSeq2() {
1213        if (_seqsCapacity++ >= _seqs.length) { // Resizes. 
1214            CharSequenceImpl[] tmp = new CharSequenceImpl[_seqs.length * 2];
1215            System.arraycopy(_seqs, 0, tmp, 0, _seqs.length);
1216            _seqs = tmp;
1217            SEQ_SIZE.set(new Integer(_seqs.length));
1218        }
1219        return _seqs[_seqsIndex++] = (CharSequenceImpl) CharSequenceImpl.FACTORY
1220                .newObject();
1221    }
1222
1223}
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Java Books Remove Frame
Popular Tags