KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > quadcap > text > sax > Parser


1 package com.quadcap.text.sax;
2
3 /* Copyright 1999 - 2003 Quadcap Software. All rights reserved.
4  *
5  * This software is distributed under the Quadcap Free Software License.
6  * This software may be used or modified for any purpose, personal or
7  * commercial. Open Source redistributions are permitted. Commercial
8  * redistribution of larger works derived from, or works which bundle
9  * this software requires a "Commercial Redistribution License"; see
10  * http://www.quadcap.com/purchase.
11  *
12  * Redistributions qualify as "Open Source" under one of the following terms:
13  *
14  * Redistributions are made at no charge beyond the reasonable cost of
15  * materials and delivery.
16  *
17  * Redistributions are accompanied by a copy of the Source Code or by an
18  * irrevocable offer to provide a copy of the Source Code for up to three
19  * years at the cost of materials and delivery. Such redistributions
20  * must allow further use, modification, and redistribution of the Source
21  * Code under substantially the same terms as this license.
22  *
23  * Redistributions of source code must retain the copyright notices as they
24  * appear in each source code file, these license terms, and the
25  * disclaimer/limitation of liability set forth as paragraph 6 below.
26  *
27  * Redistributions in binary form must reproduce this Copyright Notice,
28  * these license terms, and the disclaimer/limitation of liability set
29  * forth as paragraph 6 below, in the documentation and/or other materials
30  * provided with the distribution.
31  *
32  * The Software is provided on an "AS IS" basis. No warranty is
33  * provided that the Software is free of defects, or fit for a
34  * particular purpose.
35  *
36  * Limitation of Liability. Quadcap Software shall not be liable
37  * for any damages suffered by the Licensee or any third party resulting
38  * from use of the Software.
39  */

40
41 import java.io.CharArrayWriter JavaDoc;
42 import java.io.IOException JavaDoc;
43 import java.io.InputStreamReader JavaDoc;
44 import java.io.Reader JavaDoc;
45
46 import org.xml.sax.DocumentHandler JavaDoc;
47 import org.xml.sax.DTDHandler JavaDoc;
48 import org.xml.sax.EntityResolver JavaDoc;
49 import org.xml.sax.ErrorHandler JavaDoc;
50 import org.xml.sax.HandlerBase JavaDoc;
51 import org.xml.sax.InputSource JavaDoc;
52 import org.xml.sax.SAXException JavaDoc;
53
54 import com.quadcap.text.NoStringPool;
55 import com.quadcap.text.StringPool;
56 import com.quadcap.util.collections.ArrayQueue;
57
58 import com.quadcap.util.Debug;
59
60 /**
61  * SAX Parser implementation.
62  *
63  * @author Stan Bailes
64  */

65 public class Parser implements org.xml.sax.Parser JavaDoc {
66     boolean docStarted = false;
67     HandlerBase JavaDoc defaultHandler = new HandlerBase JavaDoc();
68     StringPool pool = new NoStringPool();
69     InputSource JavaDoc in;
70     Reader JavaDoc r;
71     DocumentHandler JavaDoc docHandler = defaultHandler;
72     DTDHandler JavaDoc dtdHandler = defaultHandler;
73     EntityResolver JavaDoc entityResolver = defaultHandler;
74     ErrorHandler JavaDoc errorHandler = defaultHandler;
75     char[] ebuf = new char[6];
76     char[] tag = new char[1024*32];
77     int taglen = 0;
78     CharArrayWriter JavaDoc data = new CharArrayWriter JavaDoc();
79     AttributeList attributes = new AttributeList();
80     String JavaDoc attrName = null;
81     String JavaDoc tagName = null;
82     ArrayQueue inStack = null;
83     ArrayQueue locStack = null;
84     int lineNumber = 1;
85     int columnNumber = 1;
86     String JavaDoc lastEntityVal = "";
87     boolean trace = false;
88     int commentLevel = 0;
89
90     public Parser() {}
91
92     public void parse(InputSource JavaDoc in) throws SAXException JavaDoc,IOException JavaDoc {
93         this.in = in;
94         this.r = getCharacterStream(in);
95         taglen = 0;
96         lineNumber = 1;
97         columnNumber = 1;
98         data.reset();
99         try {
100             parse();
101         } catch (SAXException JavaDoc ex) {
102             if (locStack != null) {
103                 for (int i = 0; i < locStack.size(); i++) {
104                     com.quadcap.util.Debug.println(" at " + locStack.top(i));
105                 }
106             }
107             throw ex;
108         }
109     }
110
111     final Reader JavaDoc getCharacterStream(InputSource JavaDoc in) {
112         Reader JavaDoc rd = in.getCharacterStream();
113         if (rd == null) {
114             rd = new InputStreamReader JavaDoc(in.getByteStream());
115         }
116         return rd;
117     }
118     
119     public void pushInputSource(InputSource JavaDoc in2) {
120     if (inStack == null) {
121             inStack = new ArrayQueue();
122             locStack = new ArrayQueue();
123         }
124     inStack.push(in);
125         locStack.push("" + lineNumber + ":" + columnNumber);
126         lineNumber = 1;
127         columnNumber = 1;
128     in = in2;
129     r = getCharacterStream(in);
130     }
131
132     boolean popInputSource() {
133     if (inStack == null || inStack.size() == 0) return false;
134     in = (InputSource JavaDoc)inStack.pop();
135         String JavaDoc s = locStack.pop().toString();
136         int idx = s.indexOf(':');
137         lineNumber = Integer.parseInt(s.substring(0, idx));
138         columnNumber = Integer.parseInt(s.substring(idx+1));
139     r = getCharacterStream(in);
140     return true;
141     }
142
143     final void addTagChar(int c) throws SAXException JavaDoc {
144         if (taglen >= tag.length) throw new SAXException JavaDoc("tag too long");
145         tag[taglen++] = (char)c;
146     }
147     
148     public void parse(String JavaDoc s) {
149     }
150
151     public void setDocumentHandler(DocumentHandler JavaDoc dh) {
152         this.docHandler = dh;
153     }
154     
155     public void setDTDHandler(DTDHandler JavaDoc dh) {
156         this.dtdHandler = dh;
157     }
158
159     public void setEntityResolver(EntityResolver JavaDoc er) {
160     this.entityResolver = er;
161     }
162
163     public EntityResolver JavaDoc getEntityResolver() {
164     return entityResolver;
165     }
166
167     public void setErrorHandler(ErrorHandler JavaDoc er) {
168         errorHandler = er;
169     }
170
171     public void setLocale(java.util.Locale JavaDoc locale) {
172     }
173
174     final int read() throws IOException JavaDoc {
175         int c = r.read();
176         if (c == '\n') {
177             lineNumber++;
178             columnNumber = 1;
179         } else {
180             columnNumber++;
181         }
182         return c;
183     }
184
185     final char parseEntity() throws SAXException JavaDoc, IOException JavaDoc {
186         int len = 0;
187         int c;
188         int state = 0;
189         while ((c = read()) >= 0) {
190             ebuf[len++] = (char)c;
191             if (!Character.isLetter((char)c) || len >= ebuf.length) break;
192         }
193         lastEntityVal = new String JavaDoc(ebuf, 0, len);
194         if (len == 5 && ebuf[0] == 'q' && ebuf[1] == 'u' &&
195             ebuf[2] == 'o' && ebuf[3] == 't') {
196             return '"';
197         }
198         if (len == 4 && ebuf[0] == 'a' && ebuf[1] == 'm' && ebuf[2] == 'p') {
199             return '&';
200         }
201         if (len == 3) {
202             if (ebuf[0] == 'l') {
203                 if (ebuf[1] == 't') return '<';
204             } else if (ebuf[0] == 'g') {
205                 if (ebuf[1] == 't') return '>';
206             }
207         }
208         throw new SAXException JavaDoc("unknown entity: " + lastEntityVal);
209             
210     }
211     
212     public int step(int state, int c) throws SAXException JavaDoc, IOException JavaDoc {
213 // Debug.println("step[" + state + " " + commentLevel +
214
// "]: " + ((char)c));
215
switch (state) {
216         case 0:
217             if (c == '<') {
218                 if (data.size() > 0) {
219                     docHandler.characters(data.toCharArray(), 0, data.size());
220                     data.reset();
221                 }
222                 state = 1;
223             } else {
224                 if (c == '&') {
225                     try {
226                         c = parseEntity();
227                     } catch (SAXException JavaDoc e) {
228                         data.write('&');
229                         data.write(lastEntityVal);
230                         break;
231                     }
232                 }
233                 data.write(c);
234             }
235             break;
236         case 1: // seen '<'
237
switch (c) {
238             case '!':
239                 state = 30;
240                 break;
241             case '\\':
242                 state = 4;
243                 break;
244             case '/':
245                 state = 8;
246                 break;
247             case '?':
248                 data.reset();
249                 state = 20;
250                 break;
251             default:
252                 addTagChar(c);
253                 state = 5;
254                 break;
255             }
256             break;
257         case 4: // seen <\
258
data.write('<');
259             data.write(c);
260             state = 0;
261             break;
262         case 5: // collect tag name
263
switch (c) {
264             case ' ': case '\r': case '\n': case '\t':
265                 tagName = pool.intern(tag, 0, taglen);
266                 taglen = 0;
267                 state = 6;
268                 break;
269             case '/':
270                 tagName = pool.intern(tag, 0, taglen);
271                 taglen = 0;
272                 state = 9;
273                 break;
274             case '>':
275                 tagName = pool.intern(tag, 0, taglen);
276                 taglen = 0;
277                 state = 0;
278                 startElement(tagName, attributes);
279                 break;
280             case '<':
281                 tagName = pool.intern(tag, 0, taglen);
282                 taglen = 0;
283                 if (data.size() > 0) {
284                     docHandler.characters(data.toCharArray(),
285                                           0, data.size());
286                     data.reset();
287                 }
288                 state = 1;
289                 break;
290             default:
291                 if (Character.isLetter((char)c) ||
292                     Character.isDigit((char)c) ||
293                     c == '.' || c == '-' || c == '_' || c == ':') {
294                     addTagChar(c);
295                 } else {
296                     // this isn't a tag after all (e.g., inside a <script>
297
// section, we've found "if (a < b) ..."
298
for (int i = 0; i < taglen; i++) {
299                         data.write(tag[i]);
300                     }
301                     data.write(c);
302                     state = 0;
303                     taglen = 0;
304                     break;
305                 }
306                     
307             }
308             break;
309         case 6: // collect attributes
310
switch (c) {
311             case ' ': case '\n': case '\r': case '\t':
312                 break;
313             case '/':
314                 state = 9;
315                 break;
316             case '%':
317                 addTagChar(c);
318                 break;
319             case '>':
320                 state = 0;
321                 startElement(tagName, attributes);
322                 break;
323             case '=':
324                 attrName = pool.intern(tag, 0, taglen);
325                 taglen = 0;
326                 state = 10;
327                 break;
328             case '<':
329                 state = 61;
330                 break;
331             default:
332                 addTagChar(c);
333             }
334             break;
335         case 61:
336             switch (c) {
337             case '?':
338                 state = 62;
339                 break;
340             default:
341                 addTagChar('<');
342                 addTagChar(c);
343                 state = 6;
344                 break;
345             }
346             break;
347         case 62:
348             switch (c) {
349             case '?':
350                 state = 63;
351                 break;
352             default:
353                 addTagChar(c);
354                 break;
355             }
356             break;
357         case 63:
358             switch(c) {
359             case '>':
360                 addTagChar(c);
361                 state = 6;
362                 break;
363             default:
364                 addTagChar('?');
365                 if (c != '?') state = 62;
366                 break;
367             }
368             break;
369         case 8: // seen </
370
if (c == '>') {
371                 tagName = pool.intern(tag, 0, taglen);
372                 taglen = 0;
373                 state = 0;
374                 docHandler.endElement(tagName);
375             } else {
376                 addTagChar(c);
377             }
378             break;
379         case 9: // in <tag, seen /
380
if (c == '>') {
381                 startElement(tagName, attributes);
382                 state = 0;
383                 docHandler.endElement(tagName);
384             } else {
385                 addTagChar('/');
386                 addTagChar(c);
387                 state = 6;
388             }
389             break;
390         case 10: // in attriblist, seen name=
391
if (c == '"') {
392                 state = 12;
393             } else if (c == '\'') {
394                 state = 121;
395             } else {
396                 addTagChar(c);
397                 state = 13;
398             }
399             break;
400         case 12: // in attriblist, seen name="
401
if (c == '"') {
402                 attributes.addAttribute(attrName, "CDATA",
403                                         pool.intern(tag, 0, taglen));
404                 taglen = 0;
405                 state = 6;
406             } else {
407                 addTagChar(c);
408             }
409             break;
410         case 121: // in attriblist, seen name='
411
if (c == '\'') {
412                 attributes.addAttribute(attrName, "CDATA",
413                                         pool.intern(tag, 0, taglen));
414                 taglen = 0;
415                 state = 6;
416             } else {
417                 addTagChar(c);
418             }
419             break;
420         case 13: // in attriblist, seen name=c
421
switch (c) {
422             case ' ':
423                 attributes.addAttribute(attrName, "CDATA",
424                                         pool.intern(tag, 0, taglen));
425                 taglen = 0;
426                 state = 6;
427                 break;
428             case '/':
429                 state = 14;
430                 break;
431             case '>':
432                 attributes.addAttribute(attrName, "CDATA",
433                                         pool.intern(tag, 0, taglen));
434                 taglen = 0;
435                 state = 0;
436                 startElement(tagName, attributes);
437                 break;
438             default:
439                 addTagChar(c);
440             }
441             break;
442         case 14: // in attriblist, seen name=dfdf/
443
if (c == '>') {
444                 attributes.addAttribute(attrName, "CDATA",
445                                         pool.intern(tag, 0, taglen));
446                 taglen = 0;
447                 state = 0;
448                 startElement(tagName, attributes);
449                 docHandler.endElement(tagName);
450             } else {
451                 addTagChar('/');
452                 if (c != '/') {
453                     addTagChar(c);
454                     state = 13;
455                 }
456             }
457             break;
458         case 15:
459             if (c == '-') state = 16;
460             break;
461         case 16:
462             if (c == '-') state = 17;
463             else state = 15;
464             break;
465         case 17:
466             if (c == '>') state = 0;
467             else if (c != '-') state = 15;
468             break;
469         case 20:
470             if (c == '?') state = 21;
471             else data.write(c);
472             break;
473         case 21:
474             if (c == '>') {
475                 String JavaDoc s = data.toString().trim();
476                 if (s.startsWith("xml")) {
477                     if (inStack == null || inStack.size() == 0) {
478                         if (!docStarted) {
479                             docStarted = true;
480                             docHandler.startDocument();
481                         }
482                     }
483                 } else {
484                     int idx = s.indexOf(' ');
485                     String JavaDoc dat = "";
486                     String JavaDoc target = s;
487                     if (idx >= 0) {
488                         target = s.substring(0, idx);
489                         dat = s.substring(idx+1).trim();
490                     }
491                     docHandler.processingInstruction(target, dat);
492                 }
493                 data.reset();
494                 state = 0;
495             } else {
496                 data.write('?');
497                 if (c != '?') {
498                     data.write(c);
499                     state = 20;
500                 }
501             }
502             break;
503         case 30: // seen <!
504
if (c == '-') state = 31;
505             else if (c == '[') state = 41;
506             else state = 40;
507             break;
508         case 31: // seen <!-
509
if (c == '-') {
510                 commentLevel = 1;
511                 state = 32;
512             }
513             else state = 40;
514             break;
515         case 32: // in comment, look for '-'
516
if (c == '-') state = 33;
517             else if (c == '<') state = 320;
518             break;
519         case 320: // in comment, seen <
520
if (c == '!') state = 321;
521             else if (c == '-') state = 33;
522             else state = 32;
523             break;
524         case 321: // in comment, seen <!
525
if (c == '-') state = 322;
526             else state = 32;
527             break;
528         case 322: // in comment, seen <!-
529
if (c == '-') {
530                 commentLevel++;
531             }
532             state = 32;
533             break;
534         case 33: // in comment, seen -
535
if (c == '-') state = 34;
536             else state = 32;
537             break;
538         case 34: // in comment, seen --
539
if (c == '>') {
540                 if (--commentLevel == 0) {
541                     state = 0;
542                 } else {
543                     state = 32;
544                 }
545             }
546             else if (c != '-') state = 32;
547             break;
548         case 40: // seen <!, but not comment
549
if (c == '>') state = 0;
550             break;
551         case 41: // seen <![
552
if (c == '[') {
553                 if (data.toString().equals("CDATA")) {
554                     data.reset();
555                     state = 42;
556                 } else {
557                     state = 40;
558                 }
559             } else {
560                 data.write(c);
561             }
562             break;
563         case 42: // in CDATA section
564
if (c == ']') {
565                 state = 43;
566             } else {
567                 data.write(c);
568             }
569             break;
570         case 43: // in CDATA, seen ']'
571
if (c == ']') {
572                 state = 44;
573             } else {
574                 data.write(']');
575                 data.write(c);
576                 state = 42;
577             }
578             break;
579         case 44: // in CDATA, seen ']]'
580
if (c == '>') {
581                 state = 0;
582             } else if (c == ']') {
583                 data.write(']');
584             } else {
585                 data.write("]]");
586                 data.write(c);
587                 state = 42;
588             }
589             break;
590         default:
591             throw new SAXException JavaDoc("Bad parser state: " + state);
592         }
593         return state;
594     }
595     
596     public void parse() throws SAXException JavaDoc, IOException JavaDoc {
597         int state = 0;
598         docHandler.setDocumentLocator(new Locator(this));
599         while (parseUntilEOF()) {}
600         docHandler.endDocument();
601     }
602
603     public boolean parseUntilEOF() throws SAXException JavaDoc, IOException JavaDoc {
604         boolean ret = false;
605         int state = 0;
606         while (state >= 0) {
607             int c = read();
608             if (c < 0) {
609                 try { r.close(); } catch (Exception JavaDoc e) {}
610                 ret = popInputSource();
611                 state = -1;
612             } else {
613                 state = step(state, c);
614             }
615         }
616         return ret;
617     }
618
619     public int getLineNumber() {
620         return lineNumber;
621     }
622
623     public int getColumnNumber() {
624         return columnNumber;
625     }
626
627     void startElement(String JavaDoc name, AttributeList attributes) throws SAXException JavaDoc {
628         if (!docStarted) {
629             docStarted = true;
630             docHandler.startDocument();
631         }
632         docHandler.startElement(tagName, attributes);
633         attributes.clear();
634     }
635     
636 }
637
Popular Tags