KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > JSX > ParserXML


1 /** (c) Brendan Macmillan 2001, licensed under GNU's GPL 2
2         (If your project is not GPL, then a fee is due - see license on website)
3     Website: http://www.csse.monash.edu.au/~bren/JSX
4     Website: http://freshmeat.net/projects/jsx
5     List (can read without subscribing, need to join to post):
6       http://groups.yahoo.com/group/JSX-ideas/messages
7         Commercial licensing enquiries only:
8             bren@mail.csse.monash.edu.au */

9
10
11 /** ===========================================================================
12     * ParserXML (JSX0.8).
13     * =========
14     *
15     * email comments to:
16     bren@mail.csse.monash.edu.au (G. Brendan Macmillan)
17  **/

18
19 /** Syntactic elements: <, =, "", > and / (see end for further info)
20     * DOCUMENT: ELEMENT*
21     * ELEMENT: <ZZZZ ATTR*>ELEMENT*</ZZZZ>
22     * ATTR: XXXX="YYYY"
23     **/

24
25 /** PROBLEM: \n in quoted values.
26     * SOLUTION:
27     * (a). reset the syntax (so that \n \r are significant, not whitespace)
28     * (b). grab up to the next " char, and return as String
29     * (c). then reset syntax again.
30     **/

31     
32
33 package JSX;
34 import java.io.*;
35
36 public class ParserXML {
37     Reader in; //keep it so we can close later on...
38
StreamTokenizer st;
39     final static boolean DEBUG = false;
40     //final static boolean DEBUG = true;
41

42     final static boolean CLOSETAG_DEBUG = false;
43     //final static boolean CLOSETAG_DEBUG = true;
44

45     final static boolean ATTR_BUFFER_DEBUG = false;
46     //final static boolean ATTR_BUFFER_DEBUG = true;
47

48     public ParserXML(Reader in) {
49         this.in = in;
50         st = new StreamTokenizer(new BufferedReader(in)); //thanks to Levi Mota
51
setSyntax();
52     }
53
54     public void close() {
55         try {
56             in.close(); //there is no Reader.flush()
57
} catch (IOException e) {
58             e.printStackTrace();
59         }
60     }
61
62
63     /**--------------------------------------------------------------------------
64         * setSyntax
65         * ---------
66         * Note: is altered in readAttrImpl(), to cope with newlines within quotes.
67         **/

68     private void setSyntax() {
69         st.eolIsSignificant(false); //quoted text is broken by /n still
70
//checked src: yes, \n is hard-coded
71
st.resetSyntax(); //(so numbers aren't significant)
72
st.wordChars(0,255);
73         st.ordinaryChar('<'); //could loop over a list...
74
st.ordinaryChar('/');
75         //st.ordinaryChar('?'); //for processing instruction PI
76
st.ordinaryChar('>');
77         st.ordinaryChar('=');
78         st.ordinaryChar('"'); //detect for as a token for OWN IMPLEMENTATION
79
st.ordinaryChar('\''); //detect for as a token for OWN IMPLEMENTATION
80
//st.quoteChar('"');
81
st.whitespaceChars(' ',' ');
82         st.whitespaceChars('\t','\t');
83         st.whitespaceChars('\n','\n');
84         st.whitespaceChars('\r','\r');
85     }
86
87     //we buffer now (so tag-based, like SAX): readTag fills the attr buffer, readAttr drains it
88
public Attr readAttr() throws ExceptionXML, IOException {
89         Attr attr;
90             //... the "consumer" takes off (see readTag for producer)
91
attr = (Attr) attrBuffer.get(attrBufferIndex++); //if too off end -> Exception
92
if (attrBufferIndex==attrBuffer.size()) {
93             //reset: nice to do these two in one call.
94
attrBuffer.clear();
95             attrBufferIndex = 0; //NB: it makes sense for consumer to well consume
96
}
97         return attr;
98     }
99
100
101     //ignore all "alias-ID" attributes for now - based on Mark Collette's ideas
102
//insert before the old readAttr
103
/*
104     public Attr readAttrImpl2() throws ExceptionXML, IOException {
105         Attr attr = readAttrImpl();
106         if (attr.name!=null && attr.name.equals(XMLSerialize.ALIAS_ID_TOKEN) ) {
107                 //setAlias(attr);
108             attr=readAttrImpl();
109             if (attr.name!=null && attr.name.equals(XMLSerialize.ALIAS_ID_TOKEN) ) {
110                 throw new ExceptionXML("Found two "+XMLSerialize.ALIAS_ID_TOKEN+" attrs in a tag");
111             }
112         }
113         return attr;
114     }
115 */

116
117
118     /**--------------------------------------------------------------------------
119         * readAttrImp
120         * --------
121         * XXXX = "yyyy" | >
122         *
123         **/

124     public Attr readAttrImpl() throws ExceptionXML, IOException {
125
126         Attr attr = new Attr();
127         int ttype;
128         ttype = st.nextToken(); //expect an attribute name
129
if (ttype=='>') {
130             attr.isEnd = true;
131             return attr;
132         }
133
134         if (ttype=='/') { //first char of digraph for "EMPTY TAG"
135
if (st.nextToken() == '>') //expect an attribute name
136
{
137                 attr.isEnd=true;
138                 attr.emptyTag=true;
139                 return attr;
140             }
141             else
142              throw new ExceptionXML("empty tag (\"/>\")", st);
143         }
144
145         if (ttype!=st.TT_WORD)
146             throw new ExceptionXML("an attribute name", st);
147
148         //We can do "?>" here - (it is not same as ordinary char)
149
if (st.sval.equals("?")) { //first char of digraph for "END PI" is '?'
150
if (st.nextToken() == '>') //expect an attribute name
151
{
152                 attr.isEndPI=true; //special end logic, to prevent <tag ?>
153
return attr;
154             }
155             else
156              throw new ExceptionXML("PI end (\"?>\")", st);
157         }
158
159         attr.name = st.sval;
160         ttype = st.nextToken(); //expect a =
161
if (ttype!='=')
162             throw new ExceptionXML(
163                 "Expected \"=\" to follow attribute name \"" +attr.name+ "\""
164             );
165             //NOTE: Any ws between '=' and '"' has already been gobbled up; therefore,
166
//it is OK to switch the syntax *now* - or else ws *following* the
167
//'"' will also be snaffled.
168
//OWN IMPLEMENTATION:
169
st.wordChars('>','>'); //by XML1.0, this is legal in an attr value
170
st.wordChars('=','=');
171               st.wordChars('/','/');
172               //st.wordChars('?','?');
173
st.ordinaryChar(' '); //means it is returned individually, I think?
174
//st.wordChars(' ',' ');
175
st.ordinaryChar('\t');
176               //st.wordChars('\t','\t');
177
st.eolIsSignificant(true);
178               //st.wordChars('\n','\n'); //these appear to be just skipped?
179
st.ordinaryChar('\r');
180               //st.wordChars('\r','\r');
181

182         ttype = st.nextToken(); //expect a quoted value: OWN IMPLEMENTATION
183
//System.err.println("1 ttype of thing read: "+ ttype + ", " + (char)ttype);
184
if (ttype!='"' && ttype!='\'')
185             throw new ExceptionXML(
186                 "'quoted' or \"quoted\" value (single or double) to follow \""+
187                 attr.name + "=\"", st); //"name="
188
if (DEBUG) System.err.println(st.sval + " (sval) *with* quote");
189         int quoteChar = ttype;
190              /* PROBLEM: a \n breaks a quoted string, finishing it. I can't see how
191                 * to fix this. You want it to ignore the \n within a quoted string.
192                 * other ws within a quoted string does not separate it into separate
193                 * tokens; but \n does, even when specified as not significant. This
194                 * is a bug in the tokenizer; or else I am initializing it wrongly.
195                 *
196                 * PROBLEM: leading whitespace within a quote is ignored...
197                 * Cause: when first quote read, following whitespace is not
198                 * a word char and so is gobbled up. Solution is to set syntax
199                 * earlier, as a quoted string *must* follow, or else error, anyway.
200                 */

201
202                 ttype = st.nextToken(); //expect a quoted value
203
if (ttype==quoteChar) { //ie empty string is possible! Romain bug
204
attr.value = ""; //NB: not the same as null
205
}
206                 else {
207                     if (DEBUG) System.err.println(ttype + " (type) read after quote");
208                     if (DEBUG) System.err.println(st.sval + " (sval) read after quote");
209                     //build up value in StringBuffer, for efficiency:
210
StringBuffer JavaDoc out = new StringBuffer JavaDoc(); //don't know length...
211
//assign String to attr.value when fully read...
212
while (true) { //read quoted value HACK, but works. Tokenizer sucks
213
//System.err.println("2 ttype of thing read: "+ ttype + ", " + (char)ttype);
214
if (DEBUG) System.err.println("BUILDING: '"+out+"'");
215                         if (ttype==' ') { //eat ws
216
out.append(' ');
217                                 //attr.value += ' ';
218
ttype = st.nextToken();
219                             continue;
220                         }
221                         if (ttype=='\t') { //eat ws
222
out.append('\t');
223                                 //attr.value += '\t';
224
ttype = st.nextToken();
225                             continue;
226                         }
227                         if (ttype=='\r') { //eat ws
228
out.append('\r');
229                                 //attr.value += '\r';
230
ttype = st.nextToken();
231                             continue;
232                         }
233                         if (ttype==st.TT_EOL) { //eat nl
234
out.append('\n');
235                                 //attr.value += '\n';
236
ttype = st.nextToken();
237                             continue;
238                         }
239                         if (ttype==st.TT_WORD) {
240                             out.append(st.sval);//NB: any leading ws + store quoted value.
241
//attr.value += st.sval;//NB: any leading ws + store quoted value.
242
ttype = st.nextToken();
243                             continue;
244                         }
245                         break; //unreachable code
246
/*
247                         throw new ExceptionXML(
248                             "a \"quoted\" value (same as opening quote) to follow \""+
249                             attr.name + "=\"", st); //"name="
250 */

251                     }
252                     attr.value = out.toString(); //assign at end of while
253
}
254                 if (DEBUG) System.err.println(st.sval + " was value read after quote");
255                 if (DEBUG) System.err.println(attr.value + " was total value");
256                 if (DEBUG) System.err.println(ttype + " read after that");
257                 if (ttype!=quoteChar) //the same, whatever it was
258
throw new ExceptionXML(
259                     "matching '"+(char)quoteChar+"'", st);
260             //switch syntax back (duplicating some code for eff). It hurts me to dup
261
st.eolIsSignificant(false);
262               st.ordinaryChar('>');
263               st.ordinaryChar('=');
264               st.ordinaryChar('/');
265               //st.ordinaryChar('?');
266
st.whitespaceChars(' ',' ');
267                 st.whitespaceChars('\t','\t');
268                 st.whitespaceChars('\n','\n');
269                 st.whitespaceChars('\r','\r');
270 // setSyntax();
271
attr.name = ParseUtilities.descapeDollar(attr.name);
272         return attr;
273     }
274
275     /**--------------------------------------------------------------------------
276         * Attr class. (either XXXX="yyyy" -or- ">")
277         * ==========
278         * attribute name | >
279         * =:
280         * ": expects a quoted value
281         *
282         * return attribute OR null (if no attributes left)
283         **/

284     public class Attr implements Serializable{
285         public String JavaDoc name;
286         public String JavaDoc value;
287         boolean nameMissing; //only for Deserialization convenience. Yuk!
288
int length; //HACK!! for use in message passing.
289
boolean isEndPI = false; //distinguish "?>" from "/>"
290
public boolean isEnd = false; //usually it is. We flag the end with this.
291
public boolean emptyTag = false;
292         public String JavaDoc toString() {
293         /*
294             StringWriter sw = new StringWriter();
295             XMLSerialize.serialize(this, new PrintWriter(sw, true));//this works!
296             return sw.toString();
297         */

298             return name +"=\""+value+"\" , "
299             + ((emptyTag)?"empty":"")+((isEnd)?"end":"")+" tag"
300             + " nameMissing="+nameMissing;
301         }
302     }
303
304
305
306     /**--------------------------------------------------------------------------
307         * readTag
308         * -------
309         *
310         **/

311     public Tag readTag() throws ExceptionXML, IOException {
312         Tag tag = new Tag(); //may be more efficient to reuse?
313
int ttype;
314         ttype = st.nextToken(); //expect an <
315
if (ttype==st.TT_EOF)
316             return null;
317         if (ttype!='<')
318                 throw new ExceptionXML("an open tag", st);
319         ttype = st.nextToken(); //expect either a name, or a /
320
switch (ttype) {
321             case StreamTokenizer.TT_WORD: //must be constant
322
tag.name = st.sval;
323                 tag.start = true;
324                 break;
325             case '/': //close tag: </tag>, not empty tag: <tag/>
326
ttype = st.nextToken(); //expect a name to follow
327
if (ttype!=st.TT_WORD)
328                     throw new ExceptionXML("a tag", st);
329                 tag.name = st.sval;
330                 tag.start = false;
331                 ttype = st.nextToken(); //expect a >
332
if (ttype!='>')
333                     throw new ExceptionXML("a >", st);
334                 break;
335             default:
336                 throw new ExceptionXML("either \"<tag\" or \"</tag\"", st);
337         }
338         if (tag.name.startsWith("?")) {
339             tag.isPI = true;
340             tag.name = tag.name.substring(1); //strip the "?" for sake of ease
341
}
342         if (CLOSETAG_DEBUG) {
343             System.err.println("RETURNING TAG: "+tag);
344             try {
345                 throw new Exception JavaDoc();
346             } catch (Exception JavaDoc e) {
347                 e.printStackTrace();
348             }
349         }
350         //fill attr buffer (put it here, so very clear)
351
if (tag.start) { //how is version handled?
352
if (attrBuffer.size()!=0) {
353                 throw new ExceptionXML("not all attrs ("+attrBuffer+") consumed, when read " +tag);
354             }
355             Attr attr;
356             do { //this loop structure includes last value
357
attr=readAttrImpl();
358                 if (attr.name!=null && attr.name.equals(XMLSerialize.ALIAS_ID_TOKEN) )
359                     setAlias(attr);
360                 else if (attr.name!=null && attr.name.equals(XMLSerialize.NAME_TOKEN) )
361                     attrBuffer.insertElementAt(attr, 0); //obj-name always first
362
else
363                     attrBuffer.add(attr); //"producer" adds on...(readAttr consumes)
364
if (ATTR_BUFFER_DEBUG) System.err.println("attr: "+attr);
365             } while (!attr.isEnd && !attr.isEndPI); //have different meanings.
366
}
367         return tag;
368     }
369     /**--------------------------------------------------------------------------
370         * Tag class.
371         * ==========
372         * //empty tag: <tag/> is not represented here; readAttrImpl reports it via Attr.emptyTag
373         * Should we include information here, about whether it is a PI or not?
374         * ie if it begins with a "?", as in <?xml version="1.0"?> or
375         * <?jsx version="0.8.20.2"?>. Note that it ends with "?>", which is a
376         * little like an empty tag "/>".
377         **/

/**
 * A tag as returned by readTag(): its name plus whether it is a start
 * tag, a close tag or a processing instruction.
 */
public class Tag {
    /** Tag name; for a processing instruction the leading "?" is already removed. */
    public String name;
    /** True for a processing instruction such as <?target xxx="yyy"?>. */
    boolean isPI;
    /** true: <tag>, false: </tag> */
    public boolean start;

    /** Debug rendering without the closing bracket: "<name", "</name" or "<?name". */
    public String toString() {
        String prefix;
        if (isPI) {
            prefix = "?";
        } else {
            prefix = start ? "" : "/";
        }
        return "<" + prefix + name;
    }
}
384
385 /** fields for buffering attr (converting from tag-centric to element-centric
386     *
387     **/

388     private java.util.Vector JavaDoc attrBuffer = new java.util.Vector JavaDoc();//cleared and reused
389
private int attrBufferIndex = 0; //cleared and reused
390

391     private Attr attrAlias = null;
392     /** producer **/
393     private void setAlias(Attr attr) throws ExceptionXML {
394         if (attrAlias==null) { //should not yet be set
395
attrAlias = attr;
396         } else {
397             throw new ExceptionXML("Found two "+XMLSerialize.ALIAS_ID_TOKEN+" attrs in tag");
398         }
399     }
400     /** consumer **/
401     public Attr getAlias() throws ExceptionXML {
402         if (attrAlias==null)
403             throw new ExceptionXML("no alias attr found");
404         Attr attr = attrAlias;
405         attrAlias = null; //reset after reading (the attrAlias is consumed)
406
return attr; //leave caller to check for null (meaning not set)
407
}
408
409
410
411     /**--------------------------------------------------------------------------
412         * Exceptions
413         * ==========
414         * this subclassing of exceptions is *very* reuseful modularity!
415         * probably should always send the st, so can output line number
416         * (and possibly position, last token etc, too).
417         * Just subclass IOException since that is what it really is, anyway
418         **/

419     static public class ExceptionXML extends IOException {
420         public ExceptionXML(String JavaDoc expected) {
421             super(expected);
422         }
423         public ExceptionXML(String JavaDoc expected, String JavaDoc got) {
424             super("Expected \"" +expected+ "\""
425                 + ", but got \"" +got+ "\"");
426         }
427         public ExceptionXML(String JavaDoc comment, String JavaDoc expected, String JavaDoc got) {
428             super(comment+": Expected \"" +expected+ "\""
429                 + ", but got \"" +got+ "\"");
430         }
431         public ExceptionXML(String JavaDoc expected, StreamTokenizer st) {
432             super(
433                 "Line " +st.lineno()+": "
434                 + "Expected " +expected
435                 + ", but got "
436                     +"'"+
437                         (st.ttype!=StreamTokenizer.TT_WORD?
438                         (""+(char)(st.ttype)):
439                         (st.sval))
440                   +"' "
441                     +"["+st.ttype+": " +st.sval+ "]"
442                 );
443         }
444     }
445
446
447
448     /**--------------------------------------------------------------------------
449         * Test code
450         * =========
451         * Tag, fields, then tags again
452         **/

453     public void parse() {
454         Tag tag;
455         Attr attr;
456         String JavaDoc indent = "";
457         System.out.println(
458             "Note: this version of parse() does no start-end tag balancing\n"+
459             "Note: it does no checking for legality of class and field names\n");
460         try {
461             while ( (tag=readTag()) != null) {
462                 if (tag.start) {
463                     System.out.print(indent + tag);
464                     indent += " "; //whack on 4 spaces
465
while ( (attr=readAttr()) != null)
466                         System.out.print(" "+attr);
467                 }
468                 else {
469                     indent = indent.substring(4); //whack off 4 spaces
470
System.out.print(indent + tag);
471                 }
472                 System.out.println(">");
473             }
474         }
475         catch (Exception JavaDoc e) {
476             System.out.println(e);
477         }
478     }
479
480
481     public void dumbParse() {
482         int ttype;
483         try {
484             while ( (ttype = st.nextToken()) != st.TT_EOF) {
485                 String JavaDoc a;
486                 switch (ttype) {
487                     case '"':
488                         a = "QUOTED: " +st.sval;
489                         break;
490                     case StreamTokenizer.TT_WORD: //must be constant
491
a = "TOKEN: " +st.sval;
492                         break;
493                     default:
494                         a = (char)ttype + " : " + st.sval;
495                         break;
496                 }
497                 System.out.println(a);
498             }
499         }
500         catch (Exception JavaDoc e) {
501             System.out.println(e);
502         }
503     }
504
505     public static class Test {
506         public static void main(String JavaDoc args[]) {
507             new ParserXML(new InputStreamReader(System.in)).parse();
508         }
509     }
510 }
511
512     /**--------------------------------------------------------------------------
513         * Rough summary of syntax
514         * -----------------------
515         * ELEMENTS:
516         * <: OPEN_TAG
517         * </: CLOSE_TAG
518         * ELEMENTS
519         *
520         * OPEN_TAG: expects a tag name
521         * FIELDS
522         * return
523         *
524         * CLOSE_TAG: expects a tag name (AT SAME LEVEL-it can only be one, I think)
525         * >: return
526         *
527         * FIELDS:
528         * >: return
529         * FIELD_NAME
530         * FIELDS
531         **/

532
Popular Tags