KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > DTDGenerator


1 import org.xml.sax.*;
2 import java.util.*;
3 import java.io.File JavaDoc;
4 import javax.xml.parsers.SAXParserFactory JavaDoc;
5
6 /**
7 * DTDGenerator<BR>
8 * Generates a possible DTD from an XML document instance.
9 * Pure SAX version of the Saxon DTDGenerator
10 * The program has no remaining dependencies on Saxon; all it needs is:
11 * JAXP 1.1
12 * SAX2
13 * A JAXP 1.1 conformant XML parser
14 * Java 1.2
15 * @author M.H.Kay
16 * @version 7.0: separated from Saxon source, now works with any JAXP 1.1 XML parser
17 */

18
19 public class DTDGenerator extends org.xml.sax.helpers.DefaultHandler JavaDoc {
20                                 // DTDSAXGen is a ContentHandler, created for convenience
21
// by extending the default handler that comes with SAX2
22

23     protected static int MIN_ENUMERATION_INSTANCES = 10;
24                                 // minimum number of appearances of an attribute for
25
// it to be considered a candidate for an enumeration type
26

27     protected static int MAX_ENUMERATION_VALUES = 20;
28                                 // maximum number of distinct attribute values to be
29
// included in an enumeration
30

31     protected static int MIN_ENUMERATION_RATIO = 3;
32                                 // an attribute will be regarded as an enumeration attribute
33
// only if the number of instances divided by the number of
34
// distinct values is >= this ratio
35

36     protected static int MIN_FIXED = 5;
37                                 // minimum number of attributes that must appear, with
38
// the same value each time, for the value to be regarded
39
// as FIXED
40

41     protected static int MIN_ID_VALUES = 10;
42                                 // minumum number of attribute values that must appear
43
// for the attribute to be regarded as an ID value
44

45     protected static int MAX_ID_VALUES = 100000;
46                                 // maximum number of attribute values to be saved
47
// while checking for uniqueness
48

49     TreeMap elementList; // alphabetical list of element types appearing in the document;
50
// each has the element name as a key and an ElementDetails object
51
// as the value
52

53     Stack elementStack; // stack of elements currently open; each entry is a StackEntry
54
// object
55

56     /**
57     * Entry point
58     * Usage: java DTDSAXGen input-file >output-file
59     */

60
61     public static void main (String JavaDoc args[]) throws java.lang.Exception JavaDoc
62     {
63                 // Check the command-line arguments.
64
if (args.length != 1) {
65             System.err.println("Usage: java DTDSAXGen input-file >output-file");
66             System.exit(1);
67         }
68
69                 // Instantiate and run the application
70
DTDGenerator app = new DTDGenerator();
71
72         app.run(args[0]);
73         app.printDTD();
74     }
75
76     public DTDGenerator ()
77     {
78         elementList = new TreeMap();
79         elementStack = new Stack();
80     }
81
82     private void run(String JavaDoc filename) {
83         try {
84             InputSource is = new InputSource(new File JavaDoc(filename).toURL().toString());
85             XMLReader parser = SAXParserFactory.newInstance().newSAXParser().getXMLReader();
86             parser.setContentHandler(this);
87             parser.parse(is);
88         } catch (java.io.FileNotFoundException JavaDoc nf) {
89             System.err.println("File " + filename + " not found");
90         } catch (Exception JavaDoc err) {
91             System.err.println("Failed while parsing source file");
92             System.err.println(err.getMessage());
93             err.printStackTrace();
94             System.exit(2);
95         }
96     }
97         
98
99     /**
100     * Test whether a string is an XML name.
101     * TODO: This is currently an incomplete test, it treats all non-ASCII characters
102     * as being valid in names.
103     */

104
105     private boolean isValidName(String JavaDoc s) {
106         if (!isValidNMTOKEN(s)) return false;
107         int c = s.charAt(0);
108         return ! ((c>=0x30 && c<=0x39) || c=='.' || c=='-' );
109     }
110
111     /**
112     * Test whether a string is an XML NMTOKEN.
113     * TODO: This is currently an incomplete test, it treats all non-ASCII characters
114     * as being valid in NMTOKENs.
115     */

116
117     private boolean isValidNMTOKEN(String JavaDoc s) {
118         if (s.length()==0) return false;
119         for (int i=0; i<s.length(); i++) {
120             int c = s.charAt(i);
121             if (!( (c>=0x41 && c<=0x5a) ||
122                    (c>=0x61 && c<=0x7a) ||
123                    (c>=0x30 && c<=0x39) ||
124                     c=='.' ||
125                     c=='_' ||
126                     c=='-' ||
127                     c==':' ||
128                     c>128 ))
129                 return false;
130         }
131         return true;
132     }
133   
134     /**
135     * When the whole document has been analysed, construct the DTD
136     */

137     
138     private void printDTD ()
139     {
140         // process the element types encountered, in turn
141

142         Iterator e=elementList.keySet().iterator();
143         while ( e.hasNext() )
144         {
145             String JavaDoc elementname = (String JavaDoc) e.next();
146             ElementDetails ed = (ElementDetails) elementList.get(elementname);
147             TreeMap children = ed.children;
148             Set childKeys = children.keySet();
149
150             //EMPTY content
151
if (childKeys.size()==0 && !ed.hasCharacterContent)
152                 System.out.print("<!ELEMENT " + elementname + " EMPTY >\n");
153
154             //CHARACTER content
155
if (childKeys.size()==0 && ed.hasCharacterContent)
156                 System.out.print("<!ELEMENT " + elementname + " ( #PCDATA ) >\n");
157
158             //ELEMENT content
159
if (childKeys.size()>0 && !ed.hasCharacterContent) {
160                 System.out.print("<!ELEMENT " + elementname + " ( ");
161
162                 if (ed.sequenced) {
163                     
164                     // all elements of this type have the same child elements
165
// in the same sequence, retained in the childseq vector
166

167                     Enumeration c = ed.childseq.elements();
168                     while (true) {
169                         ChildDetails ch = (ChildDetails)c.nextElement();
170                         System.out.print(ch.name);
171                         if (ch.repeatable && !ch.optional)
172                             System.out.print("+");
173                         if (ch.repeatable && ch.optional)
174                             System.out.print("*");
175                         if (ch.optional && !ch.repeatable)
176                             System.out.print("?");
177                         if (c.hasMoreElements())
178                             System.out.print(", ");
179                         else
180                             break;
181                     }
182                     System.out.print(" ) >\n");
183                 }
184                 else {
185                     
186                     // the children don't always appear in the same sequence; so
187
// list them alphabetically and allow them to be in any order
188

189                     Iterator c1 = childKeys.iterator();
190                     while (c1.hasNext()) {
191                         System.out.print((String JavaDoc)c1.next());
192                         if (c1.hasNext()) System.out.print(" | ");
193                     }
194                     System.out.print(" )* >\n");
195                 }
196             };
197
198             //MIXED content
199
if (childKeys.size()>0 && ed.hasCharacterContent) {
200                 System.out.print("<!ELEMENT " + elementname + " ( #PCDATA");
201                 Iterator c2 = childKeys.iterator();
202                 while (c2.hasNext()) {
203                     System.out.print(" | " + (String JavaDoc)c2.next());
204                 }
205                 System.out.print(" )* >\n");
206             };
207
208             // Now examine the attributes encountered for this element type
209

210             TreeMap attlist = ed.attributes;
211             boolean doneID = false; // to ensure we have at most one ID attribute per element
212
Iterator a=attlist.keySet().iterator();
213             while ( a.hasNext() )
214             {
215                 String JavaDoc attname = (String JavaDoc) a.next();
216                 AttributeDetails ad = (AttributeDetails) attlist.get(attname);
217
218                 // If the attribute is present on every instance of the element, treat it as required
219
boolean required = (ad.occurrences==ed.occurrences);
220
221                 // If every value of the attribute is distinct,
222
// and there are at least MIN_ID_VALUES, treat it as an ID
223
// TODO: this may give the wrong answer, we should check whether the value sets of two
224
// candidate-ID attributes overlap, in which case they can't both be IDs !!)
225
boolean isid = ad.allNames && // ID values must be Names
226
(!doneID) && // Only allowed one ID attribute per element type
227
(ad.unique) &&
228                                 (ad.occurrences>=MIN_ID_VALUES);
229
230                 // if there is only one attribute value, and at least MIN_FIXED occurrences of it,
231
// treat it as FIXED
232
boolean isfixed = required && ad.values.size()==1 && ad.occurrences >= MIN_FIXED;
233
234                 // if the number of distinct values is small compared with the number of occurrences,
235
// treat it as an enumeration
236
boolean isenum = ad.allNMTOKENs && // Enumeration values must be NMTOKENs
237
(ad.occurrences>=MIN_ENUMERATION_INSTANCES) &&
238                                 (ad.values.size()<=ad.occurrences/MIN_ENUMERATION_RATIO) &&
239                                 (ad.values.size()<=MAX_ENUMERATION_VALUES);
240
241                 System.out.print("<!ATTLIST " + elementname + " " + attname + " ");
242                 String JavaDoc tokentype = (ad.allNMTOKENs ? "NMTOKEN" : "CDATA");
243                 
244                 if (isid) {
245                     System.out.print("ID");
246                     doneID = true;
247                 }
248                 else if (isfixed) {
249                     String JavaDoc val = (String JavaDoc) ad.values.first();
250                     System.out.print(tokentype + " #FIXED \"" + escape(val) + "\" >\n");
251                 }
252                 else if (isenum) {
253                     System.out.print("( ");
254                     Iterator v = ad.values.iterator();
255                     while (v.hasNext()) {
256                         System.out.print((String JavaDoc) v.next());
257                         if (!v.hasNext()) break;
258                         System.out.print(" | ");
259                     };
260                     System.out.print(" )");
261                 }
262                 else
263                     System.out.print(tokentype);
264
265                 if (!isfixed) {
266                     if (required)
267                         System.out.print(" #REQUIRED >\n");
268                     else
269                         System.out.print(" #IMPLIED >\n");
270                 }
271             };
272             System.out.print("\n");
273         };
274    
275     }
276     
277
278     /**
279     * Escape special characters for display.
280     * @param ch The character array containing the string
281     * @param start The start position of the input string within the character array
282     * @param length The length of the input string within the character array
283     * @return The XML/HTML representation of the string<br>
284     * This static method converts a Unicode string to a string containing
285     * only ASCII characters, in which non-ASCII characters are represented
286     * by the usual XML/HTML escape conventions (for example, "&lt;" becomes "&amp;lt;").
287     * Note: if the input consists solely of ASCII or Latin-1 characters,
288     * the output will be equally valid in XML and HTML. Otherwise it will be valid
289     * only in XML.
290     * The escaped characters are written to the dest array starting at position 0; the
291     * number of positions used is returned as the result
292     */

293     
294     private static int escape(char ch[], int start, int length, char[] out)
295     {
296         int o = 0;
297         for (int i = start; i < start+length; i++) {
298             if (ch[i]=='<') {("&lt;").getChars(0,4,out,o); o+=4;}
299             else if (ch[i]=='>') {("&gt;").getChars(0,4,out,o); o+=4;}
300             else if (ch[i]=='&') {("&amp;").getChars(0,5,out,o); o+=5;}
301             else if (ch[i]=='\"') {("&#34;").getChars(0,5,out,o); o+=5;}
302             else if (ch[i]=='\'') {("&#39;").getChars(0,5,out,o); o+=5;}
303             else if (ch[i]<=0x7f) {out[o++]=ch[i];}
304             else {
305                 String JavaDoc dec = "&#" + Integer.toString((int)ch[i]) + ';';
306                 dec.getChars(0, dec.length(), out, o);
307                 o+=dec.length();
308             }
309         }
310         return o;
311     }
312
313     /**
314     * Escape special characters in a String value.
315     * @param in The input string
316     * @return The XML representation of the string<br>
317     * This static method converts a Unicode string to a string containing
318     * only ASCII characters, in which non-ASCII characters are represented
319     * by the usual XML/HTML escape conventions (for example, "&lt;" becomes
320     * "&amp;lt;").<br>
321     * Note: if the input consists solely of ASCII or Latin-1 characters,
322     * the output will be equally valid in XML and HTML. Otherwise it will be valid
323     * only in XML.
324     */

325     
326     private static String JavaDoc escape(String JavaDoc in)
327     {
328         char[] dest = new char[in.length()*8];
329         int newlen = escape( in.toCharArray(), 0, in.length(), dest);
330         return new String JavaDoc(dest, 0, newlen);
331     }
332        
333     /**
334     * Handle the start of an element. Record information about the position of this
335     * element relative to its parent, and about the attributes of the element.
336     */

337     
338     public void startElement (String JavaDoc uri, String JavaDoc localName, String JavaDoc name, Attributes attributes)
339     throws SAXException
340     {
341         StackEntry se = new StackEntry();
342
343         // create an entry in the Element List, or locate the existing entry
344
ElementDetails ed = (ElementDetails) elementList.get(name);
345         if (ed==null) {
346             ed = new ElementDetails(name);
347             elementList.put(name,ed);
348         };
349
350         // retain the associated element details object
351
se.elementDetails = ed;
352
353         // initialise sequence numbering of child element types
354
se.sequenceNumber = -1;
355         
356         // count occurrences of this element type
357
ed.occurrences++;
358
359         // Handle the attributes accumulated for this element.
360
// Merge the new attribute list into the existing list for the element
361

362         for (int a=0; a<attributes.getLength(); a++) {
363             String JavaDoc attName = attributes.getQName(a);
364             String JavaDoc val = attributes.getValue(a);
365  
366             AttributeDetails ad = (AttributeDetails) ed.attributes.get(attName);
367             if (ad==null) {
368                ad=new AttributeDetails(attName);
369                ed.attributes.put(attName, ad);
370             };
371             
372             if (!ad.values.contains(val)) {
373                 
374                 // We haven't seen this attribute value before
375

376                 ad.values.add(val);
377                 
378                 // Check if attribute value is a valid name
379
if (ad.allNames && !isValidName(val)) {
380                     ad.allNames = false;
381                 }
382                 
383                 // Check if attribute value is a valid NMTOKEN
384
if (ad.allNMTOKENs && !isValidNMTOKEN(val)) {
385                     ad.allNMTOKENs = false;
386                 }
387
388                 // For economy, don't save the new value unless it's needed;
389
// it's needed only if we're looking for ID values or enumerated values
390

391                 if (ad.unique && ad.allNames && ad.occurrences <= MAX_ID_VALUES) {
392                     ad.values.add(val);
393                 } else if (ad.values.size() <= MAX_ENUMERATION_VALUES) {
394                     ad.values.add(val);
395                 }
396                 
397             } else {
398                 // We've seen this attribute value before
399
ad.unique = false;
400             }
401             ad.occurrences++;
402         };
403
404         // now keep track of the nesting and sequencing of child elements
405
if (!elementStack.isEmpty()) {
406             StackEntry parent = (StackEntry)elementStack.peek();
407             ElementDetails parentDetails = parent.elementDetails;
408             int seq = parent.sequenceNumber;
409
410             // for sequencing, we're interested in consecutive groups of the same child element type
411
boolean isFirstInGroup = (parent.latestChild==null || (!parent.latestChild.equals(name)));
412             if (isFirstInGroup) {
413                 seq++;
414                 parent.sequenceNumber++;
415             }
416             parent.latestChild = name;
417
418             // if we've seen this child of this parent before, get the details
419
TreeMap children = parentDetails.children;
420             ChildDetails c = (ChildDetails)children.get(name);
421             if (c==null) {
422                 // this is the first time we've seen this child belonging to this parent
423
c = new ChildDetails();
424                 c.name = name;
425                 c.position = seq;
426                 c.repeatable = false;
427                 c.optional = false;
428                 children.put(name, c);
429                 parentDetails.childseq.addElement(c);
430
431                 // if the first time we see this child is not on the first instance of the parent,
432
// then we allow it as an optional element
433
if (parentDetails.occurrences!=1) {
434                     c.optional = true;
435                 }
436
437             } else {
438
439                 // if it's the first occurrence of the parent element, and we've seen this
440
// child before, and it's the first of a new group, then the child occurrences are
441
// not consecutive
442
if (parentDetails.occurrences==1 && isFirstInGroup) {
443                     parentDetails.sequenced = false;
444                 }
445                 
446                 // check whether the position of this group of children in this parent element is
447
// the same as its position in previous instances of the parent.
448
if (parentDetails.childseq.size()<=seq ||
449                         !((ChildDetails)parentDetails.childseq.elementAt(seq)).name.equals(name))
450                 {
451                     parentDetails.sequenced = false;
452                 }
453             }
454
455             // if there's more than one child element, mark it as repeatable
456
if (!isFirstInGroup) {
457                 c.repeatable = true;
458             }
459         }
460         elementStack.push(se);
461     }
462
463     /**
464     * End of element. If sequenced, check that all expected children are accounted for.
465     */

466
467     public void endElement (String JavaDoc uri, String JavaDoc localName, String JavaDoc name)
468     throws SAXException
469     {
470
471         // If the number of child element groups in this parent element is less than the
472
// number in previous elements, then the absent children are marked as optional
473
ElementDetails ed = (ElementDetails) elementList.get(name);
474         if (ed.sequenced) {
475             StackEntry se = (StackEntry)elementStack.peek();
476             int seq = se.sequenceNumber;
477             for (int i=seq+1; i<ed.childseq.size(); i++) {
478                 ((ChildDetails)ed.childseq.elementAt(i)).optional = true;
479             }
480         }
481         elementStack.pop();
482     }
483     
484     /**
485     * Handle character data.
486     * Make a note whether significant character data is found in the element
487     */

488
489     public void characters (char ch[], int start, int length)
490     throws SAXException
491     {
492         ElementDetails ed = ((StackEntry)elementStack.peek()).elementDetails;
493         if (!ed.hasCharacterContent) {
494             for (int i=start; i<start+length; i++) {
495                 if ((int)ch[i] > 0x20) {
496                     ed.hasCharacterContent = true;
497                     break;
498                 }
499             }
500         }
501     }
502
503     /**
504     * ElementDetails is a data structure to keep information about element types
505     */

506
507     private class ElementDetails {
508         String JavaDoc name;
509         int occurrences;
510         boolean hasCharacterContent;
511         boolean sequenced;
512         TreeMap children;
513         Vector childseq;
514         TreeMap attributes;
515
516         public ElementDetails ( String JavaDoc name ) {
517             this.name = name;
518             this.occurrences = 0;
519             this.hasCharacterContent = false;
520             this.sequenced = true;
521             this.children = new TreeMap();
522             this.childseq = new Vector();
523             this.attributes = new TreeMap();
524         }
525     }
526
527     /**
528     * ChildDetails records information about the presence of a child element within its
529     * parent element. If the parent element is sequenced, then the child elements always
530     * occur in sequence with the given frequency.
531     */

532
533     private class ChildDetails {
534         String JavaDoc name;
535         int position;
536         boolean repeatable;
537         boolean optional;
538     }
539     
540
541     /**
542     * AttributeDetails is a data structure to keep information about attribute types
543     */

544
545     private class AttributeDetails {
546         String JavaDoc name; // name of the attribute
547
int occurrences; // number of occurrences of the attribute
548
boolean unique; // true if no duplicate values encountered
549
TreeSet values; // set of all distinct values encountered for this attribute
550
boolean allNames; // true if all the attribute values are valid names
551
boolean allNMTOKENs; // true if all the attribute values are valid NMTOKENs
552

553         public AttributeDetails ( String JavaDoc name ) {
554             this.name = name;
555             this.occurrences = 0;
556             this.unique = true;
557             this.values = new TreeSet();
558             this.allNames = true;
559             this.allNMTOKENs = true;
560         }
561     }
562     
563     /**
564     * StackEntry is a data structure we put on the stack for each nested element
565     */

566     
567     private class StackEntry {
568         ElementDetails elementDetails;
569         int sequenceNumber;
570         String JavaDoc latestChild;
571     }
572
573
574 } // end of outer class DTDSAXGen
575

576
Popular Tags