KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > nu > xom > Builder


1 /* Copyright 2002-2004 Elliotte Rusty Harold
2    
3    This library is free software; you can redistribute it and/or modify
4    it under the terms of version 2.1 of the GNU Lesser General Public
5    License as published by the Free Software Foundation.
6    
7    This library is distributed in the hope that it will be useful,
8    but WITHOUT ANY WARRANTY; without even the implied warranty of
9    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10    GNU Lesser General Public License for more details.
11    
12    You should have received a copy of the GNU Lesser General Public
13    License along with this library; if not, write to the
14    Free Software Foundation, Inc., 59 Temple Place, Suite 330,
15    Boston, MA 02111-1307 USA
16    
17    You can contact Elliotte Rusty Harold by sending e-mail to
18    elharo@metalab.unc.edu. Please include the word "XOM" in the
19    subject line. The XOM home page is located at http://www.xom.nu/
20 */

21
22 package nu.xom;
23
24 import java.io.CharConversionException JavaDoc;
25 import java.io.File JavaDoc;
26 import java.io.FileInputStream JavaDoc;
27 import java.io.IOException JavaDoc;
28 import java.io.InputStream JavaDoc;
29 import java.io.Reader JavaDoc;
30 import java.io.StringReader JavaDoc;
31 import java.io.UTFDataFormatException JavaDoc;
32 import java.net.MalformedURLException JavaDoc;
33 import java.net.URL JavaDoc;
34
35 import org.xml.sax.ErrorHandler JavaDoc;
36 import org.xml.sax.InputSource JavaDoc;
37 import org.xml.sax.SAXException JavaDoc;
38 import org.xml.sax.SAXNotRecognizedException JavaDoc;
39 import org.xml.sax.SAXNotSupportedException JavaDoc;
40 import org.xml.sax.SAXParseException JavaDoc;
41 import org.xml.sax.XMLFilter JavaDoc;
42 import org.xml.sax.XMLReader JavaDoc;
43 import org.xml.sax.helpers.XMLReaderFactory JavaDoc;
44
45 import org.apache.xerces.impl.Version;
46
47 /**
48  * <p>
49  * This class is responsible for creating XOM <code>Document</code>
50  * objects from a URL, file, string, or input stream by reading
51  * an XML document. A SAX parser is used to read the
52  * document and report any well-formedness errors.
53  * </p>
54  *
55  * @author Elliotte Rusty Harold
56  * @version 1.0
57  *
58  */

59 public class Builder {
60
61     
62     private XMLReader JavaDoc parser;
63     private NodeFactory factory;
64     
65     private static double xercesVersion = 2.6;
66     
67     static {
68
69         try {
70             String JavaDoc versionString = Version.getVersion();
71             versionString = versionString.substring(9, 12);
72             xercesVersion = Double.valueOf(versionString).doubleValue();
73         }
74         catch (Exception JavaDoc ex) {
75             // The version string format changed so presumably it's
76
// 2.6 or later
77
}
78         catch (Error JavaDoc err) {
79             // Xerces not installed, so none of this matters
80
}
81         
82     }
83     
84     
85     /**
86      * <p>
87      * Creates a <code>Builder</code> that uses the default node
88      * factory and chooses among any available SAX2 parsers.
89      * In order of preference, it looks for:
90      * </p>
91      *
92      * <ol>
93      * <li>Xerces 2.x (a.k.a. IBM XML parser for Java)</li>
94      * <li>GNU &AElig;lfred</li>
95      * <li>Crimson</li>
96      * <li>Piccolo</li>
97      * <li>Oracle</li>
98      * <li>XP</li>
99      * <li>Saxon's &AElig;lfred</li>
100      * <li>dom4j's &AElig;lfred</li>
101      * <li>The platform default specified by the
102      * <code>org.xml.sax.driver</code> system property</li>
103      * </ol>
104      *
105      * <p>
106      * Parsers must implicitly or explicitly support the
107      * http://xml.org/sax/features/external-general-entities
108      * and
109      * http://xml.org/sax/features/external-parameter-entities
110      * features XOM requires. Parsers that don't are rejected
111      * automatically.
112      * </p>
113      *
114      * @throws XMLException if no satisfactory parser is
115      * installed in the local class path
116      */

117     public Builder() {
118         this(false);
119     }
120     
121     
122     /**
123      * <p>
124      * Creates a <code>Builder</code> based on an optionally validating
125      * parser. If the <code>validate</code> argument
126      * is true, then a validity error while
127      * parsing will cause a fatal error; that is,
128      * it will throw a <code>ValidityException</code>.
129      * </p>
130      *
131      * @param validate true if the parser should
132      * validate the document while parsing
133      *
134      * @throws XMLException if no satisfactory parser
135      * is installed in the local class path
136      */

137     public Builder(boolean validate) {
138         this(findParser(validate), validate, null);
139     }
140
141     
142     /**
143      * <p>
144      * Creates a <code>Builder</code> based on an optionally
145      * validating parser that builds node objects with the supplied
146      * factory. If the <code>validate</code> argument is true, then
147      * a validity error while parsing will cause a fatal error; that
148      * is, it will throw a <code>ValidityException</code>.
149      * </p>
150      *
151      * @param validate true if the parser should
152      * validate the document while parsing
153      * @param factory the <code>NodeFactory</code> that creates
154      * the node objects for this <code>Builder</code>
155      *
156      * @throws XMLException if no satisfactory parser
157      * is installed in the local class path
158      */

159     public Builder(boolean validate, NodeFactory factory) {
160         this(findParser(validate), validate, factory);
161     }
162
163     
164     // These are stored in the order of preference.
165
private static String JavaDoc[] parsers = {
166         "nu.xom.XML1_0Parser",
167         "nu.xom.JDK15XML1_0Parser",
168         "org.apache.xerces.parsers.SAXParser",
169         "com.sun.org.apache.xerces.internal.parsers.SAXParser",
170         "gnu.xml.aelfred2.XmlReader",
171         "org.apache.crimson.parser.XMLReaderImpl",
172         "com.bluecast.xml.Piccolo",
173         "oracle.xml.parser.v2.SAXParser",
174         "com.jclark.xml.sax.SAX2Driver",
175         "net.sf.saxon.aelfred.SAXDriver",
176         "com.icl.saxon.aelfred.SAXDriver",
177         "org.dom4j.io.aelfred2.SAXDriver",
178         "org.dom4j.io.aelfred.SAXDriver"
179     };
180
181     
182     private static XMLReader JavaDoc findParser(boolean validate) {
183         
184         // first look for Xerces; we only trust Xerces if
185
// we set it up; and we need to configure it specially
186
// so we can't load it with the XMLReaderFactory
187
XMLReader JavaDoc parser;
188         try {
189             parser = new XML1_0Parser();
190             setupParser(parser, validate);
191             return parser;
192         }
193         catch (SAXException JavaDoc ex) {
194             // look for next one
195
}
196         catch (NoClassDefFoundError JavaDoc err) {
197             // Xerces is not available; look for next one
198
}
199
200         try {
201             parser = (XMLReader JavaDoc) Class.forName(
202               "nu.xom.JDK15XML1_0Parser").newInstance();
203             setupParser(parser, validate);
204             return parser;
205         }
206         catch (SAXException JavaDoc ex) {
207             // look for next one
208
}
209         catch (InstantiationException JavaDoc ex) {
210             // look for next one
211
}
212         catch (ClassNotFoundException JavaDoc ex) {
213             // look for next one
214
}
215         catch (IllegalAccessException JavaDoc ex) {
216             // look for next one
217
}
218         catch (NoClassDefFoundError JavaDoc err) {
219             // Xerces is not available; look for next one
220
}
221         
222         // XMLReaderFactory.createXMLReader never returns
223
// null. If it can't locate the parser, it throws
224
// a SAXException.
225
for (int i = 2; i < parsers.length; i++) {
226             try {
227                 parser = XMLReaderFactory.createXMLReader(parsers[i]);
228                 setupParser(parser, validate);
229                 return parser;
230             }
231             catch (SAXException JavaDoc ex) {
232                 // try the next one
233
}
234             catch (NoClassDefFoundError JavaDoc err) {
235                 // try the next one
236
}
237         }
238         
239         try { // default
240
parser = XMLReaderFactory.createXMLReader();
241             setupParser(parser, validate);
242             return parser;
243         }
244         catch (SAXException JavaDoc ex) {
245             throw new XMLException(
246               "Could not find a suitable SAX2 parser", ex);
247         }
248         
249     }
250
251
252     private static void setupParser(XMLReader JavaDoc parser, boolean validate)
253       throws SAXNotRecognizedException JavaDoc, SAXNotSupportedException JavaDoc {
254         
255         XMLReader JavaDoc baseParser = parser;
256         while (baseParser instanceof XMLFilter JavaDoc) {
257             baseParser = ((XMLFilter JavaDoc) baseParser).getParent();
258         }
259         String JavaDoc parserName = baseParser.getClass().getName();
260         if (!validate) {
261             parser.setFeature(
262               "http://xml.org/sax/features/namespace-prefixes", true);
263             if (parserName.equals( // Crimson workaround
264
"org.apache.crimson.parser.XMLReaderImpl")) {
265                 parser.setErrorHandler(
266                   new NamespaceWellformednessRequired()
267                 );
268             }
269             else {
270                 parser.setFeature(
271                   "http://xml.org/sax/features/external-general-entities",
272                   true
273                 );
274                 parser.setFeature(
275                  "http://xml.org/sax/features/external-parameter-entities",
276                   true
277                 );
278             }
279         }
280         else {
281             parser.setFeature(
282               "http://xml.org/sax/features/namespace-prefixes", true);
283             parser.setFeature(
284               "http://xml.org/sax/features/validation", true);
285             parser.setErrorHandler(new ValidityRequired());
286         }
287         
288         try {
289             parser.setFeature(
290               "http://xml.org/sax/features/string-interning", true);
291         }
292         catch (SAXException JavaDoc ex) {
293             // This parser does not support string interning.
294
// We can live without that.
295
}
296         
297         // A couple of Xerces specific properties
298
if (parserName.equals("nu.xom.XML1_0Parser")
299          || parserName.equals("nu.xom.JDK15XML1_0Parser")
300          || parserName.equals("org.apache.xerces.parsers.SAXParser")
301          || parserName.equals("com.sun.org.apache.xerces.internal.parsers.SAXParser")) {
302             try {
303                 parser.setFeature(
304                  "http://apache.org/xml/features/allow-java-encodings", true);
305             }
306             catch (SAXException JavaDoc ex) {
307                 // Possibly an earlier version of Xerces; no big deal.
308
// We can live without this feature.
309
}
310             // See http://nagoya.apache.org/bugzilla/show_bug.cgi?id=23768
311
// if you care to know why this line breaks unit tests on
312
// versions of Xerces prior to 2.6.1
313
try {
314                 parser.setFeature(
315                  "http://apache.org/xml/features/standard-uri-conformant",
316                  true);
317             }
318             catch (SAXException JavaDoc ex) {
319                 // Possibly an earlier version of Xerces, or a
320
// or a non-Xerces parser; no big deal.
321
// We can live without this.
322
}
323         }
324         
325     }
326     
327     
328     /**
329      * <p>
330      * Creates a <code>Builder</code> that uses
331      * the specified SAX <code>XMLReader</code>.
332      * Custom SAX features and properties such as
333      * schema validation can be set on this <code>XMLReader</code>
334      * before passing it to this method.
335      * </p>
336      *
337      * @param parser the SAX2 <code>XMLReader</code> that
338      * parses the document
339      *
340      * @throws XMLException if <code>parser</code> does not support the
341      * features XOM requires
342      */

343     public Builder(XMLReader JavaDoc parser) {
344         this(parser, false);
345     }
346     
347     
348     /**
349      * <p>
350      * Creates a <code>Builder</code> that uses
351      * the specified <code>NodeFactory</code> to create
352      * node objects.
353      * </p>
354      *
355      * @param factory the <code>NodeFactory</code> that creates
356      * the node objects for this <code>Builder</code>
357      *
358      * @throws XMLException if no satisfactory parser is
359      * installed in the local class path
360      */

361     public Builder(NodeFactory factory) {
362         this(findParser(false), false, factory);
363     }
364     
365
366     /**
367      * <p>
368      * Creates a optionally validating <code>Builder</code> based
369      * on the specified parser object. Custom SAX features and
370      * properties such as schema validation can be set on this
371      * <code>XMLReader</code> before passing it to this method.
372      * </p>
373      *
374      * <p>
375      * If the validate argument is true, then a validity error
376      * while parsing will cause a fatal error; that is, it
377      * will throw a <code>ParsingException</code>
378      * </p>
379      *
380      * @param parser the SAX2 <code>XMLReader</code> that parses
381      * the document
382      * @param validate true if the parser should validate
383      * the document while parsing
384      *
385      */

386     public Builder(XMLReader JavaDoc parser, boolean validate) {
387         this(parser, validate, null);
388     }
389     
390     
391     /**
392      * <p>
393      * Creates an optionally validating <code>Builder</code> that reads
394      * data from the specified parser object and constructs new nodes
395      * using the specified factory object. Custom SAX features and
396      * properties such as schema validation can be set on this
397      * <code>XMLReader</code> before passing it to this method.
398      * </p>
399      *
400      * <p>
401      * If the <code>validate</code> argument is true, then a validity
402      * error while parsing will throw a <code>ParsingException</code>.
403      * </p>
404      *
405      * @param parser the SAX2 <code>XMLReader</code> that parses
406      * the document
407      * @param validate true if the parser should validate the
408      * document while parsing
409      * @param factory the <code>NodeFactory</code>
410      * this builder uses to create objects in the tree
411      *
412      * @throws XMLException if <code>parser</code> does not support
413      * the features XOM requires
414      *
415      */

416     public Builder(
417       XMLReader JavaDoc parser, boolean validate, NodeFactory factory) {
418                   
419         try {
420             setupParser(parser, validate);
421         }
422         catch (SAXException JavaDoc ex) {
423             if (validate) {
424                 throw new XMLException(parser.getClass().getName()
425                   + " does not support validation.", ex);
426             }
427             else {
428                 throw new XMLException(parser.getClass().getName()
429                   + " does not support the entity resolution"
430                   + " features XOM requires.", ex);
431             }
432         }
433         
434         // setup the handlers
435
this.parser = parser;
436         this.factory = factory;
437         setHandlers();
438
439     }
440     
441     
442     private static boolean knownGoodParser(XMLReader JavaDoc parser) {
443          
444         String JavaDoc parserName = parser.getClass().getName();
445         
446         // In general, a filter may violate the constraints of XML 1.0.
447
// However, I specifically trust Norm Walsh not to do that, so
448
// if his filters are being used we look at the parent instead.
449
if (parserName.equals("org.apache.xml.resolver.tools.ResolvingXMLReader")
450           || parserName.equals("org.apache.xml.resolver.tools.ResolvingXMLFilter")) {
451             XMLFilter JavaDoc filter = (XMLFilter JavaDoc) parser;
452             parserName = filter.getParent().getClass().getName();
453         }
454         
455         // These parsers are known to not make all the checks
456
// they're supposed to. :-(
457
if (parserName.equals("gnu.xml.aelfred2.XmlReader")) return false;
458         if (parserName.equals("net.sf.saxon.aelfred.SAXDriver")) return false;
459         if (parserName.equals("com.icl.saxon.aelfred.SAXDriver")) return false;
460     
461         if (parserName.equals("org.apache.xerces.parsers.SAXParser")
462             && xercesVersion >= 2.4) {
463             return false;
464         }
465         
466         for (int i = 0; i < parsers.length; i++) {
467             if (parserName.equals(parsers[i])) return true;
468         }
469         return false;
470         
471     }
472
473
474     private void setHandlers() {
475         
476         XOMHandler handler;
477         if ((factory == null
478           || factory.getClass().getName().equals("nu.xom.NodeFactory"))
479           && knownGoodParser(parser)) {
480             // If no factory is supplied by user, don't
481
// return one
482
NodeFactory tempFactory = factory;
483             if (tempFactory == null) tempFactory = new NodeFactory();
484             handler = new NonVerifyingHandler(tempFactory);
485         }
486         else {
487             if (factory == null) factory = new NodeFactory();
488             handler = new XOMHandler(factory);
489         }
490         parser.setContentHandler(handler);
491         parser.setDTDHandler(handler);
492         
493         try {
494             parser.setProperty(
495               "http://xml.org/sax/properties/lexical-handler",
496               handler);
497         }
498         catch (SAXException JavaDoc ex) {
499             // This parser does not support lexical events.
500
// We can live without them, though it does mean
501
// there won't be any comments or a DOCTYPE declaration
502
// in the tree.
503
}
504                 
505         try {
506             parser.setProperty(
507               "http://xml.org/sax/properties/declaration-handler",
508               handler);
509         }
510         catch (SAXException JavaDoc ex) {
511             // This parser does not support declaration events.
512
// We can live without them, though it does mean
513
// they won't be any internal DTD subset.
514
}
515         
516     }
517     
518     
519     /**
520      * <p>
521      * Parses the document at the specified URL.
522      * </p>
523      *
524      * <p>
525      * Note that relative URLs generally do not work here, as
526      * there's no base to resolve them against. This includes
527      * relative URLs that point into the file system, though this
528      * is somewhat platform dependent. Furthermore, <code>file</code>
529      * URLs often only work when they adhere exactly to RFC 2396
530      * syntax. URLs that work in Internet Explorer often fail when
531      * used in Java. If you're reading XML from a file, more reliable
532      * results are obtained by using the <code>build</code> method
533      * that takes a <code>java.io.File</code> object as an argument.
534      * </p>
535      *
536      * @param systemID the URL (generally absolute)
537      * from which the document is read.
538      * The URL's scheme must be one supported by the Java VM.
539      *
540      * @return the parsed <code>Document</code>
541      *
542      * @throws ValidityException if a validity error is detected. This
543      * is only thrown if the builder has been instructed to validate.
544      * @throws ParsingException if a well-formedness error is detected
545      * @throws IOException if an I/O error such as a broken socket
546      * prevents the document from being fully read
547      */

548     public Document build(String JavaDoc systemID)
549       throws ParsingException, ValidityException, IOException JavaDoc {
550
551         systemID = canonicalizeURL(systemID);
552         InputSource JavaDoc source = new InputSource JavaDoc(systemID);
553         return build(source);
554         
555     }
556
557     
558     /**
559      * <p>
560      * Reads the document from an input stream.
561      * </p>
562      *
563      * @param in the input stream from which the document is read
564      *
565      * @return the parsed <code>Document</code>
566      *
567      * @throws ValidityException if a validity error is detected;
568      * only thrown if the builder has been instructed to validate
569      * @throws ParsingException if a well-formedness error is detected
570      * @throws IOException if an I/O error such as a broken
571      * socket prevents the document from being fully read.
572      */

573     public Document build(InputStream JavaDoc in)
574       throws ParsingException, ValidityException, IOException JavaDoc {
575
576         InputSource JavaDoc source = new InputSource JavaDoc(in);
577         return build(source);
578         
579     }
580
581
582     /**
583      * <p>
584      * Reads the document from an input stream while specifying
585      * a base URI (which need not be the stream's actual URI).
586      * </p>
587      *
588      * @param in the input stream from which the document is read
589      * @param baseURI the base URI for this document
590      *
591      * @return the parsed <code>Document</code>
592      *
593      * @throws ValidityException if a validity error is detected;
594      * only thrown if the builder has been instructed to validate
595      * @throws ParsingException if a well-formedness error is detected
596      * @throws IOException if an I/O error such as a broken
597      * socket prevents the document from being fully read
598      */

599     public Document build(InputStream JavaDoc in, String JavaDoc baseURI)
600       throws ParsingException, ValidityException, IOException JavaDoc {
601
602         baseURI = canonicalizeURL(baseURI);
603         InputSource JavaDoc source = new InputSource JavaDoc(in);
604         source.setSystemId(baseURI);
605         return build(source);
606         
607     }
608
609     
610     // Nasty hack to make sure we get the right form
611
// of file URLs on Windows
612
private static String JavaDoc fileURLPrefix = "file://";
613     private static boolean isWindows = false;
614     
615     static {
616         String JavaDoc os = System.getProperty("os.name", "Unix");
617         // I could do System.setProperty("os.name" "Windows") to test
618
// this, but I'd need to us ea frsh ClassLoader to rerun the
619
// static initializer block.
620
if (os.indexOf("Windows") >= 0) {
621             fileURLPrefix = "file:/";
622             isWindows = true;
623         }
624     }
625
626     
627     /**
628      * <p>
629      * Reads the document from a file.
630      * The base URI of the document is set to the
631      * location of the file.
632      * </p>
633      *
634      * @param in the file from which the document is read
635      *
636      * @return the parsed <code>Document</code>
637      *
638      * @throws ValidityException if a validity error is detected. This
639      * is only thrown if the builder has been instructed to validate.
640      * @throws ParsingException if a well-formedness error is detected
641      * @throws IOException if an I/O error such as a bad disk
642      * prevents the file from being read
643      */

644     public Document build(File JavaDoc in)
645       throws ParsingException, ValidityException, IOException JavaDoc {
646
647         InputStream JavaDoc fin = new FileInputStream JavaDoc(in);
648         // Java's toURL method doesn't properly escape file
649
// names so we have to do it manually
650
String JavaDoc absolute = in.getAbsolutePath();
651         StringBuffer JavaDoc url = new StringBuffer JavaDoc(fileURLPrefix);
652         int length = absolute.length();
653         char separatorChar = File.separatorChar;
654         for (int i = 0; i < length; i++) {
655             char c = absolute.charAt(i);
656             if (c == separatorChar) url.append('/');
657             else {
658                 switch(c) {
659                     case ' ':
660                         url.append("%20");
661                         break;
662                     case '!':
663                         url.append(c);
664                         break;
665                     case '"':
666                         url.append("%22");
667                         break;
668                     case '#':
669                         url.append("%23");
670                         break;
671                     case '$':
672                         url.append(c);
673                         break;
674                     case '%':
675                         url.append("%25");
676                         break;
677                     case '&':
678                         // ampersand does not need to be encoded in
679
// path part of URL
680
url.append('&');
681                         break;
682                     case '\'':
683                         url.append(c);
684                         break;
685                     case '(':
686                         url.append(c);
687                         break;
688                     case ')':
689                         url.append(c);
690                         break;
691                     case '*':
692                         url.append(c);
693                         break;
694                     case '+':
695                         url.append("%2B");
696                         break;
697                     case ',':
698                         url.append(c);
699                         break;
700                     case '-':
701                         url.append(c);
702                         break;
703                     case '.':
704                         url.append(c);
705                         break;
706                     case '/':
707                         url.append("%2F");
708                         break;
709                     case '0':
710                         url.append(c);
711                         break;
712                     case '1':
713                         url.append(c);
714                         break;
715                     case '2':
716                         url.append(c);
717                         break;
718                     case '3':
719                         url.append(c);
720                         break;
721                     case '4':
722                         url.append(c);
723                         break;
724                     case '5':
725                         url.append(c);
726                         break;
727                     case '6':
728                         url.append(c);
729                         break;
730                     case '7':
731                         url.append(c);
732                         break;
733                     case '8':
734                         url.append(c);
735                         break;
736                     case '9':
737                         url.append(c);
738                         break;
739                     case ':':
740                         url.append(c);
741                         break;
742                     case ';':
743                         url.append(c);
744                         break;
745                     case '<':
746                         url.append("%3C");
747                         break;
748                     case '=':
749                         url.append(c);
750                         break;
751                     case '>':
752                         url.append("%3E");
753                         break;
754                     case '?':
755                         url.append("%3F");
756                         break;
757                     case '@':
758                         url.append("%40");
759                         break;
760                     case 'A':
761                         url.append(c);
762                         break;
763                     case 'B':
764                         url.append(c);
765                         break;
766                     case 'C':
767                         url.append(c);
768                         break;
769                     case 'D':
770                         url.append(c);
771                         break;
772                     case 'E':
773                         url.append(c);
774                         break;
775                     case 'F':
776                         url.append(c);
777                         break;
778                     case 'G':
779                         url.append(c);
780                         break;
781                     case 'H':
782                         url.append(c);
783                         break;
784                     case 'I':
785                         url.append(c);
786                         break;
787                     case 'J':
788                         url.append(c);
789                         break;
790                     case 'K':
791                         url.append(c);
792                         break;
793                     case 'L':
794                         url.append(c);
795                         break;
796                     case 'M':
797                         url.append(c);
798                         break;
799                     case 'N':
800                         url.append(c);
801                         break;
802                     case 'O':
803                         url.append(c);
804                         break;
805                     case 'P':
806                         url.append(c);
807                         break;
808                     case 'Q':
809                         url.append(c);
810                         break;
811                     case 'R':
812                         url.append(c);
813                         break;
814                     case 'S':
815                         url.append(c);
816                         break;
817                     case 'T':
818                         url.append(c);
819                         break;
820                     case 'U':
821                         url.append(c);
822                         break;
823                     case 'V':
824                         url.append(c);
825                         break;
826                     case 'W':
827                         url.append(c);
828                         break;
829                     case 'X':
830                         url.append(c);
831                         break;
832                     case 'Y':
833                         url.append(c);
834                         break;
835                     case 'Z':
836                         url.append(c);
837                         break;
838                     case '[':
839                         url.append("%5B");
840                         break;
841                     case '\\':
842                         url.append("%5C");
843                         break;
844                     case ']':
845                         url.append("%5D");
846                         break;
847                     case '^':
848                         url.append("%5E");
849                         break;
850                     case '_':
851                         url.append(c);
852                         break;
853                     case '`':
854                         url.append("%60");
855                         break;
856                     case 'a':
857                         url.append(c);
858                         break;
859                     case 'b':
860                         url.append(c);
861                         break;
862                     case 'c':
863                         url.append(c);
864                         break;
865                     case 'd':
866                         url.append(c);
867                         break;
868                     case 'e':
869                         url.append(c);
870                         break;
871                     case 'f':
872                         url.append(c);
873                         break;
874                     case 'g':
875                         url.append(c);
876                         break;
877                     case 'h':
878                         url.append(c);
879                         break;
880                     case 'i':
881                         url.append(c);
882                         break;
883                     case 'j':
884                         url.append(c);
885                         break;
886                     case 'k':
887                         url.append(c);
888                         break;
889                     case 'l':
890                         url.append(c);
891                         break;
892                     case 'm':
893                         url.append(c);
894                         break;
895                     case 'n':
896                         url.append(c);
897                         break;
898                     case 'o':
899                         url.append(c);
900                         break;
901                     case 'p':
902                         url.append(c);
903                         break;
904                     case 'q':
905                         url.append(c);
906                         break;
907                     case 'r':
908                         url.append(c);
909                         break;
910                     case 's':
911                         url.append(c);
912                         break;
913                     case 't':
914                         url.append(c);
915                         break;
916                     case 'u':
917                         url.append(c);
918                         break;
919                     case 'v':
920                         url.append(c);
921                         break;
922                     case 'w':
923                         url.append(c);
924                         break;
925                     case 'x':
926                         url.append(c);
927                         break;
928                     case 'y':
929                         url.append(c);
930                         break;
931                     case 'z':
932                         url.append(c);
933                         break;
934                     case '{':
935                         url.append("%7B");
936                         break;
937                     case '|':
938                         url.append("%7C");
939                         break;
940                     case '}':
941                         url.append("%7D");
942                         break;
943                     case '~':
944                         url.append(c);
945                         break;
946                     default:
947                         if (c < 0xD800 || c > 0xDFFF) {
948                             url.append(URIUtil.percentEscape(c));
949                         }
950                         else if (c <= 0xDBFF) {
951                             // high surrogate; therefore we need to
952
// grab the next half before encoding
953
i++;
954                             try {
955                                 char low = absolute.charAt(i);
956                                 String JavaDoc character = String.valueOf(c)+String.valueOf(low);
957                                 byte[] data = character.getBytes("UTF8");
958                                 // Always exactly 4 bytes, unless the encoder is buggy
959
for (int j=0; j < 4; j++) {
960                                     url.append('%');
961                                     String JavaDoc hex = Integer.toHexString(data[j]).toUpperCase();
962                                     url.append(hex.substring(hex.length()-2));
963                                 }
964                             }
965                             catch (IndexOutOfBoundsException JavaDoc ex) {
966                                 // file name contains a high half and not a low half
967
url = new StringBuffer JavaDoc();
968                                 break;
969                             }
970                         }
971                         else {
972                             // low half not preceded by high half
973
// Can't create a base URI
974
url = new StringBuffer JavaDoc();
975                             break;
976                         }
977                 }
978             }
979         }
980         
981         String JavaDoc base = url.toString();
982         try {
983             Document doc = build(fin, base);
984             return doc;
985         }
986         finally {
987             fin.close();
988         }
989         
990     }
991
992     
993     /**
994      * <p>
995      * Reads the document from a reader.
996      * </p>
997      *
998      * @param in the reader from which the document is read
999      *
1000     * @return the parsed <code>Document</code>
1001     *
1002     * @throws ValidityException if a validity error is detected. This
1003     * is only thrown if the builder has been instructed to validate.
1004     * @throws ParsingException if a well-formedness error is detected
1005     * @throws IOException if an I/O error such as a bad disk
1006     * prevents the document from being fully read
1007     */

1008    public Document build(Reader JavaDoc in)
1009      throws ParsingException, ValidityException, IOException JavaDoc {
1010
1011        InputSource JavaDoc source = new InputSource JavaDoc(in);
1012        return build(source);
1013        
1014    }
1015
1016    
1017    /**
1018     * <p>
1019     * Reads the document from a character stream while
1020     * specifying a base URI.
1021     * </p>
1022     *
1023     * @param in the reader from which the document
1024     * is read
1025     * @param baseURI the base URI for this document
1026     *
1027     * @return the parsed <code>Document</code>
1028     *
1029     * @throws ValidityException if a validity error is detected. This
1030     * is only thrown if the builder has been instructed to
1031     * validate.
1032     * @throws ParsingException if a well-formedness error is detected
1033     * @throws IOException if an I/O error such as a bad disk
1034     * prevents the document from being completely read
1035     */

1036    public Document build(Reader JavaDoc in, String JavaDoc baseURI)
1037      throws ParsingException, ValidityException, IOException JavaDoc {
1038          
1039        baseURI = canonicalizeURL(baseURI);
1040        InputSource JavaDoc source = new InputSource JavaDoc(in);
1041        source.setSystemId(baseURI);
1042        return build(source);
1043        
1044    }
1045    
1046    
1047    /**
1048     * <p>
1049     * Reads the document from the contents of a string.
1050     * </p>
1051     *
1052     * @param document the string that contains
1053     * the XML document.
1054     * @param baseURI the base URI for this document
1055     *
1056     * @return the parsed <code>Document</code>
1057     *
1058     * @throws ValidityException if a validity error is detected. This
1059     * is only thrown if the builder has been instructed to
1060     * validate.
1061     * @throws ParsingException if a well-formedness error is detected
1062     * @throws IOException if an I/O error such as a bad disk
1063     * prevents the document's external DTD subset from being read
1064     */

1065    public Document build(String JavaDoc document, String JavaDoc baseURI)
1066      throws ParsingException, ValidityException, IOException JavaDoc {
1067
1068        Reader JavaDoc reader = new StringReader JavaDoc(document);
1069        return build(reader, baseURI);
1070        
1071    }
1072    
1073    // needed to work around a bug in Xerces and Crimson
1074
// for URLs with no trailing slashes (no path part)
1075
// such as http://www.cafeconleche.org
1076
private String JavaDoc canonicalizeURL(String JavaDoc uri) {
1077        
1078        try {
1079            URL JavaDoc u = new URL JavaDoc(uri);
1080            String JavaDoc path = u.getFile();
1081            if (path == null || path.length() == 0
1082              || ("/".equals(path) && !(uri.endsWith("/")))) {
1083                uri += '/';
1084            }
1085            return uri;
1086        }
1087        catch (MalformedURLException JavaDoc ex) {
1088            return uri;
1089        }
1090    }
1091    
1092    
1093    /**
1094     * <p>
1095     * Reads the document from a SAX <code>InputSource</code>.
1096     * </p>
1097     *
1098     * @param in the input source from
1099     * which the document is read.
1100     *
1101     * @return the parsed <code>Document</code>
1102     *
1103     * @throws ValidityException if a validity error is detected. This
1104     * is only thrown if the builder has been instructed to
1105     * validate.
1106     * @throws ParsingException if a well-formedness error is detected
1107     * @throws IOException if an I/O error such as a bad disk
1108     * prevents the document from being read
1109     */

1110    private Document build(InputSource JavaDoc in)
1111      throws ParsingException, ValidityException, IOException JavaDoc {
1112
1113        try {
1114            parser.parse(in);
1115        }
1116        catch (SAXParseException JavaDoc ex) {
1117            ParsingException pex = new ParsingException(
1118                ex.getMessage(),
1119                ex.getSystemId(),
1120                ex.getLineNumber(),
1121                ex.getColumnNumber(),
1122                ex.getException());
1123            throw pex;
1124        }
1125        catch (SAXException JavaDoc ex) {
1126            ParsingException pex
1127              = new ParsingException(ex.getMessage(), in.getSystemId(), ex);
1128            throw pex;
1129        }
1130        catch (XMLException ex) {
1131            throw new ParsingException(ex.getMessage(), ex);
1132        }
1133        catch (RuntimeException JavaDoc ex) {
1134            // Work-around for non-conformant parsers, especially Piccolo
1135
ParsingException pex
1136              = new ParsingException(ex.getMessage(), in.getSystemId(), ex);
1137            throw pex;
1138        }
1139        catch (UTFDataFormatException JavaDoc ex) {
1140            // Work-around for non-conformant parsers, especially Xerces
1141
// http://nagoya.apache.org/bugzilla/show_bug.cgi?id=27583
1142
ParsingException pex
1143              = new ParsingException(ex.getMessage(), in.getSystemId(), ex);
1144            throw pex;
1145        }
1146        catch (CharConversionException JavaDoc ex) {
1147            // Work-around for non-conformant parsers, especially Xerces
1148
// http://nagoya.apache.org/bugzilla/show_bug.cgi?id=27583
1149
ParsingException pex
1150              = new ParsingException(ex.getMessage(), in.getSystemId(), ex);
1151            throw pex;
1152        }
1153        catch (IOException JavaDoc ex) {
1154            // Work-around for Xerces; I don't want to just catch
1155
// org.apache.xerces.util.URI.MalformedURIException
1156
// because that would introduce a dependence on Xerces
1157
if (ex.getClass().getName().equals(
1158              "org.apache.xerces.util.URI$MalformedURIException")) {
1159                throw new ParsingException(ex.getMessage(), in.getSystemId(), ex);
1160            }
1161            else {
1162                throw ex;
1163            }
1164        }
1165        
1166        XOMHandler handler = (XOMHandler) parser.getContentHandler();
1167        ErrorHandler JavaDoc errorHandler = parser.getErrorHandler();
1168        Document result = handler.getDocument();
1169        if (result != null && "".equals(result.getBaseURI())) {
1170            result.setBaseURI(in.getSystemId());
1171        }
1172        
1173        if (errorHandler instanceof ValidityRequired) {
1174            ValidityRequired validityHandler
1175              = (ValidityRequired) errorHandler;
1176            if (!validityHandler.isValid()) {
1177                ValidityException vex = validityHandler.vexception;
1178                vex.setDocument(result);
1179                validityHandler.reset();
1180                throw vex;
1181            }
1182        }
1183        return result;
1184        
1185    }
1186    
1187    
1188    private static class ValidityRequired implements ErrorHandler JavaDoc {
1189
1190        ValidityException vexception = null;
1191
1192        void reset() {
1193            vexception = null;
1194        }
1195
1196        public void warning(SAXParseException JavaDoc exception) {
1197            // ignore warnings
1198
}
1199      
1200        public void error(SAXParseException JavaDoc exception) {
1201              
1202            if (vexception == null) {
1203                vexception = new ValidityException(
1204                  exception.getMessage(),
1205                  exception.getSystemId(),
1206                  exception.getLineNumber(),
1207                  exception.getColumnNumber(),
1208                  exception);
1209            }
1210            vexception.addError(exception);
1211        }
1212      
1213        public void fatalError(SAXParseException JavaDoc exception)
1214          throws SAXParseException JavaDoc {
1215            throw exception;
1216        }
1217        
1218        boolean isValid() {
1219            return vexception == null;
1220        }
1221        
1222    }
1223
1224    
1225    // Because Crimson doesn't report namespace errors as fatal
1226
private static class NamespaceWellformednessRequired
1227      implements ErrorHandler JavaDoc {
1228
1229        public void warning(SAXParseException JavaDoc exception) {
1230            // ignore warnings
1231
}
1232      
1233        public void error(SAXParseException JavaDoc exception)
1234          throws SAXParseException JavaDoc {
1235            throw exception;
1236        }
1237      
1238        public void fatalError(SAXParseException JavaDoc exception)
1239          throws SAXParseException JavaDoc {
1240            throw exception;
1241        }
1242        
1243    }
1244
1245    
1246    // I added this because XIncluder needed it.
1247
/**
1248     * <p>
1249     * Returns this builder's <code>NodeFactory</code>. It may return
1250     * null if a factory was not supplied when the builder was created. XXX
1251     * </p>
1252     *
1253     * @return the node factory that was specified in the constructor
1254     */

1255    public NodeFactory getNodeFactory() {
1256        return factory;
1257    }
1258
1259    
1260}
Popular Tags