KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > java > net > URI


1 /*
2  * @(#)URI.java 1.40 05/11/28
3  *
4  * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
5  * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
6  */

7
8 package java.net;
9
10 import java.io.IOException JavaDoc;
11 import java.io.InvalidObjectException JavaDoc;
12 import java.io.ObjectInputStream JavaDoc;
13 import java.io.ObjectOutputStream JavaDoc;
14 import java.io.Serializable JavaDoc;
15 import java.nio.ByteBuffer JavaDoc;
16 import java.nio.CharBuffer JavaDoc;
17 import java.nio.charset.CharsetDecoder JavaDoc;
18 import java.nio.charset.CharsetEncoder JavaDoc;
19 import java.nio.charset.CoderResult JavaDoc;
20 import java.nio.charset.CodingErrorAction JavaDoc;
21 import java.nio.charset.CharacterCodingException JavaDoc;
22 import sun.nio.cs.ThreadLocalCoders;
23 import sun.text.Normalizer;
24
25 import java.lang.Character JavaDoc; // for javadoc
26
import java.lang.NullPointerException JavaDoc; // for javadoc
27

28
29 /**
30  * Represents a Uniform Resource Identifier (URI) reference.
31  *
32  * <p> Aside from some minor deviations noted below, an instance of this
33  * class represents a URI reference as defined by
34  * <a HREF="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC&nbsp;2396: Uniform
35  * Resource Identifiers (URI): Generic Syntax</i></a>, amended by <a
36  * HREF="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC&nbsp;2732: Format for
37  * Literal IPv6 Addresses in URLs</i></a>. The Literal IPv6 address format
38  * also supports scope_ids. The syntax and usage of scope_ids is described
39  * <a HREF="Inet6Address.html#scoped">here</a>.
40  * This class provides constructors for creating URI instances from
41  * their components or by parsing their string forms, methods for accessing the
42  * various components of an instance, and methods for normalizing, resolving,
43  * and relativizing URI instances. Instances of this class are immutable.
44  *
45  *
46  * <h4> URI syntax and components </h4>
47  *
48  * At the highest level a URI reference (hereinafter simply "URI") in string
49  * form has the syntax
50  *
51  * <blockquote>
52  * [<i>scheme</i><tt><b>:</b></tt>]<i>scheme-specific-part</i>[<tt><b>#</b></tt><i>fragment</i>]
53  * </blockquote>
54  *
55  * where square brackets [...] delineate optional components and the characters
56  * <tt><b>:</b></tt> and <tt><b>#</b></tt> stand for themselves.
57  *
58  * <p> An <i>absolute</i> URI specifies a scheme; a URI that is not absolute is
59  * said to be <i>relative</i>. URIs are also classified according to whether
60  * they are <i>opaque</i> or <i>hierarchical</i>.
61  *
62  * <p> An <i>opaque</i> URI is an absolute URI whose scheme-specific part does
63  * not begin with a slash character (<tt>'/'</tt>). Opaque URIs are not
64  * subject to further parsing. Some examples of opaque URIs are:
65  *
66  * <blockquote><table cellpadding=0 cellspacing=0 summary="layout">
67  * <tr><td><tt>mailto:java-net@java.sun.com</tt></td></tr>
68  * <tr><td><tt>news:comp.lang.java</tt></td></tr>
69  * <tr><td><tt>urn:isbn:096139210x</tt></td></tr>
70  * </table></blockquote>
71  *
72  * <p> A <i>hierarchical</i> URI is either an absolute URI whose
73  * scheme-specific part begins with a slash character, or a relative URI, that
74  * is, a URI that does not specify a scheme. Some examples of hierarchical
75  * URIs are:
76  *
77  * <blockquote>
78  * <tt>http://java.sun.com/j2se/1.3/</tt><br>
79  * <tt>docs/guide/collections/designfaq.html#28</tt><br>
80  * <tt>../../../demo/jfc/SwingSet2/src/SwingSet2.java</tt><br>
81  * <tt>file:///~/calendar</tt>
82  * </blockquote>
83  *
84  * <p> A hierarchical URI is subject to further parsing according to the syntax
85  *
86  * <blockquote>
87  * [<i>scheme</i><tt><b>:</b></tt>][<tt><b>//</b></tt><i>authority</i>][<i>path</i>][<tt><b>?</b></tt><i>query</i>][<tt><b>#</b></tt><i>fragment</i>]
88  * </blockquote>
89  *
90  * where the characters <tt><b>:</b></tt>, <tt><b>/</b></tt>,
91  * <tt><b>?</b></tt>, and <tt><b>#</b></tt> stand for themselves. The
92  * scheme-specific part of a hierarchical URI consists of the characters
93  * between the scheme and fragment components.
94  *
95  * <p> The authority component of a hierarchical URI is, if specified, either
96  * <i>server-based</i> or <i>registry-based</i>. A server-based authority
97  * parses according to the familiar syntax
98  *
99  * <blockquote>
100  * [<i>user-info</i><tt><b>@</b></tt>]<i>host</i>[<tt><b>:</b></tt><i>port</i>]
101  * </blockquote>
102  *
103  * where the characters <tt><b>@</b></tt> and <tt><b>:</b></tt> stand for
104  * themselves. Nearly all URI schemes currently in use are server-based. An
105  * authority component that does not parse in this way is considered to be
106  * registry-based.
107  *
108  * <p> The path component of a hierarchical URI is itself said to be absolute
109  * if it begins with a slash character (<tt>'/'</tt>); otherwise it is
110  * relative. The path of a hierarchical URI that is either absolute or
111  * specifies an authority is always absolute.
112  *
113  * <p> All told, then, a URI instance has the following nine components:
114  *
115  * <blockquote><table summary="Describes the components of a URI:scheme,scheme-specific-part,authority,user-info,host,port,path,query,fragment">
116  * <tr><th><i>Component</i></th><th><i>Type</i></th></tr>
117  * <tr><td>scheme</td><td><tt>String</tt></td></tr>
118  * <tr><td>scheme-specific-part&nbsp;&nbsp;&nbsp;&nbsp;</td><td><tt>String</tt></td></tr>
119  * <tr><td>authority</td><td><tt>String</tt></td></tr>
120  * <tr><td>user-info</td><td><tt>String</tt></td></tr>
121  * <tr><td>host</td><td><tt>String</tt></td></tr>
122  * <tr><td>port</td><td><tt>int</tt></td></tr>
123  * <tr><td>path</td><td><tt>String</tt></td></tr>
124  * <tr><td>query</td><td><tt>String</tt></td></tr>
125  * <tr><td>fragment</td><td><tt>String</tt></td></tr>
126  * </table></blockquote>
127  *
128  * In a given instance any particular component is either <i>undefined</i> or
129  * <i>defined</i> with a distinct value. Undefined string components are
130  * represented by <tt>null</tt>, while undefined integer components are
131  * represented by <tt>-1</tt>. A string component may be defined to have the
132  * empty string as its value; this is not equivalent to that component being
133  * undefined.
134  *
135  * <p> Whether a particular component is or is not defined in an instance
136  * depends upon the type of the URI being represented. An absolute URI has a
137  * scheme component. An opaque URI has a scheme, a scheme-specific part, and
138  * possibly a fragment, but has no other components. A hierarchical URI always
139  * has a path (though it may be empty) and a scheme-specific-part (which at
140  * least contains the path), and may have any of the other components. If the
141  * authority component is present and is server-based then the host component
142  * will be defined and the user-information and port components may be defined.
143  *
144  *
145  * <h4> Operations on URI instances </h4>
146  *
147  * The key operations supported by this class are those of
148  * <i>normalization</i>, <i>resolution</i>, and <i>relativization</i>.
149  *
150  * <p> <i>Normalization</i> is the process of removing unnecessary <tt>"."</tt>
151  * and <tt>".."</tt> segments from the path component of a hierarchical URI.
152  * Each <tt>"."</tt> segment is simply removed. A <tt>".."</tt> segment is
153  * removed only if it is preceded by a non-<tt>".."</tt> segment.
154  * Normalization has no effect upon opaque URIs.
155  *
156  * <p> <i>Resolution</i> is the process of resolving one URI against another,
157  * <i>base</i> URI. The resulting URI is constructed from components of both
158  * URIs in the manner specified by RFC&nbsp;2396, taking components from the
159  * base URI for those not specified in the original. For hierarchical URIs,
160  * the path of the original is resolved against the path of the base and then
161  * normalized. The result, for example, of resolving
162  *
163  * <blockquote>
164  * <tt>docs/guide/collections/designfaq.html#28&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt>(1)
165  * </blockquote>
166  *
167  * against the base URI <tt>http://java.sun.com/j2se/1.3/</tt> is the result
168  * URI
169  *
170  * <blockquote>
171  * <tt>http://java.sun.com/j2se/1.3/docs/guide/collections/designfaq.html#28</tt>
172  * </blockquote>
173  *
174  * Resolving the relative URI
175  *
176  * <blockquote>
177  * <tt>../../../demo/jfc/SwingSet2/src/SwingSet2.java&nbsp;&nbsp;&nbsp;&nbsp;</tt>(2)
178  * </blockquote>
179  *
180  * against this result yields, in turn,
181  *
182  * <blockquote>
183  * <tt>http://java.sun.com/j2se/1.3/demo/jfc/SwingSet2/src/SwingSet2.java</tt>
184  * </blockquote>
185  *
186  * Resolution of both absolute and relative URIs, and of both absolute and
187  * relative paths in the case of hierarchical URIs, is supported. Resolving
188  * the URI <tt>file:///~/calendar</tt> against any other URI simply yields the
189  * original URI, since it is absolute. Resolving the relative URI (2) above
190  * against the relative base URI (1) yields the normalized, but still relative,
191  * URI
192  *
193  * <blockquote>
194  * <tt>demo/jfc/SwingSet2/src/SwingSet2.java</tt>
195  * </blockquote>
196  *
197  * <p> <i>Relativization</i>, finally, is the inverse of resolution: For any
198  * two normalized URIs <i>u</i> and&nbsp;<i>v</i>,
199  *
200  * <blockquote>
201  * <i>u</i><tt>.relativize(</tt><i>u</i><tt>.resolve(</tt><i>v</i><tt>)).equals(</tt><i>v</i><tt>)</tt>&nbsp;&nbsp;and<br>
202  * <i>u</i><tt>.resolve(</tt><i>u</i><tt>.relativize(</tt><i>v</i><tt>)).equals(</tt><i>v</i><tt>)</tt>&nbsp;&nbsp;.<br>
203  * </blockquote>
204  *
205  * This operation is often useful when constructing a document containing URIs
206  * that must be made relative to the base URI of the document wherever
207  * possible. For example, relativizing the URI
208  *
209  * <blockquote>
210  * <tt>http://java.sun.com/j2se/1.3/docs/guide/index.html</tt>
211  * </blockquote>
212  *
213  * against the base URI
214  *
215  * <blockquote>
216  * <tt>http://java.sun.com/j2se/1.3</tt>
217  * </blockquote>
218  *
219  * yields the relative URI <tt>docs/guide/index.html</tt>.
220  *
221  *
222  * <h4> Character categories </h4>
223  *
224  * RFC&nbsp;2396 specifies precisely which characters are permitted in the
225  * various components of a URI reference. The following categories, most of
226  * which are taken from that specification, are used below to describe these
227  * constraints:
228  *
229  * <blockquote><table cellspacing=2 summary="Describes categories alpha,digit,alphanum,unreserved,punct,reserved,escaped,and other">
230  * <tr><th valign=top><i>alpha</i></th>
231  * <td>The US-ASCII alphabetic characters,
232  * <tt>'A'</tt>&nbsp;through&nbsp;<tt>'Z'</tt>
233  * and <tt>'a'</tt>&nbsp;through&nbsp;<tt>'z'</tt></td></tr>
234  * <tr><th valign=top><i>digit</i></th>
235  * <td>The US-ASCII decimal digit characters,
236  * <tt>'0'</tt>&nbsp;through&nbsp;<tt>'9'</tt></td></tr>
237  * <tr><th valign=top><i>alphanum</i></th>
238  * <td>All <i>alpha</i> and <i>digit</i> characters</td></tr>
239  * <tr><th valign=top><i>unreserved</i>&nbsp;&nbsp;&nbsp;&nbsp;</th>
240  * <td>All <i>alphanum</i> characters together with those in the string
241  * <tt>"_-!.~'()*"</tt></td></tr>
242  * <tr><th valign=top><i>punct</i></th>
243  * <td>The characters in the string <tt>",;:$&+="</tt></td></tr>
244  * <tr><th valign=top><i>reserved</i></th>
245  * <td>All <i>punct</i> characters together with those in the string
246  * <tt>"?/[]@"</tt></td></tr>
247  * <tr><th valign=top><i>escaped</i></th>
248  * <td>Escaped octets, that is, triplets consisting of the percent
249  * character (<tt>'%'</tt>) followed by two hexadecimal digits
250  * (<tt>'0'</tt>-<tt>'9'</tt>, <tt>'A'</tt>-<tt>'F'</tt>, and
251  * <tt>'a'</tt>-<tt>'f'</tt>)</td></tr>
252  * <tr><th valign=top><i>other</i></th>
253  * <td>The Unicode characters that are not in the US-ASCII character set,
254  * are not control characters (according to the {@link
255  * java.lang.Character#isISOControl(char) Character.isISOControl}
256  * method), and are not space characters (according to the {@link
257  * java.lang.Character#isSpaceChar(char) Character.isSpaceChar}
258  * method)&nbsp;&nbsp;<i>(<b>Deviation from RFC 2396</b>, which is
259  * limited to US-ASCII)</i></td></tr>
260  * </table></blockquote>
261  *
262  * <p><a name="legal-chars"></a> The set of all legal URI characters consists of
263  * the <i>unreserved</i>, <i>reserved</i>, <i>escaped</i>, and <i>other</i>
264  * characters.
265  *
266  *
267  * <h4> Escaped octets, quotation, encoding, and decoding </h4>
268  *
269  * RFC 2396 allows escaped octets to appear in the user-info, path, query, and
270  * fragment components. Escaping serves two purposes in URIs:
271  *
272  * <ul>
273  *
274  * <li><p> To <i>encode</i> non-US-ASCII characters when a URI is required to
275  * conform strictly to RFC&nbsp;2396 by not containing any <i>other</i>
276  * characters. </p></li>
277  *
278  * <li><p> To <i>quote</i> characters that are otherwise illegal in a
279  * component. The user-info, path, query, and fragment components differ
280  * slightly in terms of which characters are considered legal and illegal.
281  * </p></li>
282  *
283  * </ul>
284  *
285  * These purposes are served in this class by three related operations:
286  *
287  * <ul>
288  *
289  * <li><p><a name="encode"></a> A character is <i>encoded</i> by replacing it
290  * with the sequence of escaped octets that represent that character in the
291  * UTF-8 character set. The Euro currency symbol (<tt>'&#92;u20AC'</tt>),
292  * for example, is encoded as <tt>"%E2%82%AC"</tt>. <i>(<b>Deviation from
293  * RFC&nbsp;2396</b>, which does not specify any particular character
294  * set.)</i> </p></li>
295  *
296  * <li><p><a name="quote"></a> An illegal character is <i>quoted</i> simply by
297  * encoding it. The space character, for example, is quoted by replacing it
298  * with <tt>"%20"</tt>. UTF-8 contains US-ASCII, hence for US-ASCII
299  * characters this transformation has exactly the effect required by
300  * RFC&nbsp;2396. </p></li>
301  *
302  * <li><p><a name="decode"></a>
303  * A sequence of escaped octets is <i>decoded</i> by
304  * replacing it with the sequence of characters that it represents in the
305  * UTF-8 character set. UTF-8 contains US-ASCII, hence decoding has the
306  * effect of de-quoting any quoted US-ASCII characters as well as that of
307  * decoding any encoded non-US-ASCII characters. If a <a
308  * HREF="../nio/charset/CharsetDecoder.html#ce">decoding error</a> occurs
309  * when decoding the escaped octets then the erroneous octets are replaced by
310  * <tt>'&#92;uFFFD'</tt>, the Unicode replacement character. </p></li>
311  *
312  * </ul>
313  *
314  * These operations are exposed in the constructors and methods of this class
315  * as follows:
316  *
317  * <ul>
318  *
319  * <li><p> The {@link #URI(java.lang.String) <code>single-argument
320  * constructor</code>} requires any illegal characters in its argument to be
321  * quoted and preserves any escaped octets and <i>other</i> characters that
322  * are present. </p></li>
323  *
324  * <li><p> The {@link
325  * #URI(java.lang.String,java.lang.String,java.lang.String,int,java.lang.String,java.lang.String,java.lang.String)
326  * <code>multi-argument constructors</code>} quote illegal characters as
327  * required by the components in which they appear. The percent character
328  * (<tt>'%'</tt>) is always quoted by these constructors. Any <i>other</i>
329  * characters are preserved. </p></li>
330  *
331  * <li><p> The {@link #getRawUserInfo() getRawUserInfo}, {@link #getRawPath()
332  * getRawPath}, {@link #getRawQuery() getRawQuery}, {@link #getRawFragment()
333  * getRawFragment}, {@link #getRawAuthority() getRawAuthority}, and {@link
334  * #getRawSchemeSpecificPart() getRawSchemeSpecificPart} methods return the
335  * values of their corresponding components in raw form, without interpreting
336  * any escaped octets. The strings returned by these methods may contain
337  * both escaped octets and <i>other</i> characters, and will not contain any
338  * illegal characters. </p></li>
339  *
340  * <li><p> The {@link #getUserInfo() getUserInfo}, {@link #getPath()
341  * getPath}, {@link #getQuery() getQuery}, {@link #getFragment()
342  * getFragment}, {@link #getAuthority() getAuthority}, and {@link
343  * #getSchemeSpecificPart() getSchemeSpecificPart} methods decode any escaped
344  * octets in their corresponding components. The strings returned by these
345  * methods may contain both <i>other</i> characters and illegal characters,
346  * and will not contain any escaped octets. </p></li>
347  *
348  * <li><p> The {@link #toString() toString} method returns a URI string with
349  * all necessary quotation but which may contain <i>other</i> characters.
350  * </p></li>
351  *
352  * <li><p> The {@link #toASCIIString() toASCIIString} method returns a fully
353  * quoted and encoded URI string that does not contain any <i>other</i>
354  * characters. </p></li>
355  *
356  * </ul>
357  *
358  *
359  * <h4> Identities </h4>
360  *
361  * For any URI <i>u</i>, it is always the case that
362  *
363  * <blockquote>
364  * <tt>new URI(</tt><i>u</i><tt>.toString()).equals(</tt><i>u</i><tt>)</tt>&nbsp;.
365  * </blockquote>
366  *
367  * For any URI <i>u</i> that does not contain redundant syntax such as two
368  * slashes before an empty authority (as in <tt>file:///tmp/</tt>&nbsp;) or a
369  * colon following a host name but no port (as in
370  * <tt>http://java.sun.com:</tt>&nbsp;), and that does not encode characters
371  * except those that must be quoted, the following identities also hold:
372  *
373  * <blockquote>
374  * <tt>new URI(</tt><i>u</i><tt>.getScheme(),<br>
375  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getSchemeSpecificPart(),<br>
376  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getFragment())<br>
377  * .equals(</tt><i>u</i><tt>)</tt>
378  * </blockquote>
379  *
380  * in all cases,
381  *
382  * <blockquote>
383  * <tt>new URI(</tt><i>u</i><tt>.getScheme(),<br>
384  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getAuthority(),<br>
385  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getPath(),&nbsp;</tt><i>u</i><tt>.getQuery(),<br>
386  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getFragment())<br>
387  * .equals(</tt><i>u</i><tt>)</tt>
388  * </blockquote>
389  *
390  * if <i>u</i> is hierarchical, and
391  *
392  * <blockquote>
393  * <tt>new URI(</tt><i>u</i><tt>.getScheme(),<br>
394  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getUserInfo(),&nbsp;</tt><i>u</i><tt>.getHost(),&nbsp;</tt><i>u</i><tt>.getPort(),<br>
395  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getPath(),&nbsp;</tt><i>u</i><tt>.getQuery(),<br>
396  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getFragment())<br>
397  * .equals(</tt><i>u</i><tt>)</tt>
398  * </blockquote>
399  *
400  * if <i>u</i> is hierarchical and has either no authority or a server-based
401  * authority.
402  *
403  *
404  * <h4> URIs, URLs, and URNs </h4>
405  *
406  * A URI is a uniform resource <i>identifier</i> while a URL is a uniform
407  * resource <i>locator</i>. Hence every URL is a URI, abstractly speaking, but
408  * not every URI is a URL. This is because there is another subcategory of
409  * URIs, uniform resource <i>names</i> (URNs), which name resources but do not
410  * specify how to locate them. The <tt>mailto</tt>, <tt>news</tt>, and
411  * <tt>isbn</tt> URIs shown above are examples of URNs.
412  *
413  * <p> The conceptual distinction between URIs and URLs is reflected in the
414  * differences between this class and the {@link URL} class.
415  *
416  * <p> An instance of this class represents a URI reference in the syntactic
417  * sense defined by RFC&nbsp;2396. A URI may be either absolute or relative.
418  * A URI string is parsed according to the generic syntax without regard to the
419  * scheme, if any, that it specifies. No lookup of the host, if any, is
420  * performed, and no scheme-dependent stream handler is constructed. Equality,
421  * hashing, and comparison are defined strictly in terms of the character
422  * content of the instance. In other words, a URI instance is little more than
423  * a structured string that supports the syntactic, scheme-independent
424  * operations of comparison, normalization, resolution, and relativization.
425  *
426  * <p> An instance of the {@link URL} class, by contrast, represents the
427  * syntactic components of a URL together with some of the information required
428  * to access the resource that it describes. A URL must be absolute, that is,
429  * it must always specify a scheme. A URL string is parsed according to its
430  * scheme. A stream handler is always established for a URL, and in fact it is
431  * impossible to create a URL instance for a scheme for which no handler is
432  * available. Equality and hashing depend upon both the scheme and the
433  * Internet address of the host, if any; comparison is not defined. In other
434  * words, a URL is a structured string that supports the syntactic operation of
435  * resolution as well as the network I/O operations of looking up the host and
436  * opening a connection to the specified resource.
437  *
438  *
439  * @version 1.40, 05/11/28
440  * @author Mark Reinhold
441  * @since 1.4
442  *
443  * @see <a HREF="http://ietf.org/rfc/rfc2279.txt"><i>RFC&nbsp;2279: UTF-8, a
444  * transformation format of ISO 10646</i></a>, <br><a
445  * HREF="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC&nbsp;2373: IPv6 Addressing
446  * Architecture</i></a>, <br><a
447  * HREF="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC&nbsp;2396: Uniform
448  * Resource Identifiers (URI): Generic Syntax</i></a>, <br><a
449  * HREF="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC&nbsp;2732: Format for
450  * Literal IPv6 Addresses in URLs</i></a>, <br><a
451  * HREF="URISyntaxException.html">URISyntaxException</a>
452  */

453
454 public final class URI
455     implements Comparable JavaDoc<URI JavaDoc>, Serializable JavaDoc
456 {
457
458     // Note: Comments containing the word "ASSERT" indicate places where a
459     // throw of an InternalError should be replaced by an appropriate assertion
460     // statement once asserts are enabled in the build.
461

462     static final long serialVersionUID = -6052424284110960213L;
463
464
465     // -- Properties and components of this instance --
466

467     // Components of all URIs: [<scheme>:]<scheme-specific-part>[#<fragment>]
468
private transient String JavaDoc scheme; // null ==> relative URI
469
private transient String JavaDoc fragment;
470
471     // Hierarchical URI components: [//<authority>]<path>[?<query>]
472
private transient String JavaDoc authority; // Registry or server
473

474     // Server-based authority: [<userInfo>@]<host>[:<port>]
475
private transient String JavaDoc userInfo;
476     private transient String JavaDoc host; // null ==> registry-based
477
private transient int port = -1; // -1 ==> undefined
478

479     // Remaining components of hierarchical URIs
480
private transient String JavaDoc path; // null ==> opaque
481
private transient String JavaDoc query;
482
483     // The remaining fields may be computed on demand
484

485     private volatile transient String JavaDoc schemeSpecificPart;
486     private volatile transient int hash; // Zero ==> undefined
487

488     private volatile transient String JavaDoc decodedUserInfo = null;
489     private volatile transient String JavaDoc decodedAuthority = null;
490     private volatile transient String JavaDoc decodedPath = null;
491     private volatile transient String JavaDoc decodedQuery = null;
492     private volatile transient String JavaDoc decodedFragment = null;
493     private volatile transient String JavaDoc decodedSchemeSpecificPart = null;
494
495     /**
496      * The string form of this URI.
497      *
498      * @serial
499      */

500     private volatile String JavaDoc string; // The only serializable field
501

502
503
504     // -- Constructors and factories --
505

506     private URI() { } // Used internally
507

508     /**
509      * Constructs a URI by parsing the given string.
510      *
511      * <p> This constructor parses the given string exactly as specified by the
512      * grammar in <a
513      * HREF="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
514      * Appendix&nbsp;A, <b><i>except for the following deviations:</i></b> </p>
515      *
516      * <ul type=disc>
517      *
518      * <li><p> An empty authority component is permitted as long as it is
519      * followed by a non-empty path, a query component, or a fragment
520      * component. This allows the parsing of URIs such as
521      * <tt>"file:///foo/bar"</tt>, which seems to be the intent of
522      * RFC&nbsp;2396 although the grammar does not permit it. If the
523      * authority component is empty then the user-information, host, and port
524      * components are undefined. </p></li>
525      *
526      * <li><p> Empty relative paths are permitted; this seems to be the
527      * intent of RFC&nbsp;2396 although the grammar does not permit it. The
528      * primary consequence of this deviation is that a standalone fragment
529      * such as <tt>"#foo"</tt> parses as a relative URI with an empty path
530      * and the given fragment, and can be usefully <a
531      * HREF="#resolve-frag">resolved</a> against a base URI. </p></li>
532      *
533      * <li><p> IPv4 addresses in host components are parsed rigorously, as
534      * specified by <a
535      * HREF="http://www.ietf.org/rfc/rfc2732.txt">RFC&nbsp;2732</a>: Each
536      * element of a dotted-quad address must contain no more than three
537      * decimal digits. Each element is further constrained to have a value
538      * no greater than 255. </p></li>
539      *
540      * <li> <p> Hostnames in host components that comprise only a single
541      * domain label are permitted to start with an <i>alphanum</i>
542      * character. This seems to be the intent of <a
543      * HREF="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>
544      * section&nbsp;3.2.2 although the grammar does not permit it. The
545      * consequence of this deviation is that the authority component of a
546      * hierarchical URI such as <tt>s://123</tt>, will parse as a server-based
547      * authority. </p></li>
548      *
549      * <li><p> IPv6 addresses are permitted for the host component. An IPv6
550      * address must be enclosed in square brackets (<tt>'['</tt> and
551      * <tt>']'</tt>) as specified by <a
552      * HREF="http://www.ietf.org/rfc/rfc2732.txt">RFC&nbsp;2732</a>. The
553      * IPv6 address itself must parse according to <a
554      * HREF="http://www.ietf.org/rfc/rfc2373.txt">RFC&nbsp;2373</a>. IPv6
555      * addresses are further constrained to describe no more than sixteen
556      * bytes of address information, a constraint implicit in RFC&nbsp;2373
557      * but not expressible in the grammar. </p></li>
558      *
559      * <li><p> Characters in the <i>other</i> category are permitted wherever
560      * RFC&nbsp;2396 permits <i>escaped</i> octets, that is, in the
561      * user-information, path, query, and fragment components, as well as in
562      * the authority component if the authority is registry-based. This
563      * allows URIs to contain Unicode characters beyond those in the US-ASCII
564      * character set. </p></li>
565      *
566      * </ul>
567      *
568      * @param str The string to be parsed into a URI
569      *
570      * @throws NullPointerException
571      * If <tt>str</tt> is <tt>null</tt>
572      *
573      * @throws URISyntaxException
574      * If the given string violates RFC&nbsp;2396, as augmented
575      * by the above deviations
576      */

public URI(String str) throws URISyntaxException {
    // Hand the raw string to a parser; the 'false' flag is the same one
    // passed by the other constructors that do not call parse(true).
    // NOTE(review): presumably 'false' means the authority is not required
    // to be server-based -- confirm against Parser.parse.
    Parser parser = new Parser(str);
    parser.parse(false);
}
580
581     /**
582      * Constructs a hierarchical URI from the given components.
583      *
584      * <p> If a scheme is given then the path, if also given, must either be
585      * empty or begin with a slash character (<tt>'/'</tt>). Otherwise a
586      * component of the new URI may be left undefined by passing <tt>null</tt>
587      * for the corresponding parameter or, in the case of the <tt>port</tt>
588      * parameter, by passing <tt>-1</tt>.
589      *
590      * <p> This constructor first builds a URI string from the given components
591      * according to the rules specified in <a
592      * HREF="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
593      * section&nbsp;5.2, step&nbsp;7: </p>
594      *
595      * <ol>
596      *
597      * <li><p> Initially, the result string is empty. </p></li>
598      *
599      * <li><p> If a scheme is given then it is appended to the result,
600      * followed by a colon character (<tt>':'</tt>). </p></li>
601      *
602      * <li><p> If user information, a host, or a port are given then the
603      * string <tt>"//"</tt> is appended. </p></li>
604      *
605      * <li><p> If user information is given then it is appended, followed by
606      * a commercial-at character (<tt>'@'</tt>). Any character not in the
607      * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
608      * categories is <a HREF="#quote">quoted</a>. </p></li>
609      *
610      * <li><p> If a host is given then it is appended. If the host is a
611      * literal IPv6 address but is not enclosed in square brackets
612      * (<tt>'['</tt> and <tt>']'</tt>) then the square brackets are added.
613      * </p></li>
614      *
615      * <li><p> If a port number is given then a colon character
616      * (<tt>':'</tt>) is appended, followed by the port number in decimal.
617      * </p></li>
618      *
619      * <li><p> If a path is given then it is appended. Any character not in
620      * the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
621      * categories, and not equal to the slash character (<tt>'/'</tt>) or the
622      * commercial-at character (<tt>'@'</tt>), is quoted. </p></li>
623      *
624      * <li><p> If a query is given then a question-mark character
625      * (<tt>'?'</tt>) is appended, followed by the query. Any character that
626      * is not a <a HREF="#legal-chars">legal URI character</a> is quoted.
627      * </p></li>
628      *
629      * <li><p> Finally, if a fragment is given then a hash character
630      * (<tt>'#'</tt>) is appended, followed by the fragment. Any character
631      * that is not a legal URI character is quoted. </p></li>
632      *
633      * </ol>
634      *
635      * <p> The resulting URI string is then parsed as if by invoking the {@link
636      * #URI(String)} constructor and then invoking the {@link
637      * #parseServerAuthority()} method upon the result; this may cause a {@link
638      * URISyntaxException} to be thrown. </p>
639      *
640      * @param scheme Scheme name
641      * @param userInfo User name and authorization information
642      * @param host Host name
643      * @param port Port number
644      * @param path Path
645      * @param query Query
646      * @param fragment Fragment
647      *
648      * @throws URISyntaxException
649      * If both a scheme and a path are given but the path is relative,
650      * if the URI string constructed from the given components violates
651      * RFC&nbsp;2396, or if the authority component of the string is
652      * present but cannot be parsed as a server-based authority
653      */

654     public URI(String JavaDoc scheme,
655                String JavaDoc userInfo, String JavaDoc host, int port,
656                String JavaDoc path, String JavaDoc query, String JavaDoc fragment)
657     throws URISyntaxException JavaDoc
658     {
659     String JavaDoc s = toString(scheme, null,
660                 null, userInfo, host, port,
661                 path, query, fragment);
662     checkPath(s, scheme, path);
663     new Parser(s).parse(true);
664     }
665
666     /**
667      * Constructs a hierarchical URI from the given components.
668      *
669      * <p> If a scheme is given then the path, if also given, must either be
670      * empty or begin with a slash character (<tt>'/'</tt>). Otherwise a
671      * component of the new URI may be left undefined by passing <tt>null</tt>
672      * for the corresponding parameter.
673      *
674      * <p> This constructor first builds a URI string from the given components
675      * according to the rules specified in <a
676      * HREF="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
677      * section&nbsp;5.2, step&nbsp;7: </p>
678      *
679      * <ol>
680      *
681      * <li><p> Initially, the result string is empty. </p></li>
682      *
683      * <li><p> If a scheme is given then it is appended to the result,
684      * followed by a colon character (<tt>':'</tt>). </p></li>
685      *
686      * <li><p> If an authority is given then the string <tt>"//"</tt> is
687      * appended, followed by the authority. If the authority contains a
688      * literal IPv6 address then the address must be enclosed in square
689      * brackets (<tt>'['</tt> and <tt>']'</tt>). Any character not in the
690      * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
691      * categories, and not equal to the commercial-at character
692      * (<tt>'@'</tt>), is <a HREF="#quote">quoted</a>. </p></li>
693      *
694      * <li><p> If a path is given then it is appended. Any character not in
695      * the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
696      * categories, and not equal to the slash character (<tt>'/'</tt>) or the
697      * commercial-at character (<tt>'@'</tt>), is quoted. </p></li>
698      *
699      * <li><p> If a query is given then a question-mark character
700      * (<tt>'?'</tt>) is appended, followed by the query. Any character that
701      * is not a <a HREF="#legal-chars">legal URI character</a> is quoted.
702      * </p></li>
703      *
704      * <li><p> Finally, if a fragment is given then a hash character
705      * (<tt>'#'</tt>) is appended, followed by the fragment. Any character
706      * that is not a legal URI character is quoted. </p></li>
707      *
708      * </ol>
709      *
710      * <p> The resulting URI string is then parsed as if by invoking the {@link
711      * #URI(String)} constructor and then invoking the {@link
712      * #parseServerAuthority()} method upon the result; this may cause a {@link
713      * URISyntaxException} to be thrown. </p>
714      *
715      * @param scheme Scheme name
716      * @param authority Authority
717      * @param path Path
718      * @param query Query
719      * @param fragment Fragment
720      *
721      * @throws URISyntaxException
722      * If both a scheme and a path are given but the path is relative,
723      * if the URI string constructed from the given components violates
724      * RFC&nbsp;2396, or if the authority component of the string is
725      * present but cannot be parsed as a server-based authority
726      */

727     public URI(String JavaDoc scheme,
728            String JavaDoc authority,
729            String JavaDoc path, String JavaDoc query, String JavaDoc fragment)
730     throws URISyntaxException JavaDoc
731     {
732     String JavaDoc s = toString(scheme, null,
733                 authority, null, null, -1,
734                 path, query, fragment);
735     checkPath(s, scheme, path);
736     new Parser(s).parse(false);
737     }
738
739     /**
740      * Constructs a hierarchical URI from the given components.
741      *
742      * <p> A component may be left undefined by passing <tt>null</tt>.
743      *
744      * <p> This convenience constructor works as if by invoking the
745      * seven-argument constructor as follows:
746      *
747      * <blockquote><tt>
748      * new&nbsp;{@link #URI(String, String, String, int, String, String, String)
749      * URI}(scheme,&nbsp;null,&nbsp;host,&nbsp;-1,&nbsp;path,&nbsp;null,&nbsp;fragment);
750      * </tt></blockquote>
751      *
752      * @param scheme Scheme name
753      * @param host Host name
754      * @param path Path
755      * @param fragment Fragment
756      *
757      * @throws URISyntaxException
758      * If the URI string constructed from the given components
759      * violates RFC&nbsp;2396
760      */

public URI(String scheme, String host, String path, String fragment)
        throws URISyntaxException {
    // Delegate to the seven-argument constructor, leaving the user
    // information and query undefined (null) and the port undefined (-1).
    this(scheme,
         null,      // userInfo
         host,
         -1,        // port
         path,
         null,      // query
         fragment);
}
766
767     /**
768      * Constructs a URI from the given components.
769      *
770      * <p> A component may be left undefined by passing <tt>null</tt>.
771      *
772      * <p> This constructor first builds a URI in string form using the given
773      * components as follows: </p>
774      *
775      * <ol>
776      *
777      * <li><p> Initially, the result string is empty. </p></li>
778      *
779      * <li><p> If a scheme is given then it is appended to the result,
780      * followed by a colon character (<tt>':'</tt>). </p></li>
781      *
782      * <li><p> If a scheme-specific part is given then it is appended. Any
783      * character that is not a <a HREF="#legal-chars">legal URI character</a>
784      * is <a HREF="#quote">quoted</a>. </p></li>
785      *
786      * <li><p> Finally, if a fragment is given then a hash character
787      * (<tt>'#'</tt>) is appended to the string, followed by the fragment.
788      * Any character that is not a legal URI character is quoted. </p></li>
789      *
790      * </ol>
791      *
792      * <p> The resulting URI string is then parsed in order to create the new
793      * URI instance as if by invoking the {@link #URI(String)} constructor;
794      * this may cause a {@link URISyntaxException} to be thrown. </p>
795      *
796      * @param scheme Scheme name
797      * @param ssp Scheme-specific part
798      * @param fragment Fragment
799      *
800      * @throws URISyntaxException
801      * If the URI string constructed from the given components
802      * violates RFC&nbsp;2396
803      */

804     public URI(String JavaDoc scheme, String JavaDoc ssp, String JavaDoc fragment)
805     throws URISyntaxException JavaDoc
806     {
807     new Parser(toString(scheme, ssp,
808                 null, null, null, -1,
809                 null, null, fragment))
810         .parse(false);
811     }
812
813     /**
814      * Creates a URI by parsing the given string.
815      *
816      * <p> This convenience factory method works as if by invoking the {@link
817      * #URI(String)} constructor; any {@link URISyntaxException} thrown by the
818      * constructor is caught and wrapped in a new {@link
819      * IllegalArgumentException} object, which is then thrown.
820      *
821      * <p> This method is provided for use in situations where it is known that
822      * the given string is a legal URI, for example for URI constants declared
823      * within a program, and so it would be considered a programming error
824      * for the string not to parse as such. The constructors, which throw
825      * {@link URISyntaxException} directly, should be used in situations where a
826      * URI is being constructed from user input or from some other source that
827      * may be prone to errors. </p>
828      *
829      * @param str The string to be parsed into a URI
830      * @return The new URI
831      *
832      * @throws NullPointerException
833      * If <tt>str</tt> is <tt>null</tt>
834      *
835      * @throws IllegalArgumentException
836      * If the given string violates RFC&nbsp;2396
837      */

838     public static URI JavaDoc create(String JavaDoc str) {
839     try {
840         return new URI JavaDoc(str);
841     } catch (URISyntaxException JavaDoc x) {
842         IllegalArgumentException JavaDoc y = new IllegalArgumentException JavaDoc();
843         y.initCause(x);
844         throw y;
845     }
846     }
847
848
849     // -- Operations --
850

851     /**
852      * Attempts to parse this URI's authority component, if defined, into
853      * user-information, host, and port components.
854      *
855      * <p> If this URI's authority component has already been recognized as
856      * being server-based then it will already have been parsed into
857      * user-information, host, and port components. In this case, or if this
858      * URI has no authority component, this method simply returns this URI.
859      *
860      * <p> Otherwise this method attempts once more to parse the authority
861      * component into user-information, host, and port components, and throws
862      * an exception describing why the authority component could not be parsed
863      * in that way.
864      *
865      * <p> This method is provided because the generic URI syntax specified in
866      * <a HREF="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>
867      * cannot always distinguish a malformed server-based authority from a
868      * legitimate registry-based authority. It must therefore treat some
869      * instances of the former as instances of the latter. The authority
870      * component in the URI string <tt>"//foo:bar"</tt>, for example, is not a
871      * legal server-based authority but it is legal as a registry-based
872      * authority.
873      *
874      * <p> In many common situations, for example when working with URIs that are
875      * known to be either URNs or URLs, the hierarchical URIs being used will
876      * always be server-based. They therefore must either be parsed as such or
877      * treated as an error. In these cases a statement such as
878      *
879      * <blockquote>
880      * <tt>URI </tt><i>u</i><tt> = new URI(str).parseServerAuthority();</tt>
881      * </blockquote>
882      *
883      * <p> can be used to ensure that <i>u</i> always refers to a URI that, if
884      * it has an authority component, has a server-based authority with proper
885      * user-information, host, and port components. Invoking this method also
886      * ensures that if the authority could not be parsed in that way then an
887      * appropriate diagnostic message can be issued based upon the exception
888      * that is thrown. </p>
889      *
890      * @return A URI whose authority field has been parsed
891      * as a server-based authority
892      *
893      * @throws URISyntaxException
894      * If the authority component of this URI is defined
895      * but cannot be parsed as a server-based authority
896      * according to RFC&nbsp;2396
897      */

898     public URI JavaDoc parseServerAuthority()
899     throws URISyntaxException JavaDoc
900     {
901     // We could be clever and cache the error message and index from the
902
// exception thrown during the original parse, but that would require
903
// either more fields or a more-obscure representation.
904
if ((host != null) || (authority == null))
905         return this;
906     defineString();
907     new Parser(string).parse(true);
908     return this;
909     }
910
911     /**
912      * Normalizes this URI's path.
913      *
914      * <p> If this URI is opaque, or if its path is already in normal form,
915      * then this URI is returned. Otherwise a new URI is constructed that is
916      * identical to this URI except that its path is computed by normalizing
917      * this URI's path in a manner consistent with <a
918      * HREF="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
919      * section&nbsp;5.2, step&nbsp;6, sub-steps&nbsp;c through&nbsp;f; that is:
920      * </p>
921      *
922      * <ol>
923      *
924      * <li><p> All <tt>"."</tt> segments are removed. </p></li>
925      *
926      * <li><p> If a <tt>".."</tt> segment is preceded by a non-<tt>".."</tt>
927      * segment then both of these segments are removed. This step is
928      * repeated until it is no longer applicable. </p></li>
929      *
930      * <li><p> If the path is relative, and if its first segment contains a
931      * colon character (<tt>':'</tt>), then a <tt>"."</tt> segment is
932      * prepended. This prevents a relative URI with a path such as
933      * <tt>"a:b/c/d"</tt> from later being re-parsed as an opaque URI with a
934      * scheme of <tt>"a"</tt> and a scheme-specific part of <tt>"b/c/d"</tt>.
935      * <b><i>(Deviation from RFC&nbsp;2396)</i></b> </p></li>
936      *
937      * </ol>
938      *
939      * <p> A normalized path will begin with one or more <tt>".."</tt> segments
940      * if there were insufficient non-<tt>".."</tt> segments preceding them to
941      * allow their removal. A normalized path will begin with a <tt>"."</tt>
942      * segment if one was inserted by step 3 above. Otherwise, a normalized
943      * path will not contain any <tt>"."</tt> or <tt>".."</tt> segments. </p>
944      *
945      * @return A URI equivalent to this URI,
946      * but whose path is in normal form
947      */

948     public URI JavaDoc normalize() {
949     return normalize(this);
950     }
951
952     /**
953      * Resolves the given URI against this URI.
954      *
955      * <p> If the given URI is already absolute, or if this URI is opaque, then
956      * the given URI is returned.
957      *
958      * <p><a name="resolve-frag"></a> If the given URI's fragment component is
959      * defined, its path component is empty, and its scheme, authority, and
960      * query components are undefined, then a URI with the given fragment but
961      * with all other components equal to those of this URI is returned. This
962      * allows a URI representing a standalone fragment reference, such as
963      * <tt>"#foo"</tt>, to be usefully resolved against a base URI.
964      *
965      * <p> Otherwise this method constructs a new hierarchical URI in a manner
966      * consistent with <a
967      * HREF="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
968      * section&nbsp;5.2; that is: </p>
969      *
970      * <ol>
971      *
972      * <li><p> A new URI is constructed with this URI's scheme and the given
973      * URI's query and fragment components. </p></li>
974      *
975      * <li><p> If the given URI has an authority component then the new URI's
976      * authority and path are taken from the given URI. </p></li>
977      *
978      * <li><p> Otherwise the new URI's authority component is copied from
979      * this URI, and its path is computed as follows: </p></li>
980      *
981      * <ol type=a>
982      *
983      * <li><p> If the given URI's path is absolute then the new URI's path
984      * is taken from the given URI. </p></li>
985      *
986      * <li><p> Otherwise the given URI's path is relative, and so the new
987      * URI's path is computed by resolving the path of the given URI
988      * against the path of this URI. This is done by concatenating all but
989      * the last segment of this URI's path, if any, with the given URI's
990      * path and then normalizing the result as if by invoking the {@link
991      * #normalize() normalize} method. </p></li>
992      *
993      * </ol>
994      *
995      * </ol>
996      *
997      * <p> The result of this method is absolute if, and only if, either this
998      * URI is absolute or the given URI is absolute. </p>
999      *
1000     * @param uri The URI to be resolved against this URI
1001     * @return The resulting URI
1002     *
1003     * @throws NullPointerException
1004     * If <tt>uri</tt> is <tt>null</tt>
1005     */

1006    public URI JavaDoc resolve(URI JavaDoc uri) {
1007    return resolve(this, uri);
1008    }
1009
1010    /**
1011     * Constructs a new URI by parsing the given string and then resolving it
1012     * against this URI.
1013     *
1014     * <p> This convenience method works as if invoking it were equivalent to
1015     * evaluating the expression <tt>{@link #resolve(java.net.URI)
1016     * resolve}(URI.{@link #create(String) create}(str))</tt>. </p>
1017     *
1018     * @param str The string to be parsed into a URI
1019     * @return The resulting URI
1020     *
1021     * @throws NullPointerException
1022     * If <tt>str</tt> is <tt>null</tt>
1023     *
1024     * @throws IllegalArgumentException
1025     * If the given string violates RFC&nbsp;2396
1026     */

1027    public URI JavaDoc resolve(String JavaDoc str) {
1028    return resolve(URI.create(str));
1029    }
1030
1031    /**
1032     * Relativizes the given URI against this URI.
1033     *
1034     * <p> The relativization of the given URI against this URI is computed as
1035     * follows: </p>
1036     *
1037     * <ol>
1038     *
1039     * <li><p> If either this URI or the given URI are opaque, or if the
1040     * scheme and authority components of the two URIs are not identical, or
1041     * if the path of this URI is not a prefix of the path of the given URI,
1042     * then the given URI is returned. </p></li>
1043     *
1044     * <li><p> Otherwise a new relative hierarchical URI is constructed with
1045     * query and fragment components taken from the given URI and with a path
1046     * component computed by removing this URI's path from the beginning of
1047     * the given URI's path. </p></li>
1048     *
1049     * </ol>
1050     *
1051     * @param uri The URI to be relativized against this URI
1052     * @return The resulting URI
1053     *
1054     * @throws NullPointerException
1055     * If <tt>uri</tt> is <tt>null</tt>
1056     */

1057    public URI JavaDoc relativize(URI JavaDoc uri) {
1058    return relativize(this, uri);
1059    }
1060
1061    /**
1062     * Constructs a URL from this URI.
1063     *
1064     * <p> This convenience method works as if invoking it were equivalent to
1065     * evaluating the expression <tt>new&nbsp;URL(this.toString())</tt> after
1066     * first checking that this URI is absolute. </p>
1067     *
1068     * @return A URL constructed from this URI
1069     *
1070     * @throws IllegalArgumentException
1071     * If this URI is not absolute
1072     *
1073     * @throws MalformedURLException
1074     * If a protocol handler for the URL could not be found,
1075     * or if some other error occurred while constructing the URL
1076     */

1077    public URL JavaDoc toURL()
1078    throws MalformedURLException JavaDoc {
1079    if (!isAbsolute())
1080        throw new IllegalArgumentException JavaDoc("URI is not absolute");
1081    return new URL JavaDoc(toString());
1082    }
1083
1084    // -- Component access methods --
1085

1086    /**
1087     * Returns the scheme component of this URI.
1088     *
1089     * <p> The scheme component of a URI, if defined, only contains characters
1090     * in the <i>alphanum</i> category and in the string <tt>"-.+"</tt>. A
1091     * scheme always starts with an <i>alpha</i> character. <p>
1092     *
1093     * The scheme component of a URI cannot contain escaped octets, hence this
1094     * method does not perform any decoding.
1095     *
1096     * @return The scheme component of this URI,
1097     * or <tt>null</tt> if the scheme is undefined
1098     */

1099    public String JavaDoc getScheme() {
1100    return scheme;
1101    }
1102
1103    /**
1104     * Tells whether or not this URI is absolute.
1105     *
1106     * <p> A URI is absolute if, and only if, it has a scheme component. </p>
1107     *
1108     * @return <tt>true</tt> if, and only if, this URI is absolute
1109     */

1110    public boolean isAbsolute() {
1111    return scheme != null;
1112    }
1113
1114    /**
1115     * Tells whether or not this URI is opaque.
1116     *
1117     * <p> A URI is opaque if, and only if, it is absolute and its
1118     * scheme-specific part does not begin with a slash character ('/').
1119     * An opaque URI has a scheme, a scheme-specific part, and possibly
1120     * a fragment; all other components are undefined. </p>
1121     *
1122     * @return <tt>true</tt> if, and only if, this URI is opaque
1123     */

1124    public boolean isOpaque() {
1125        return path == null;
1126    }
1127
1128    /**
1129     * Returns the raw scheme-specific part of this URI. The scheme-specific
1130     * part is never undefined, though it may be empty.
1131     *
1132     * <p> The scheme-specific part of a URI only contains legal URI
1133     * characters. </p>
1134     *
1135     * @return The raw scheme-specific part of this URI
1136     * (never <tt>null</tt>)
1137     */

1138    public String JavaDoc getRawSchemeSpecificPart() {
1139    defineSchemeSpecificPart();
1140    return schemeSpecificPart;
1141    }
1142
1143    /**
1144     * Returns the decoded scheme-specific part of this URI.
1145     *
1146     * <p> The string returned by this method is equal to that returned by the
1147     * {@link #getRawSchemeSpecificPart() getRawSchemeSpecificPart} method
1148     * except that all sequences of escaped octets are <a
1149     * HREF="#decode">decoded</a>. </p>
1150     *
1151     * @return The decoded scheme-specific part of this URI
1152     * (never <tt>null</tt>)
1153     */

1154    public String JavaDoc getSchemeSpecificPart() {
1155    if (decodedSchemeSpecificPart == null)
1156        decodedSchemeSpecificPart = decode(getRawSchemeSpecificPart());
1157    return decodedSchemeSpecificPart;
1158    }
1159
1160    /**
1161     * Returns the raw authority component of this URI.
1162     *
1163     * <p> The authority component of a URI, if defined, only contains the
1164     * commercial-at character (<tt>'@'</tt>) and characters in the
1165     * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and <i>other</i>
1166     * categories. If the authority is server-based then it is further
1167     * constrained to have valid user-information, host, and port
1168     * components. </p>
1169     *
1170     * @return The raw authority component of this URI,
1171     * or <tt>null</tt> if the authority is undefined
1172     */

1173    public String JavaDoc getRawAuthority() {
1174    return authority;
1175    }
1176
1177    /**
1178     * Returns the decoded authority component of this URI.
1179     *
1180     * <p> The string returned by this method is equal to that returned by the
1181     * {@link #getRawAuthority() getRawAuthority} method except that all
1182     * sequences of escaped octets are <a HREF="#decode">decoded</a>. </p>
1183     *
1184     * @return The decoded authority component of this URI,
1185     * or <tt>null</tt> if the authority is undefined
1186     */

1187    public String JavaDoc getAuthority() {
1188    if (decodedAuthority == null)
1189        decodedAuthority = decode(authority);
1190    return decodedAuthority;
1191    }
1192
1193    /**
1194     * Returns the raw user-information component of this URI.
1195     *
1196     * <p> The user-information component of a URI, if defined, only contains
1197     * characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and
1198     * <i>other</i> categories. </p>
1199     *
1200     * @return The raw user-information component of this URI,
1201     * or <tt>null</tt> if the user information is undefined
1202     */

1203    public String JavaDoc getRawUserInfo() {
1204    return userInfo;
1205    }
1206
1207    /**
1208     * Returns the decoded user-information component of this URI.
1209     *
1210     * <p> The string returned by this method is equal to that returned by the
1211     * {@link #getRawUserInfo() getRawUserInfo} method except that all
1212     * sequences of escaped octets are <a HREF="#decode">decoded</a>. </p>
1213     *
1214     * @return The decoded user-information component of this URI,
1215     * or <tt>null</tt> if the user information is undefined
1216     */

1217    public String JavaDoc getUserInfo() {
1218    if ((decodedUserInfo == null) && (userInfo != null))
1219        decodedUserInfo = decode(userInfo);
1220    return decodedUserInfo;
1221    }
1222
1223    /**
1224     * Returns the host component of this URI.
1225     *
1226     * <p> The host component of a URI, if defined, will have one of the
1227     * following forms: </p>
1228     *
1229     * <ul type=disc>
1230     *
1231     * <li><p> A domain name consisting of one or more <i>labels</i>
1232     * separated by period characters (<tt>'.'</tt>), optionally followed by
1233     * a period character. Each label consists of <i>alphanum</i> characters
1234     * as well as hyphen characters (<tt>'-'</tt>), though hyphens never
1235     * occur as the first or last characters in a label. The rightmost
1236     * label of a domain name consisting of two or more labels, begins
1237     * with an <i>alpha</i> character. </li>
1238     *
1239     * <li><p> A dotted-quad IPv4 address of the form
1240     * <i>digit</i><tt>+.</tt><i>digit</i><tt>+.</tt><i>digit</i><tt>+.</tt><i>digit</i><tt>+</tt>,
1241     * where no <i>digit</i> sequence is longer than three characters and no
1242     * sequence has a value larger than 255. </p></li>
1243     *
1244     * <li><p> An IPv6 address enclosed in square brackets (<tt>'['</tt> and
1245     * <tt>']'</tt>) and consisting of hexadecimal digits, colon characters
1246     * (<tt>':'</tt>), and possibly an embedded IPv4 address. The full
1247     * syntax of IPv6 addresses is specified in <a
1248     * HREF="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC&nbsp;2373: IPv6
1249     * Addressing Architecture</i></a>. </p></li>
1250     *
1251     * </ul>
1252     *
1253     * The host component of a URI cannot contain escaped octets, hence this
1254     * method does not perform any decoding.
1255     *
1256     * @return The host component of this URI,
1257     * or <tt>null</tt> if the host is undefined
1258     */

1259    public String JavaDoc getHost() {
1260    return host;
1261    }
1262
1263    /**
1264     * Returns the port number of this URI.
1265     *
1266     * <p> The port component of a URI, if defined, is a non-negative
1267     * integer. </p>
1268     *
1269     * @return The port component of this URI,
1270     * or <tt>-1</tt> if the port is undefined
1271     */

1272    public int getPort() {
1273    return port;
1274    }
1275
1276    /**
1277     * Returns the raw path component of this URI.
1278     *
1279     * <p> The path component of a URI, if defined, only contains the slash
1280     * character (<tt>'/'</tt>), the commercial-at character (<tt>'@'</tt>),
1281     * and characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>,
1282     * and <i>other</i> categories. </p>
1283     *
1284     * @return The path component of this URI,
1285     * or <tt>null</tt> if the path is undefined
1286     */

1287    public String JavaDoc getRawPath() {
1288    return path;
1289    }
1290
1291    /**
1292     * Returns the decoded path component of this URI.
1293     *
1294     * <p> The string returned by this method is equal to that returned by the
1295     * {@link #getRawPath() getRawPath} method except that all sequences of
1296     * escaped octets are <a HREF="#decode">decoded</a>. </p>
1297     *
1298     * @return The decoded path component of this URI,
1299     * or <tt>null</tt> if the path is undefined
1300     */

1301    public String JavaDoc getPath() {
1302    if ((decodedPath == null) && (path != null))
1303        decodedPath = decode(path);
1304    return decodedPath;
1305    }
1306
1307    /**
1308     * Returns the raw query component of this URI.
1309     *
1310     * <p> The query component of a URI, if defined, only contains legal URI
1311     * characters. </p>
1312     *
1313     * @return The raw query component of this URI,
1314     * or <tt>null</tt> if the query is undefined
1315     */

1316    public String JavaDoc getRawQuery() {
1317    return query;
1318    }
1319
1320    /**
1321     * Returns the decoded query component of this URI.
1322     *
1323     * <p> The string returned by this method is equal to that returned by the
1324     * {@link #getRawQuery() getRawQuery} method except that all sequences of
1325     * escaped octets are <a HREF="#decode">decoded</a>. </p>
1326     *
1327     * @return The decoded query component of this URI,
1328     * or <tt>null</tt> if the query is undefined
1329     */

1330    public String JavaDoc getQuery() {
1331    if ((decodedQuery == null) && (query != null))
1332        decodedQuery = decode(query);
1333    return decodedQuery;
1334    }
1335
1336    /**
1337     * Returns the raw fragment component of this URI.
1338     *
1339     * <p> The fragment component of a URI, if defined, only contains legal URI
1340     * characters. </p>
1341     *
1342     * @return The raw fragment component of this URI,
1343     * or <tt>null</tt> if the fragment is undefined
1344     */

1345    public String JavaDoc getRawFragment() {
1346    return fragment;
1347    }
1348
1349    /**
1350     * Returns the decoded fragment component of this URI.
1351     *
1352     * <p> The string returned by this method is equal to that returned by the
1353     * {@link #getRawFragment() getRawFragment} method except that all
1354     * sequences of escaped octets are <a HREF="#decode">decoded</a>. </p>
1355     *
1356     * @return The decoded fragment component of this URI,
1357     * or <tt>null</tt> if the fragment is undefined
1358     */

1359    public String JavaDoc getFragment() {
1360    if ((decodedFragment == null) && (fragment != null))
1361        decodedFragment = decode(fragment);
1362    return decodedFragment;
1363    }
1364
1365
1366    // -- Equality, comparison, hash code, toString, and serialization --
1367

1368    /**
1369     * Tests this URI for equality with another object.
1370     *
1371     * <p> If the given object is not a URI then this method immediately
1372     * returns <tt>false</tt>.
1373     *
1374     * <p> For two URIs to be considered equal requires that either both are
1375     * opaque or both are hierarchical. Their schemes must either both be
1376     * undefined or else be equal without regard to case. Their fragments
1377     * must either both be undefined or else be equal.
1378     *
1379     * <p> For two opaque URIs to be considered equal, their scheme-specific
1380     * parts must be equal.
1381     *
1382     * <p> For two hierarchical URIs to be considered equal, their paths must
1383     * be equal and their queries must either both be undefined or else be
1384     * equal. Their authorities must either both be undefined, or both be
1385     * registry-based, or both be server-based. If their authorities are
1386     * defined and are registry-based, then they must be equal. If their
1387     * authorities are defined and are server-based, then their hosts must be
1388     * equal without regard to case, their port numbers must be equal, and
1389     * their user-information components must be equal.
1390     *
1391     * <p> When testing the user-information, path, query, fragment, authority,
1392     * or scheme-specific parts of two URIs for equality, the raw forms rather
1393     * than the decoded forms of these components are compared and the
1394     * hexadecimal digits of escaped octets are compared without regard to
1395     * case.
1396     *
1397     * <p> This method satisfies the general contract of the {@link
1398     * java.lang.Object#equals(Object) Object.equals} method. </p>
1399     *
1400     * @param ob The object to which this object is to be compared
1401     *
1402     * @return <tt>true</tt> if, and only if, the given object is a URI that
1403     * is identical to this URI
1404     */

1405    public boolean equals(Object JavaDoc ob) {
1406    if (ob == this)
1407        return true;
1408    if (!(ob instanceof URI JavaDoc))
1409        return false;
1410    URI JavaDoc that = (URI JavaDoc)ob;
1411    if (this.isOpaque() != that.isOpaque()) return false;
1412    if (!equalIgnoringCase(this.scheme, that.scheme)) return false;
1413    if (!equal(this.fragment, that.fragment)) return false;
1414
1415    // Opaque
1416
if (this.isOpaque())
1417        return equal(this.schemeSpecificPart, that.schemeSpecificPart);
1418
1419    // Hierarchical
1420
if (!equal(this.path, that.path)) return false;
1421    if (!equal(this.query, that.query)) return false;
1422
1423    // Authorities
1424
if (this.authority == that.authority) return true;
1425    if (this.host != null) {
1426        // Server-based
1427
if (!equal(this.userInfo, that.userInfo)) return false;
1428        if (!equalIgnoringCase(this.host, that.host)) return false;
1429        if (this.port != that.port) return false;
1430    } else if (this.authority != null) {
1431        // Registry-based
1432
if (!equal(this.authority, that.authority)) return false;
1433    } else if (this.authority != that.authority) {
1434        return false;
1435    }
1436
1437    return true;
1438    }
1439
1440    /**
1441     * Returns a hash-code value for this URI. The hash code is based upon all
1442     * of the URI's components, and satisfies the general contract of the
1443     * {@link java.lang.Object#hashCode() Object.hashCode} method.
1444     *
1445     * @return A hash-code value for this URI
1446     */

1447    public int hashCode() {
1448    if (hash != 0)
1449        return hash;
1450    int h = hashIgnoringCase(0, scheme);
1451    h = hash(h, fragment);
1452    if (isOpaque()) {
1453        h = hash(h, schemeSpecificPart);
1454    } else {
1455        h = hash(h, path);
1456        h = hash(h, query);
1457        if (host != null) {
1458        h = hash(h, userInfo);
1459        h = hashIgnoringCase(h, host);
1460        h += 1949 * port;
1461        } else {
1462        h = hash(h, authority);
1463        }
1464    }
1465    hash = h;
1466    return h;
1467    }
1468
1469    /**
1470     * Compares this URI to another object, which must be a URI.
1471     *
1472     * <p> When comparing corresponding components of two URIs, if one
1473     * component is undefined but the other is defined then the first is
1474     * considered to be less than the second. Unless otherwise noted, string
1475     * components are ordered according to their natural, case-sensitive
1476     * ordering as defined by the {@link java.lang.String#compareTo(Object)
1477     * String.compareTo} method. String components that are subject to
1478     * encoding are compared by comparing their raw forms rather than their
1479     * decoded forms.
1480     *
1481     * <p> The ordering of URIs is defined as follows: </p>
1482     *
1483     * <ul type=disc>
1484     *
1485     * <li><p> Two URIs with different schemes are ordered according to the
1486     * ordering of their schemes, without regard to case. </p></li>
1487     *
1488     * <li><p> A hierarchical URI is considered to be less than an opaque URI
1489     * with an identical scheme. </p></li>
1490     *
1491     * <li><p> Two opaque URIs with identical schemes are ordered according
1492     * to the ordering of their scheme-specific parts. </p></li>
1493     *
1494     * <li><p> Two opaque URIs with identical schemes and scheme-specific
1495     * parts are ordered according to the ordering of their
1496     * fragments. </p></li>
1497     *
1498     * <li><p> Two hierarchical URIs with identical schemes are ordered
1499     * according to the ordering of their authority components: </p></li>
1500     *
1501     * <ul type=disc>
1502     *
1503     * <li><p> If both authority components are server-based then the URIs
1504     * are ordered according to their user-information components; if these
1505     * components are identical then the URIs are ordered according to the
1506     * ordering of their hosts, without regard to case; if the hosts are
1507     * identical then the URIs are ordered according to the ordering of
1508     * their ports. </p></li>
1509     *
1510     * <li><p> If one or both authority components are registry-based then
1511     * the URIs are ordered according to the ordering of their authority
1512     * components. </p></li>
1513     *
1514     * </ul>
1515     *
1516     * <li><p> Finally, two hierarchical URIs with identical schemes and
1517     * authority components are ordered according to the ordering of their
1518     * paths; if their paths are identical then they are ordered according to
1519     * the ordering of their queries; if the queries are identical then they
1520     * are ordered according to the order of their fragments. </p></li>
1521     *
1522     * </ul>
1523     *
1524     * <p> This method satisfies the general contract of the {@link
1525     * java.lang.Comparable#compareTo(Object) Comparable.compareTo}
1526     * method. </p>
1527     *
1528     * @param ob
1529     * The object to which this URI is to be compared
1530     *
1531     * @return A negative integer, zero, or a positive integer as this URI is
1532     * less than, equal to, or greater than the given URI
1533     *
1534     * @throws ClassCastException
1535     * If the given object is not a URI
1536     */

1537    public int compareTo(URI JavaDoc that) {
1538    int c;
1539
1540    if ((c = compareIgnoringCase(this.scheme, that.scheme)) != 0)
1541        return c;
1542
1543    if (this.isOpaque()) {
1544        if (that.isOpaque()) {
1545        // Both opaque
1546
if ((c = compare(this.schemeSpecificPart,
1547                 that.schemeSpecificPart)) != 0)
1548            return c;
1549        return compare(this.fragment, that.fragment);
1550        }
1551        return +1; // Opaque > hierarchical
1552
} else if (that.isOpaque()) {
1553        return -1; // Hierarchical < opaque
1554
}
1555
1556    // Hierarchical
1557
if ((this.host != null) && (that.host != null)) {
1558        // Both server-based
1559
if ((c = compare(this.userInfo, that.userInfo)) != 0)
1560        return c;
1561        if ((c = compareIgnoringCase(this.host, that.host)) != 0)
1562        return c;
1563        if ((c = this.port - that.port) != 0)
1564        return c;
1565    } else {
1566        // If one or both authorities are registry-based then we simply
1567
// compare them in the usual, case-sensitive way. If one is
1568
// registry-based and one is server-based then the strings are
1569
// guaranteed to be unequal, hence the comparison will never return
1570
// zero and the compareTo and equals methods will remain
1571
// consistent.
1572
if ((c = compare(this.authority, that.authority)) != 0) return c;
1573    }
1574
1575    if ((c = compare(this.path, that.path)) != 0) return c;
1576    if ((c = compare(this.query, that.query)) != 0) return c;
1577    return compare(this.fragment, that.fragment);
1578    }
1579
1580    /**
1581     * Returns the content of this URI as a string.
1582     *
1583     * <p> If this URI was created by invoking one of the constructors in this
1584     * class then a string equivalent to the original input string, or to the
1585     * string computed from the originally-given components, as appropriate, is
1586     * returned. Otherwise this URI was created by normalization, resolution,
1587     * or relativization, and so a string is constructed from this URI's
1588     * components according to the rules specified in <a
1589     * HREF="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
1590     * section&nbsp;5.2, step&nbsp;7. </p>
1591     *
1592     * @return The string form of this URI
1593     */

1594    public String JavaDoc toString() {
1595    defineString();
1596    return string;
1597    }
1598
1599    /**
1600     * Returns the content of this URI as a US-ASCII string.
1601     *
1602     * <p> If this URI does not contain any characters in the <i>other</i>
1603     * category then an invocation of this method will return the same value as
1604     * an invocation of the {@link #toString() toString} method. Otherwise
1605     * this method works as if by invoking that method and then <a
1606     * HREF="#encode">encoding</a> the result. </p>
1607     *
1608     * @return The string form of this URI, encoded as needed
1609     * so that it only contains characters in the US-ASCII
1610     * charset
1611     */

1612    public String JavaDoc toASCIIString() {
1613    defineString();
1614    return encode(string);
1615    }
1616
1617
1618    // -- Serialization support --
1619

1620    /**
1621     * Saves the content of this URI to the given serial stream.
1622     *
1623     * <p> The only serializable field of a URI instance is its <tt>string</tt>
1624     * field. That field is given a value, if it does not have one already,
1625     * and then the {@link java.io.ObjectOutputStream#defaultWriteObject()}
1626     * method of the given object-output stream is invoked. </p>
1627     *
1628     * @param os The object-output stream to which this object
1629     * is to be written
1630     */

1631    private void writeObject(ObjectOutputStream JavaDoc os)
1632    throws IOException JavaDoc
1633    {
1634    defineString();
1635    os.defaultWriteObject(); // Writes the string field only
1636
}
1637
1638    /**
1639     * Reconstitutes a URI from the given serial stream.
1640     *
1641     * <p> The {@link java.io.ObjectInputStream#defaultReadObject()} method is
1642     * invoked to read the value of the <tt>string</tt> field. The result is
1643     * then parsed in the usual way.
1644     *
1645     * @param is The object-input stream from which this object
1646     * is being read
1647     */

1648    private void readObject(ObjectInputStream JavaDoc is)
1649    throws ClassNotFoundException JavaDoc, IOException JavaDoc
1650    {
1651    port = -1; // Argh
1652
is.defaultReadObject();
1653    try {
1654        new Parser(string).parse(false);
1655    } catch (URISyntaxException JavaDoc x) {
1656        IOException JavaDoc y = new InvalidObjectException JavaDoc("Invalid URI");
1657        y.initCause(x);
1658        throw y;
1659    }
1660    }
1661
1662
1663    // -- End of public methods --
1664

1665
1666    // -- Utility methods for string-field comparison and hashing --
1667

1668    // These methods return appropriate values for null string arguments,
1669
// thereby simplifying the equals, hashCode, and compareTo methods.
1670
//
1671
// The case-ignoring methods should only be applied to strings whose
1672
// characters are all known to be US-ASCII. Because of this restriction,
1673
// these methods are faster than the similar methods in the String class.
1674

1675    // US-ASCII only
1676
private static int toLower(char c) {
1677    if ((c >= 'A') && (c <= 'Z'))
1678        return c + ('a' - 'A');
1679    return c;
1680    }
1681
1682    private static boolean equal(String JavaDoc s, String JavaDoc t) {
1683    if (s == t) return true;
1684    if ((s != null) && (t != null)) {
1685        if (s.length() != t.length())
1686        return false;
1687        if (s.indexOf('%') < 0)
1688        return s.equals(t);
1689        int n = s.length();
1690        for (int i = 0; i < n;) {
1691        char c = s.charAt(i);
1692        char d = t.charAt(i);
1693        if (c != '%') {
1694            if (c != d)
1695            return false;
1696            i++;
1697            continue;
1698        }
1699        i++;
1700        if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1701            return false;
1702        i++;
1703        if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1704            return false;
1705        i++;
1706        }
1707        return true;
1708    }
1709    return false;
1710    }
1711
1712    // US-ASCII only
1713
private static boolean equalIgnoringCase(String JavaDoc s, String JavaDoc t) {
1714    if (s == t) return true;
1715    if ((s != null) && (t != null)) {
1716        int n = s.length();
1717        if (t.length() != n)
1718        return false;
1719        for (int i = 0; i < n; i++) {
1720        if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1721            return false;
1722        }
1723        return true;
1724    }
1725    return false;
1726    }
1727
1728    private static int hash(int hash, String JavaDoc s) {
1729    if (s == null) return hash;
1730    return hash * 127 + s.hashCode();
1731    }
1732
1733    // US-ASCII only
1734
private static int hashIgnoringCase(int hash, String JavaDoc s) {
1735    if (s == null) return hash;
1736    int h = hash;
1737    int n = s.length();
1738    for (int i = 0; i < n; i++)
1739        h = 31 * h + toLower(s.charAt(i));
1740    return h;
1741    }
1742
1743    private static int compare(String JavaDoc s, String JavaDoc t) {
1744    if (s == t) return 0;
1745    if (s != null) {
1746        if (t != null)
1747        return s.compareTo(t);
1748        else
1749        return +1;
1750    } else {
1751        return -1;
1752    }
1753    }
1754
1755    // US-ASCII only
1756
private static int compareIgnoringCase(String JavaDoc s, String JavaDoc t) {
1757    if (s == t) return 0;
1758    if (s != null) {
1759        if (t != null) {
1760        int sn = s.length();
1761        int tn = t.length();
1762        int n = sn < tn ? sn : tn;
1763        for (int i = 0; i < n; i++) {
1764            int c = toLower(s.charAt(i)) - toLower(t.charAt(i));
1765            if (c != 0)
1766            return c;
1767        }
1768        return sn - tn;
1769        }
1770        return +1;
1771    } else {
1772        return -1;
1773    }
1774    }
1775
1776
1777    // -- String construction --
1778

1779    // If a scheme is given then the path, if given, must be absolute
1780
//
1781
private static void checkPath(String JavaDoc s, String JavaDoc scheme, String JavaDoc path)
1782    throws URISyntaxException JavaDoc
1783    {
1784    if (scheme != null) {
1785        if ((path != null)
1786        && ((path.length() > 0) && (path.charAt(0) != '/')))
1787        throw new URISyntaxException JavaDoc(s,
1788                         "Relative path in absolute URI");
1789    }
1790    }
1791
1792    private void appendAuthority(StringBuffer JavaDoc sb,
1793                 String JavaDoc authority,
1794                 String JavaDoc userInfo,
1795                 String JavaDoc host,
1796                 int port)
1797    {
1798    if (host != null) {
1799        sb.append("//");
1800        if (userInfo != null) {
1801        sb.append(quote(userInfo, L_USERINFO, H_USERINFO));
1802        sb.append('@');
1803        }
1804        boolean needBrackets = ((host.indexOf(':') >= 0)
1805                    && !host.startsWith("[")
1806                    && !host.endsWith("]"));
1807        if (needBrackets) sb.append('[');
1808        sb.append(host);
1809        if (needBrackets) sb.append(']');
1810        if (port != -1) {
1811        sb.append(':');
1812        sb.append(port);
1813        }
1814    } else if (authority != null) {
1815        sb.append("//");
1816        if (authority.startsWith("[")) {
1817        int end = authority.indexOf("]");
1818        if (end != -1 && authority.indexOf(":")!=-1) {
1819            String JavaDoc doquote, dontquote;
1820            if (end == authority.length()) {
1821            dontquote = authority;
1822            doquote = "";
1823            } else {
1824                dontquote = authority.substring(0,end+1);
1825            doquote = authority.substring(end+1);
1826            }
1827            sb.append (dontquote);
1828                sb.append(quote(doquote,
1829                L_REG_NAME | L_SERVER,
1830                H_REG_NAME | H_SERVER));
1831        }
1832        } else {
1833            sb.append(quote(authority,
1834                L_REG_NAME | L_SERVER,
1835                H_REG_NAME | H_SERVER));
1836        }
1837    }
1838    }
1839
1840    private void appendSchemeSpecificPart(StringBuffer JavaDoc sb,
1841                      String JavaDoc opaquePart,
1842                      String JavaDoc authority,
1843                      String JavaDoc userInfo,
1844                      String JavaDoc host,
1845                      int port,
1846                      String JavaDoc path,
1847                      String JavaDoc query)
1848    {
1849    if (opaquePart != null) {
1850        /* check if SSP begins with an IPv6 address
1851         * because we must not quote a literal IPv6 address
1852         */

1853        if (opaquePart.startsWith("//[")) {
1854        int end = opaquePart.indexOf("]");
1855        if (end != -1 && opaquePart.indexOf(":")!=-1) {
1856            String JavaDoc doquote, dontquote;
1857            if (end == opaquePart.length()) {
1858            dontquote = opaquePart;
1859            doquote = "";
1860            } else {
1861                dontquote = opaquePart.substring(0,end+1);
1862            doquote = opaquePart.substring(end+1);
1863            }
1864            sb.append (dontquote);
1865                sb.append(quote(doquote, L_URIC, H_URIC));
1866        }
1867        } else {
1868            sb.append(quote(opaquePart, L_URIC, H_URIC));
1869        }
1870    } else {
1871        appendAuthority(sb, authority, userInfo, host, port);
1872        if (path != null)
1873        sb.append(quote(path, L_PATH, H_PATH));
1874        if (query != null) {
1875        sb.append('?');
1876        sb.append(quote(query, L_URIC, H_URIC));
1877        }
1878    }
1879    }
1880
1881    private void appendFragment(StringBuffer JavaDoc sb, String JavaDoc fragment) {
1882    if (fragment != null) {
1883        sb.append('#');
1884        sb.append(quote(fragment, L_URIC, H_URIC));
1885    }
1886    }
1887
1888    
1889    //
1890
// Note for maintainer: sun.net.www.ParseUtil.createURI(...) clones
1891
// this method and all necessary auxiliary code to fix 6274990-2127017.
1892
// Any change made here should be propagated to sun.net.www.ParseUtil.
1893
// The requirement only applies to 5.0 update release.
1894
//
1895
private String JavaDoc toString(String JavaDoc scheme,
1896                String JavaDoc opaquePart,
1897                String JavaDoc authority,
1898                String JavaDoc userInfo,
1899                String JavaDoc host,
1900                int port,
1901                String JavaDoc path,
1902                String JavaDoc query,
1903                String JavaDoc fragment)
1904    {
1905    StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
1906    if (scheme != null) {
1907        sb.append(scheme);
1908        sb.append(':');
1909    }
1910    appendSchemeSpecificPart(sb, opaquePart,
1911                 authority, userInfo, host, port,
1912                 path, query);
1913    appendFragment(sb, fragment);
1914    return sb.toString();
1915    }
1916
1917    private void defineSchemeSpecificPart() {
1918    if (schemeSpecificPart != null) return;
1919    StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
1920    appendSchemeSpecificPart(sb, null, getAuthority(), getUserInfo(),
1921                 host, port, getPath(), getQuery());
1922    if (sb.length() == 0) return;
1923    schemeSpecificPart = sb.toString();
1924    }
1925
1926    private void defineString() {
1927    if (string != null) return;
1928
1929    StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
1930        if (scheme != null) {
1931            sb.append(scheme);
1932            sb.append(':');
1933        }
1934    if (isOpaque()) {
1935            sb.append(schemeSpecificPart);
1936        } else {
1937        if (host != null) {
1938                sb.append("//");
1939                if (userInfo != null) {
1940                    sb.append(userInfo);
1941                    sb.append('@');
1942                }
1943                boolean needBrackets = ((host.indexOf(':') >= 0)
1944                                    && !host.startsWith("[")
1945                                    && !host.endsWith("]"));
1946                if (needBrackets) sb.append('[');
1947                sb.append(host);
1948                if (needBrackets) sb.append(']');
1949                if (port != -1) {
1950                    sb.append(':');
1951                    sb.append(port);
1952                }
1953            } else if (authority != null) {
1954                sb.append("//");
1955                sb.append(authority);
1956        }
1957            if (path != null)
1958                sb.append(path);
1959            if (query != null) {
1960                sb.append('?');
1961                sb.append(query);
1962            }
1963        }
1964    if (fragment != null) {
1965            sb.append('#');
1966            sb.append(fragment);
1967    }
1968    string = sb.toString();
1969    }
1970
1971
1972    // -- Normalization, resolution, and relativization --
1973

1974    // RFC2396 5.2 (6)
1975
private static String JavaDoc resolvePath(String JavaDoc base, String JavaDoc child,
1976                      boolean absolute)
1977    {
1978        int i = base.lastIndexOf('/');
1979    int cn = child.length();
1980    String JavaDoc path = "";
1981
1982    if (cn == 0) {
1983        // 5.2 (6a)
1984
if (i >= 0)
1985        path = base.substring(0, i + 1);
1986    } else {
1987        StringBuffer JavaDoc sb = new StringBuffer JavaDoc(base.length() + cn);
1988        // 5.2 (6a)
1989
if (i >= 0)
1990        sb.append(base.substring(0, i + 1));
1991        // 5.2 (6b)
1992
sb.append(child);
1993        path = sb.toString();
1994    }
1995
1996    // 5.2 (6c-f)
1997
String JavaDoc np = normalize(path);
1998
1999    // 5.2 (6g): If the result is absolute but the path begins with "../",
2000
// then we simply leave the path as-is
2001

2002    return np;
2003    }
2004
2005    // RFC2396 5.2
2006
private static URI JavaDoc resolve(URI JavaDoc base, URI JavaDoc child) {
2007    // check if child if opaque first so that NPE is thrown
2008
// if child is null.
2009
if (child.isOpaque() || base.isOpaque())
2010        return child;
2011
2012    // 5.2 (2): Reference to current document (lone fragment)
2013
if ((child.scheme == null) && (child.authority == null)
2014        && child.path.equals("") && (child.fragment != null)
2015        && (child.query == null)) {
2016        if ((base.fragment != null)
2017        && child.fragment.equals(base.fragment)) {
2018        return base;
2019        }
2020        URI JavaDoc ru = new URI JavaDoc();
2021        ru.scheme = base.scheme;
2022        ru.authority = base.authority;
2023        ru.userInfo = base.userInfo;
2024        ru.host = base.host;
2025        ru.port = base.port;
2026        ru.path = base.path;
2027        ru.fragment = child.fragment;
2028        ru.query = base.query;
2029        return ru;
2030    }
2031
2032    // 5.2 (3): Child is absolute
2033
if (child.scheme != null)
2034        return child;
2035
2036    URI JavaDoc ru = new URI JavaDoc(); // Resolved URI
2037
ru.scheme = base.scheme;
2038    ru.query = child.query;
2039    ru.fragment = child.fragment;
2040
2041    // 5.2 (4): Authority
2042
if (child.authority == null) {
2043        ru.authority = base.authority;
2044        ru.host = base.host;
2045        ru.userInfo = base.userInfo;
2046        ru.port = base.port;
2047
2048        String JavaDoc cp = (child.path == null) ? "" : child.path;
2049        if ((cp.length() > 0) && (cp.charAt(0) == '/')) {
2050        // 5.2 (5): Child path is absolute
2051
ru.path = child.path;
2052        } else {
2053        // 5.2 (6): Resolve relative path
2054
ru.path = resolvePath(base.path, cp, base.isAbsolute());
2055        }
2056    } else {
2057        ru.authority = child.authority;
2058        ru.host = child.host;
2059        ru.userInfo = child.userInfo;
2060        ru.host = child.host;
2061        ru.port = child.port;
2062        ru.path = child.path;
2063    }
2064
2065    // 5.2 (7): Recombine (nothing to do here)
2066
return ru;
2067    }
2068
2069    // If the given URI's path is normal then return the URI;
2070
// o.w., return a new URI containing the normalized path.
2071
//
2072
private static URI JavaDoc normalize(URI JavaDoc u) {
2073    if (u.isOpaque() || (u.path == null) || (u.path.length() == 0))
2074        return u;
2075
2076    String JavaDoc np = normalize(u.path);
2077    if (np == u.path)
2078        return u;
2079
2080    URI JavaDoc v = new URI JavaDoc();
2081    v.scheme = u.scheme;
2082    v.fragment = u.fragment;
2083    v.authority = u.authority;
2084    v.userInfo = u.userInfo;
2085    v.host = u.host;
2086    v.port = u.port;
2087    v.path = np;
2088    v.query = u.query;
2089    return v;
2090    }
2091
2092    // If both URIs are hierarchical, their scheme and authority components are
2093
// identical, and the base path is a prefix of the child's path, then
2094
// return a relative URI that, when resolved against the base, yields the
2095
// child; otherwise, return the child.
2096
//
2097
private static URI JavaDoc relativize(URI JavaDoc base, URI JavaDoc child) {
2098    // check if child if opaque first so that NPE is thrown
2099
// if child is null.
2100
if (child.isOpaque() || base.isOpaque())
2101        return child;
2102    if (!equalIgnoringCase(base.scheme, child.scheme)
2103        || !equal(base.authority, child.authority))
2104        return child;
2105
2106    String JavaDoc bp = normalize(base.path);
2107    String JavaDoc cp = normalize(child.path);
2108    if (!bp.equals(cp)) {
2109        if (!bp.endsWith("/"))
2110        bp = bp + "/";
2111        if (!cp.startsWith(bp))
2112        return child;
2113    }
2114
2115    URI JavaDoc v = new URI JavaDoc();
2116    v.path = cp.substring(bp.length());
2117    v.query = child.query;
2118    v.fragment = child.fragment;
2119    return v;
2120    }
2121
2122
2123
2124    // -- Path normalization --
2125

2126    // The following algorithm for path normalization avoids the creation of a
2127
// string object for each segment, as well as the use of a string buffer to
2128
// compute the final result, by using a single char array and editing it in
2129
// place. The array is first split into segments, replacing each slash
2130
// with '\0' and creating a segment-index array, each element of which is
2131
// the index of the first char in the corresponding segment. We then walk
2132
// through both arrays, removing ".", "..", and other segments as necessary
2133
// by setting their entries in the index array to -1. Finally, the two
2134
// arrays are used to rejoin the segments and compute the final result.
2135
//
2136
// This code is based upon src/solaris/native/java/io/canonicalize_md.c
2137

2138
2139    // Check the given path to see if it might need normalization. A path
2140
// might need normalization if it contains duplicate slashes, a "."
2141
// segment, or a ".." segment. Return -1 if no further normalization is
2142
// possible, otherwise return the number of segments found.
2143
//
2144
// This method takes a string argument rather than a char array so that
2145
// this test can be performed without invoking path.toCharArray().
2146
//
2147
static private int needsNormalization(String JavaDoc path) {
2148    boolean normal = true;
2149    int ns = 0; // Number of segments
2150
int end = path.length() - 1; // Index of last char in path
2151
int p = 0; // Index of next char in path
2152

2153    // Skip initial slashes
2154
while (p <= end) {
2155        if (path.charAt(p) != '/') break;
2156        p++;
2157    }
2158    if (p > 1) normal = false;
2159
2160    // Scan segments
2161
while (p <= end) {
2162
2163        // Looking at "." or ".." ?
2164
if ((path.charAt(p) == '.')
2165        && ((p == end)
2166            || ((path.charAt(p + 1) == '/')
2167            || ((path.charAt(p + 1) == '.')
2168                && ((p + 1 == end)
2169                || (path.charAt(p + 2) == '/')))))) {
2170        normal = false;
2171        }
2172        ns++;
2173
2174        // Find beginning of next segment
2175
while (p <= end) {
2176        if (path.charAt(p++) != '/')
2177            continue;
2178
2179        // Skip redundant slashes
2180
while (p <= end) {
2181            if (path.charAt(p) != '/') break;
2182            normal = false;
2183            p++;
2184        }
2185
2186        break;
2187        }
2188    }
2189
2190    return normal ? -1 : ns;
2191    }
2192
2193
2194    // Split the given path into segments, replacing slashes with nulls and
2195
// filling in the given segment-index array.
2196
//
2197
// Preconditions:
2198
// segs.length == Number of segments in path
2199
//
2200
// Postconditions:
2201
// All slashes in path replaced by '\0'
2202
// segs[i] == Index of first char in segment i (0 <= i < segs.length)
2203
//
2204
static private void split(char[] path, int[] segs) {
2205    int end = path.length - 1; // Index of last char in path
2206
int p = 0; // Index of next char in path
2207
int i = 0; // Index of current segment
2208

2209    // Skip initial slashes
2210
while (p <= end) {
2211        if (path[p] != '/') break;
2212        path[p] = '\0';
2213        p++;
2214    }
2215
2216    while (p <= end) {
2217
2218        // Note start of segment
2219
segs[i++] = p++;
2220
2221        // Find beginning of next segment
2222
while (p <= end) {
2223        if (path[p++] != '/')
2224            continue;
2225        path[p - 1] = '\0';
2226
2227        // Skip redundant slashes
2228
while (p <= end) {
2229            if (path[p] != '/') break;
2230            path[p++] = '\0';
2231        }
2232        break;
2233        }
2234    }
2235
2236    if (i != segs.length)
2237        throw new InternalError JavaDoc(); // ASSERT
2238
}
2239
2240
2241    // Join the segments in the given path according to the given segment-index
2242
// array, ignoring those segments whose index entries have been set to -1,
2243
// and inserting slashes as needed. Return the length of the resulting
2244
// path.
2245
//
2246
// Preconditions:
2247
// segs[i] == -1 implies segment i is to be ignored
2248
// path computed by split, as above, with '\0' having replaced '/'
2249
//
2250
// Postconditions:
2251
// path[0] .. path[return value - 1] == Resulting path
2252
//
2253
static private int join(char[] path, int[] segs) {
2254    int ns = segs.length; // Number of segments
2255
int end = path.length - 1; // Index of last char in path
2256
int p = 0; // Index of next path char to write
2257

2258    if (path[p] == '\0') {
2259        // Restore initial slash for absolute paths
2260
path[p++] = '/';
2261    }
2262
2263    for (int i = 0; i < ns; i++) {
2264        int q = segs[i]; // Current segment
2265
if (q == -1)
2266        // Ignore this segment
2267
continue;
2268
2269        if (p == q) {
2270        // We're already at this segment, so just skip to its end
2271
while ((p <= end) && (path[p] != '\0'))
2272            p++;
2273        if (p <= end) {
2274            // Preserve trailing slash
2275
path[p++] = '/';
2276        }
2277        } else if (p < q) {
2278        // Copy q down to p
2279
while ((q <= end) && (path[q] != '\0'))
2280            path[p++] = path[q++];
2281        if (q <= end) {
2282            // Preserve trailing slash
2283
path[p++] = '/';
2284        }
2285        } else
2286        throw new InternalError JavaDoc(); // ASSERT false
2287
}
2288
2289    return p;
2290    }
2291
2292
2293    // Remove "." segments from the given path, and remove segment pairs
2294
// consisting of a non-".." segment followed by a ".." segment.
2295
//
2296
private static void removeDots(char[] path, int[] segs) {
2297    int ns = segs.length;
2298    int end = path.length - 1;
2299
2300    for (int i = 0; i < ns; i++) {
2301        int dots = 0; // Number of dots found (0, 1, or 2)
2302

2303        // Find next occurrence of "." or ".."
2304
do {
2305        int p = segs[i];
2306        if (path[p] == '.') {
2307            if (p == end) {
2308            dots = 1;
2309            break;
2310            } else if (path[p + 1] == '\0') {
2311            dots = 1;
2312            break;
2313            } else if ((path[p + 1] == '.')
2314                   && ((p + 1 == end)
2315                   || (path[p + 2] == '\0'))) {
2316            dots = 2;
2317            break;
2318            }
2319        }
2320        i++;
2321        } while (i < ns);
2322        if ((i > ns) || (dots == 0))
2323        break;
2324
2325        if (dots == 1) {
2326        // Remove this occurrence of "."
2327
segs[i] = -1;
2328        } else {
2329        // If there is a preceding non-".." segment, remove both that
2330
// segment and this occurrence of ".."; otherwise, leave this
2331
// ".." segment as-is.
2332
int j;
2333        for (j = i - 1; j >= 0; j--) {
2334            if (segs[j] != -1) break;
2335        }
2336        if (j >= 0) {
2337            int q = segs[j];
2338            if (!((path[q] == '.')
2339              && (path[q + 1] == '.')
2340              && (path[q + 2] == '\0'))) {
2341            segs[i] = -1;
2342            segs[j] = -1;
2343            }
2344        }
2345        }
2346    }
2347    }
2348
2349
2350    // DEVIATION: If the normalized path is relative, and if the first
2351
// segment could be parsed as a scheme name, then prepend a "." segment
2352
//
2353
private static void maybeAddLeadingDot(char[] path, int[] segs) {
2354
2355    if (path[0] == '\0')
2356        // The path is absolute
2357
return;
2358
2359    int ns = segs.length;
2360    int f = 0; // Index of first segment
2361
while (f < ns) {
2362        if (segs[f] >= 0)
2363        break;
2364        f++;
2365    }
2366    if ((f >= ns) || (f == 0))
2367        // The path is empty, or else the original first segment survived,
2368
// in which case we already know that no leading "." is needed
2369
return;
2370
2371    int p = segs[f];
2372    while ((p < path.length) && (path[p] != ':') && (path[p] != '\0')) p++;
2373    if (p >= path.length || path[p] == '\0')
2374        // No colon in first segment, so no "." needed
2375
return;
2376
2377    // At this point we know that the first segment is unused,
2378
// hence we can insert a "." segment at that position
2379
path[0] = '.';
2380    path[1] = '\0';
2381    segs[0] = 0;
2382    }
2383
2384
2385    // Normalize the given path string. A normal path string has no empty
2386
// segments (i.e., occurrences of "//"), no segments equal to ".", and no
2387
// segments equal to ".." that are preceded by a segment not equal to "..".
2388
// In contrast to Unix-style pathname normalization, for URI paths we
2389
// always retain trailing slashes.
2390
//
2391
private static String JavaDoc normalize(String JavaDoc ps) {
2392
2393    // Does this path need normalization?
2394
int ns = needsNormalization(ps); // Number of segments
2395
if (ns < 0)
2396        // Nope -- just return it
2397
return ps;
2398
2399    char[] path = ps.toCharArray(); // Path in char-array form
2400

2401    // Split path into segments
2402
int[] segs = new int[ns]; // Segment-index array
2403
split(path, segs);
2404
2405    // Remove dots
2406
removeDots(path, segs);
2407
2408    // Prevent scheme-name confusion
2409
maybeAddLeadingDot(path, segs);
2410
2411    // Join the remaining segments and return the result
2412
String JavaDoc s = new String JavaDoc(path, 0, join(path, segs));
2413    if (s.equals(ps)) {
2414        // string was already normalized
2415
return ps;
2416    }
2417    return s;
2418    }
2419
2420
2421
2422    // -- Character classes for parsing --
2423

2424    // RFC2396 precisely specifies which characters in the US-ASCII charset are
2425
// permissible in the various components of a URI reference. We here
2426
// define a set of mask pairs to aid in enforcing these restrictions. Each
2427
// mask pair consists of two longs, a low mask and a high mask. Taken
2428
// together they represent a 128-bit mask, where bit i is set iff the
2429
// character with value i is permitted.
2430
//
2431
// This approach is more efficient than sequentially searching arrays of
2432
// permitted characters. It could be made still more efficient by
2433
// precompiling the mask information so that a character's presence in a
2434
// given mask could be determined by a single table lookup.
2435

2436    // Compute the low-order mask for the characters in the given string
2437
private static long lowMask(String JavaDoc chars) {
2438    int n = chars.length();
2439    long m = 0;
2440    for (int i = 0; i < n; i++) {
2441        char c = chars.charAt(i);
2442        if (c < 64)
2443        m |= (1L << c);
2444    }
2445    return m;
2446    }
2447
2448    // Compute the high-order mask for the characters in the given string
2449
private static long highMask(String JavaDoc chars) {
2450    int n = chars.length();
2451    long m = 0;
2452    for (int i = 0; i < n; i++) {
2453        char c = chars.charAt(i);
2454        if ((c >= 64) && (c < 128))
2455        m |= (1L << (c - 64));
2456    }
2457    return m;
2458    }
2459
2460    // Compute a low-order mask for the characters
2461
// between first and last, inclusive
2462
private static long lowMask(char first, char last) {
2463    long m = 0;
2464    int f = Math.max(Math.min(first, 63), 0);
2465    int l = Math.max(Math.min(last, 63), 0);
2466    for (int i = f; i <= l; i++)
2467        m |= 1L << i;
2468    return m;
2469    }
2470
2471    // Compute a high-order mask for the characters
2472
// between first and last, inclusive
2473
private static long highMask(char first, char last) {
2474    long m = 0;
2475    int f = Math.max(Math.min(first, 127), 64) - 64;
2476    int l = Math.max(Math.min(last, 127), 64) - 64;
2477    for (int i = f; i <= l; i++)
2478        m |= 1L << i;
2479    return m;
2480    }
2481
2482    // Tell whether the given character is permitted by the given mask pair
2483
private static boolean match(char c, long lowMask, long highMask) {
2484    if (c < 64)
2485        return ((1L << c) & lowMask) != 0;
2486    if (c < 128)
2487        return ((1L << (c - 64)) & highMask) != 0;
2488    return false;
2489    }
2490
2491    // Character-class masks, in reverse order from RFC2396 because
2492
// initializers for static fields cannot make forward references.
2493

2494    // digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
2495
// "8" | "9"
2496
private static final long L_DIGIT = lowMask('0', '9');
2497    private static final long H_DIGIT = 0L;
2498
2499    // upalpha = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
2500
// "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
2501
// "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
2502
private static final long L_UPALPHA = 0L;
2503    private static final long H_UPALPHA = highMask('A', 'Z');
2504
2505    // lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" |
2506
// "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" |
2507
// "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
2508
private static final long L_LOWALPHA = 0L;
2509    private static final long H_LOWALPHA = highMask('a', 'z');
2510
2511    // alpha = lowalpha | upalpha
2512
private static final long L_ALPHA = L_LOWALPHA | L_UPALPHA;
2513    private static final long H_ALPHA = H_LOWALPHA | H_UPALPHA;
2514
2515    // alphanum = alpha | digit
2516
private static final long L_ALPHANUM = L_DIGIT | L_ALPHA;
2517    private static final long H_ALPHANUM = H_DIGIT | H_ALPHA;
2518
2519    // hex = digit | "A" | "B" | "C" | "D" | "E" | "F" |
2520
// "a" | "b" | "c" | "d" | "e" | "f"
2521
private static final long L_HEX = L_DIGIT;
2522    private static final long H_HEX = highMask('A', 'F') | highMask('a', 'f');
2523
2524    // mark = "-" | "_" | "." | "!" | "~" | "*" | "'" |
2525
// "(" | ")"
2526
private static final long L_MARK = lowMask("-_.!~*'()");
2527    private static final long H_MARK = highMask("-_.!~*'()");
2528
2529    // unreserved = alphanum | mark
2530
private static final long L_UNRESERVED = L_ALPHANUM | L_MARK;
2531    private static final long H_UNRESERVED = H_ALPHANUM | H_MARK;
2532
2533    // reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
2534
// "$" | "," | "[" | "]"
2535
// Added per RFC2732: "[", "]"
2536
private static final long L_RESERVED = lowMask(";/?:@&=+$,[]");
2537    private static final long H_RESERVED = highMask(";/?:@&=+$,[]");
2538
2539    // The zero'th bit is used to indicate that escape pairs and non-US-ASCII
2540
// characters are allowed; this is handled by the scanEscape method below.
2541
private static final long L_ESCAPED = 1L;
2542    private static final long H_ESCAPED = 0L;
2543
2544    // uric = reserved | unreserved | escaped
2545
private static final long L_URIC = L_RESERVED | L_UNRESERVED | L_ESCAPED;
2546    private static final long H_URIC = H_RESERVED | H_UNRESERVED | H_ESCAPED;
2547
2548    // pchar = unreserved | escaped |
2549
// ":" | "@" | "&" | "=" | "+" | "$" | ","
2550
private static final long L_PCHAR
2551    = L_UNRESERVED | L_ESCAPED | lowMask(":@&=+$,");
2552    private static final long H_PCHAR
2553    = H_UNRESERVED | H_ESCAPED | highMask(":@&=+$,");
2554
2555    // All valid path characters
2556
private static final long L_PATH = L_PCHAR | lowMask(";/");
2557    private static final long H_PATH = H_PCHAR | highMask(";/");
2558
2559    // Dash, for use in domainlabel and toplabel
2560
private static final long L_DASH = lowMask("-");
2561    private static final long H_DASH = highMask("-");
2562
2563    // Dot, for use in hostnames
2564
private static final long L_DOT = lowMask(".");
2565    private static final long H_DOT = highMask(".");
2566
2567    // userinfo = *( unreserved | escaped |
2568
// ";" | ":" | "&" | "=" | "+" | "$" | "," )
2569
private static final long L_USERINFO
2570    = L_UNRESERVED | L_ESCAPED | lowMask(";:&=+$,");
2571    private static final long H_USERINFO
2572    = H_UNRESERVED | H_ESCAPED | highMask(";:&=+$,");
2573
2574    // reg_name = 1*( unreserved | escaped | "$" | "," |
2575
// ";" | ":" | "@" | "&" | "=" | "+" )
2576
private static final long L_REG_NAME
2577    = L_UNRESERVED | L_ESCAPED | lowMask("$,;:@&=+");
2578    private static final long H_REG_NAME
2579    = H_UNRESERVED | H_ESCAPED | highMask("$,;:@&=+");
2580
2581    // All valid characters for server-based authorities
2582
private static final long L_SERVER
2583    = L_USERINFO | L_ALPHANUM | L_DASH | lowMask(".:@[]");
2584    private static final long H_SERVER
2585    = H_USERINFO | H_ALPHANUM | H_DASH | highMask(".:@[]");
2586
2587    // Special case of server authority that represents an IPv6 address
2588
// In this case, a % does not signify an escape sequence
2589
private static final long L_SERVER_PERCENT
2590    = L_SERVER | lowMask("%");
2591    private static final long H_SERVER_PERCENT
2592    = H_SERVER | highMask("%");
2593    private static final long L_LEFT_BRACKET = lowMask("[");
2594    private static final long H_LEFT_BRACKET = highMask("[");
2595
2596    // scheme = alpha *( alpha | digit | "+" | "-" | "." )
2597
private static final long L_SCHEME = L_ALPHA | L_DIGIT | lowMask("+-.");
2598    private static final long H_SCHEME = H_ALPHA | H_DIGIT | highMask("+-.");
2599
2600    // uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
2601
// "&" | "=" | "+" | "$" | ","
2602
private static final long L_URIC_NO_SLASH
2603    = L_UNRESERVED | L_ESCAPED | lowMask(";?:@&=+$,");
2604    private static final long H_URIC_NO_SLASH
2605    = H_UNRESERVED | H_ESCAPED | highMask(";?:@&=+$,");
2606
2607
2608    // -- Escaping and encoding --
2609

2610    private final static char[] hexDigits = {
2611    '0', '1', '2', '3', '4', '5', '6', '7',
2612    '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
2613    };
2614
2615    private static void appendEscape(StringBuffer JavaDoc sb, byte b) {
2616    sb.append('%');
2617    sb.append(hexDigits[(b >> 4) & 0x0f]);
2618    sb.append(hexDigits[(b >> 0) & 0x0f]);
2619    }
2620
2621    private static void appendEncoded(StringBuffer JavaDoc sb, char c) {
2622    ByteBuffer JavaDoc bb = null;
2623    try {
2624        bb = ThreadLocalCoders.encoderFor("UTF-8")
2625        .encode(CharBuffer.wrap("" + c));
2626    } catch (CharacterCodingException JavaDoc x) {
2627        assert false;
2628    }
2629    while (bb.hasRemaining()) {
2630        int b = bb.get() & 0xff;
2631        if (b >= 0x80)
2632        appendEscape(sb, (byte)b);
2633        else
2634        sb.append((char)b);
2635    }
2636    }
2637
2638    // Quote any characters in s that are not permitted
2639
// by the given mask pair
2640
//
2641
private static String JavaDoc quote(String JavaDoc s, long lowMask, long highMask) {
2642    int n = s.length();
2643    StringBuffer JavaDoc sb = null;
2644    boolean allowNonASCII = ((lowMask & L_ESCAPED) != 0);
2645    for (int i = 0; i < s.length(); i++) {
2646        char c = s.charAt(i);
2647        if (c < '\u0080') {
2648        if (!match(c, lowMask, highMask)) {
2649            if (sb == null) {
2650            sb = new StringBuffer JavaDoc();
2651            sb.append(s.substring(0, i));
2652            }
2653            appendEscape(sb, (byte)c);
2654        } else {
2655            if (sb != null)
2656            sb.append(c);
2657        }
2658        } else if (allowNonASCII
2659               && (Character.isSpaceChar(c)
2660               || Character.isISOControl(c))) {
2661        if (sb == null) {
2662            sb = new StringBuffer JavaDoc();
2663            sb.append(s.substring(0, i));
2664        }
2665        appendEncoded(sb, c);
2666        } else {
2667        if (sb != null)
2668            sb.append(c);
2669        }
2670    }
2671    return (sb == null) ? s : sb.toString();
2672    }
2673
2674    // Encodes all characters >= \u0080 into escaped, normalized UTF-8 octets,
2675
// assuming that s is otherwise legal
2676
//
2677
private static String JavaDoc encode(String JavaDoc s) {
2678    int n = s.length();
2679    if (n == 0)
2680        return s;
2681
2682    // First check whether we actually need to encode
2683
for (int i = 0;;) {
2684        if (s.charAt(i) >= '\u0080')
2685        break;
2686        if (++i >= n)
2687        return s;
2688    }
2689
2690    String JavaDoc ns = Normalizer.normalize(s, Normalizer.COMPOSE, 0);
2691    ByteBuffer JavaDoc bb = null;
2692    try {
2693        bb = ThreadLocalCoders.encoderFor("UTF-8")
2694        .encode(CharBuffer.wrap(ns));
2695    } catch (CharacterCodingException JavaDoc x) {
2696        assert false;
2697    }
2698
2699    StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
2700    while (bb.hasRemaining()) {
2701        int b = bb.get() & 0xff;
2702        if (b >= 0x80)
2703        appendEscape(sb, (byte)b);
2704        else
2705        sb.append((char)b);
2706    }
2707    return sb.toString();
2708    }
2709
2710    private static int decode(char c) {
2711    if ((c >= '0') && (c <= '9'))
2712        return c - '0';
2713    if ((c >= 'a') && (c <= 'f'))
2714        return c - 'a' + 10;
2715    if ((c >= 'A') && (c <= 'F'))
2716        return c - 'A' + 10;
2717    assert false;
2718    return -1;
2719    }
2720
2721    private static byte decode(char c1, char c2) {
2722    return (byte)( ((decode(c1) & 0xf) << 4)
2723              | ((decode(c2) & 0xf) << 0));
2724    }
2725
2726    // Evaluates all escapes in s, applying UTF-8 decoding if needed. Assumes
2727
// that escapes are well-formed syntactically, i.e., of the form %XX. If a
2728
// sequence of escaped octets is not valid UTF-8 then the erroneous octets
2729
// are replaced with '\uFFFD'.
2730
// Exception: any "%" found between "[]" is left alone. It is an IPv6 literal
2731
// with a scope_id
2732
//
2733
private static String JavaDoc decode(String JavaDoc s) {
2734    if (s == null)
2735        return s;
2736    int n = s.length();
2737    if (n == 0)
2738        return s;
2739    if (s.indexOf('%') < 0)
2740        return s;
2741
2742    byte[] ba = new byte[n];
2743    StringBuffer JavaDoc sb = new StringBuffer JavaDoc(n);
2744    ByteBuffer JavaDoc bb = ByteBuffer.allocate(n);
2745    CharBuffer JavaDoc cb = CharBuffer.allocate(n);
2746    CharsetDecoder JavaDoc dec = ThreadLocalCoders.decoderFor("UTF-8")
2747        .onMalformedInput(CodingErrorAction.REPLACE)
2748        .onUnmappableCharacter(CodingErrorAction.REPLACE);
2749
2750    // This is not horribly efficient, but it will do for now
2751
char c = s.charAt(0);
2752        boolean betweenBrackets = false;
2753
2754    for (int i = 0; i < n;) {
2755        assert c == s.charAt(i); // Loop invariant
2756
if (c == '[') {
2757        betweenBrackets = true;
2758        } else if (betweenBrackets && c == ']') {
2759        betweenBrackets = false;
2760        }
2761        if (c != '%' || betweenBrackets) {
2762        sb.append(c);
2763        if (++i >= n)
2764            break;
2765        c = s.charAt(i);
2766        continue;
2767        }
2768        bb.clear();
2769        int ui = i;
2770        for (;;) {
2771        assert (n - i >= 2);
2772        bb.put(decode(s.charAt(++i), s.charAt(++i)));
2773        if (++i >= n)
2774            break;
2775        c = s.charAt(i);
2776        if (c != '%')
2777            break;
2778        }
2779        bb.flip();
2780        cb.clear();
2781        dec.reset();
2782        CoderResult JavaDoc cr = dec.decode(bb, cb, true);
2783        assert cr.isUnderflow();
2784        cr = dec.flush(cb);
2785        assert cr.isUnderflow();
2786        sb.append(cb.flip().toString());
2787    }
2788
2789    return sb.toString();
2790    }
2791
2792
2793    // -- Parsing --
2794

2795    // For convenience we wrap the input URI string in a new instance of the
2796
// following internal class. This saves always having to pass the input
2797
// string as an argument to each internal scan/parse method.
2798

2799    private class Parser {
2800
2801    private String JavaDoc input; // URI input string
2802
private boolean requireServerAuthority = false;
2803
2804    Parser(String JavaDoc s) {
2805        input = s;
2806        string = s;
2807    }
2808
2809    // -- Methods for throwing URISyntaxException in various ways --
2810

2811    private void fail(String JavaDoc reason) throws URISyntaxException JavaDoc {
2812        throw new URISyntaxException JavaDoc(input, reason);
2813    }
2814
2815    private void fail(String JavaDoc reason, int p) throws URISyntaxException JavaDoc {
2816        throw new URISyntaxException JavaDoc(input, reason, p);
2817    }
2818
2819    private void failExpecting(String JavaDoc expected, int p)
2820        throws URISyntaxException JavaDoc
2821    {
2822        fail("Expected " + expected, p);
2823    }
2824
2825    private void failExpecting(String JavaDoc expected, String JavaDoc prior, int p)
2826        throws URISyntaxException JavaDoc
2827    {
2828        fail("Expected " + expected + " following " + prior, p);
2829    }
2830
2831
2832    // -- Simple access to the input string --
2833

2834    // Return a substring of the input string
2835
//
2836
private String JavaDoc substring(int start, int end) {
2837        return input.substring(start, end);
2838    }
2839
2840    // Return the char at position p,
2841
// assuming that p < input.length()
2842
//
2843
private char charAt(int p) {
2844        return input.charAt(p);
2845    }
2846
2847    // Tells whether start < end and, if so, whether charAt(start) == c
2848
//
2849
private boolean at(int start, int end, char c) {
2850        return (start < end) && (charAt(start) == c);
2851    }
2852
2853    // Tells whether start + s.length() <= end and, if so,
2854
// whether the chars at the start position match s exactly
2855
//
2856
private boolean at(int start, int end, String JavaDoc s) {
2857        int p = start;
2858        int sn = s.length();
2859        if (sn > end - p)
2860        return false;
2861        int i = 0;
2862        while (i < sn) {
2863        if (charAt(p++) != s.charAt(i)) {
2864            break;
2865        }
2866        i++;
2867        }
2868        return (i == sn);
2869    }
2870
2871
2872    // -- Scanning --
2873

2874    // The various scan and parse methods that follow use a uniform
2875
// convention of taking the current start position and end index as
2876
// their first two arguments. The start is inclusive while the end is
2877
// exclusive, just as in the String class, i.e., a start/end pair
2878
// denotes the half-open interval [start, end) of the input string.
2879
//
2880
// These methods never proceed past the end position. They may return
2881
// -1 to indicate outright failure, but more often they simply return
2882
// the position of the first char after the last char scanned. Thus
2883
// a typical idiom is
2884
//
2885
// int p = start;
2886
// int q = scan(p, end, ...);
2887
// if (q > p)
2888
// // We scanned something
2889
// ...;
2890
// else if (q == p)
2891
// // We scanned nothing
2892
// ...;
2893
// else if (q == -1)
2894
// // Something went wrong
2895
// ...;
2896

2897
2898    // Scan a specific char: If the char at the given start position is
2899
// equal to c, return the index of the next char; otherwise, return the
2900
// start position.
2901
//
2902
private int scan(int start, int end, char c) {
2903        if ((start < end) && (charAt(start) == c))
2904        return start + 1;
2905        return start;
2906    }
2907
2908    // Scan forward from the given start position. Stop at the first char
2909
// in the err string (in which case -1 is returned), or the first char
2910
// in the stop string (in which case the index of that char is
2911
// returned), or the end position (in which case the end index
2912
// is returned). May return the start position if
2913
// nothing matches.
2914
//
2915
private int scan(int start, int end, String JavaDoc err, String JavaDoc stop) {
2916        int p = start;
2917        while (p < end) {
2918        char c = charAt(p);
2919        if (err.indexOf(c) >= 0)
2920            return -1;
2921        if (stop.indexOf(c) >= 0)
2922            break;
2923        p++;
2924        }
2925        return p;
2926    }
2927
2928    // Scan a potential escape sequence, starting at the given position,
2929
// with the given first char (i.e., charAt(start) == c).
2930
//
2931
// This method assumes that if escapes are allowed then visible
2932
// non-US-ASCII chars are also allowed.
2933
//
2934
private int scanEscape(int start, int n, char first)
2935        throws URISyntaxException JavaDoc
2936    {
2937        int p = start;
2938        char c = first;
2939        if (c == '%') {
2940        // Process escape pair
2941
if ((p + 3 <= n)
2942            && match(charAt(p + 1), L_HEX, H_HEX)
2943            && match(charAt(p + 2), L_HEX, H_HEX)) {
2944            return p + 3;
2945        }
2946        fail("Malformed escape pair", p);
2947        } else if ((c > 128)
2948               && !Character.isSpaceChar(c)
2949               && !Character.isISOControl(c)) {
2950        // Allow unescaped but visible non-US-ASCII chars
2951
return p + 1;
2952        }
2953        return p;
2954    }
2955
2956    // Scan chars that match the given mask pair
2957
//
2958
private int scan(int start, int n, long lowMask, long highMask)
2959        throws URISyntaxException JavaDoc
2960    {
2961        int p = start;
2962        while (p < n) {
2963        char c = charAt(p);
2964        if (match(c, lowMask, highMask)) {
2965            p++;
2966            continue;
2967        }
2968        if ((lowMask & L_ESCAPED) != 0) {
2969            int q = scanEscape(p, n, c);
2970            if (q > p) {
2971            p = q;
2972            continue;
2973            }
2974        }
2975        break;
2976        }
2977        return p;
2978    }
2979
2980    // Check that each of the chars in [start, end) matches the given mask
2981
//
2982
private void checkChars(int start, int end,
2983                long lowMask, long highMask,
2984                String JavaDoc what)
2985        throws URISyntaxException JavaDoc
2986    {
2987        int p = scan(start, end, lowMask, highMask);
2988        if (p < end)
2989        fail("Illegal character in " + what, p);
2990    }
2991
2992    // Check that the char at position p matches the given mask
2993
//
2994
private void checkChar(int p,
2995                   long lowMask, long highMask,
2996                   String JavaDoc what)
2997        throws URISyntaxException JavaDoc
2998    {
2999        checkChars(p, p + 1, lowMask, highMask, what);
3000    }
3001
3002
3003    // -- Parsing --
3004

3005    // [<scheme>:]<scheme-specific-part>[#<fragment>]
3006
//
3007
void parse(boolean rsa) throws URISyntaxException JavaDoc {
3008        requireServerAuthority = rsa;
3009        int ssp; // Start of scheme-specific part
3010
int n = input.length();
3011        int p = scan(0, n, "/?#", ":");
3012        if ((p >= 0) && at(p, n, ':')) {
3013        if (p == 0)
3014            failExpecting("scheme name", 0);
3015        checkChar(0, L_ALPHA, H_ALPHA, "scheme name");
3016        checkChars(1, p, L_SCHEME, H_SCHEME, "scheme name");
3017        scheme = substring(0, p);
3018        p++; // Skip ':'
3019
ssp = p;
3020        if (at(p, n, '/')) {
3021            p = parseHierarchical(p, n);
3022        } else {
3023            int q = scan(p, n, "", "#");
3024            if (q <= p)
3025            failExpecting("scheme-specific part", p);
3026            checkChars(p, q, L_URIC, H_URIC, "opaque part");
3027            p = q;
3028        }
3029        } else {
3030        ssp = 0;
3031        p = parseHierarchical(0, n);
3032        }
3033        schemeSpecificPart = substring(ssp, p);
3034        if (at(p, n, '#')) {
3035        checkChars(p + 1, n, L_URIC, H_URIC, "fragment");
3036        fragment = substring(p + 1, n);
3037        p = n;
3038        }
3039        if (p < n)
3040        fail("end of URI", p);
3041    }
3042
3043    // [//authority]<path>[?<query>]
3044
//
3045
// DEVIATION from RFC2396: We allow an empty authority component as
3046
// long as it's followed by a non-empty path, query component, or
3047
// fragment component. This is so that URIs such as "file:///foo/bar"
3048
// will parse. This seems to be the intent of RFC2396, though the
3049
// grammar does not permit it. If the authority is empty then the
3050
// userInfo, host, and port components are undefined.
3051
//
3052
// DEVIATION from RFC2396: We allow empty relative paths. This seems
3053
// to be the intent of RFC2396, but the grammar does not permit it.
3054
// The primary consequence of this deviation is that "#f" parses as a
3055
// relative URI with an empty path.
3056
//
3057
private int parseHierarchical(int start, int n)
3058        throws URISyntaxException JavaDoc
3059    {
3060        int p = start;
3061        if (at(p, n, '/') && at(p + 1, n, '/')) {
3062        p += 2;
3063        int q = scan(p, n, "", "/?#");
3064        if (q > p) {
3065            p = parseAuthority(p, q);
3066        } else if (q < n) {
3067            // DEVIATION: Allow empty authority prior to non-empty
3068
// path, query component or fragment identifier
3069
} else
3070            failExpecting("authority", p);
3071        }
3072        int q = scan(p, n, "", "?#"); // DEVIATION: May be empty
3073
checkChars(p, q, L_PATH, H_PATH, "path");
3074        path = substring(p, q);
3075        p = q;
3076        if (at(p, n, '?')) {
3077        p++;
3078        q = scan(p, n, "", "#");
3079        checkChars(p, q, L_URIC, H_URIC, "query");
3080        query = substring(p, q);
3081        p = q;
3082        }
3083        return p;
3084    }
3085
3086    // authority = server | reg_name
3087
//
3088
// Ambiguity: An authority that is a registry name rather than a server
3089
// might have a prefix that parses as a server. We use the fact that
3090
// the authority component is always followed by '/' or the end of the
3091
// input string to resolve this: If the complete authority did not
3092
// parse as a server then we try to parse it as a registry name.
3093
//
3094
private int parseAuthority(int start, int n)
3095        throws URISyntaxException JavaDoc
3096    {
3097        int p = start;
3098        int q = p;
3099        URISyntaxException JavaDoc ex = null;
3100
3101        boolean serverChars;
3102        boolean regChars;
3103
3104        if (scan(p, n, "", "]") > p) {
3105        // contains a literal IPv6 address, therefore % is allowed
3106
serverChars = (scan(p, n, L_SERVER_PERCENT, H_SERVER_PERCENT) == n);
3107        } else {
3108            serverChars = (scan(p, n, L_SERVER, H_SERVER) == n);
3109        }
3110        regChars = (scan(p, n, L_REG_NAME, H_REG_NAME) == n);
3111
3112        if (regChars && !serverChars) {
3113        // Must be a registry-based authority
3114
authority = substring(p, n);
3115        return n;
3116        }
3117
3118        if (serverChars) {
3119        // Might be (probably is) a server-based authority, so attempt
3120
// to parse it as such. If the attempt fails, try to treat it
3121
// as a registry-based authority.
3122
try {
3123            q = parseServer(p, n);
3124            if (q < n)
3125            failExpecting("end of authority", q);
3126            authority = substring(p, n);
3127        } catch (URISyntaxException JavaDoc x) {
3128            // Undo results of failed parse
3129
userInfo = null;
3130            host = null;
3131            port = -1;
3132            if (requireServerAuthority) {
3133            // If we're insisting upon a server-based authority,
3134
// then just re-throw the exception
3135
throw x;
3136            } else {
3137            // Save the exception in case it doesn't parse as a
3138
// registry either
3139
ex = x;
3140            q = p;
3141            }
3142        }
3143        }
3144
3145        if (q < n) {
3146        if (regChars) {
3147            // Registry-based authority
3148
authority = substring(p, n);
3149        } else if (ex != null) {
3150            // Re-throw exception; it was probably due to
3151
// a malformed IPv6 address
3152
throw ex;
3153        } else {
3154            fail("Illegal character in authority", q);
3155        }
3156        }
3157
3158        return n;
3159    }
3160
3161
3162    // [<userinfo>@]<host>[:<port>]
3163
//
3164
private int parseServer(int start, int n)
3165        throws URISyntaxException JavaDoc
3166    {
3167        int p = start;
3168        int q;
3169
3170        // userinfo
3171
q = scan(p, n, "/?#", "@");
3172        if ((q >= p) && at(q, n, '@')) {
3173        checkChars(p, q, L_USERINFO, H_USERINFO, "user info");
3174        userInfo = substring(p, q);
3175        p = q + 1; // Skip '@'
3176
}
3177
3178        // hostname, IPv4 address, or IPv6 address
3179
if (at(p, n, '[')) {
3180        // DEVIATION from RFC2396: Support IPv6 addresses, per RFC2732
3181
p++;
3182        q = scan(p, n, "/?#", "]");
3183        if ((q > p) && at(q, n, ']')) {
3184            // look for a "%" scope id
3185
int r = scan (p, q, "", "%");
3186            if (r > p) {
3187                parseIPv6Reference(p, r);
3188            if (r+1 == q) {
3189                fail ("scope id expected");
3190            }
3191            checkChars (r+1, q, L_ALPHANUM, H_ALPHANUM,
3192                        "scope id");
3193            } else {
3194                parseIPv6Reference(p, q);
3195            }
3196                host = substring(p-1, q+1);
3197            p = q + 1;
3198        } else {
3199            failExpecting("closing bracket for IPv6 address", q);
3200        }
3201        } else {
3202        q = parseIPv4Address(p, n);
3203        if (q <= p)
3204            q = parseHostname(p, n);
3205        p = q;
3206        }
3207
3208        // port
3209
if (at(p, n, ':')) {
3210        p++;
3211        q = scan(p, n, "", "/");
3212        if (q > p) {
3213            checkChars(p, q, L_DIGIT, H_DIGIT, "port number");
3214            try {
3215            port = Integer.parseInt(substring(p, q));
3216            } catch (NumberFormatException JavaDoc x) {
3217            fail("Malformed port number", p);
3218            }
3219            p = q;
3220        }
3221        }
3222        if (p < n)
3223        failExpecting("port number", p);
3224
3225        return p;
3226    }
3227
3228    // Scan a string of decimal digits whose value fits in a byte
3229
//
3230
private int scanByte(int start, int n)
3231        throws URISyntaxException JavaDoc
3232    {
3233        int p = start;
3234        int q = scan(p, n, L_DIGIT, H_DIGIT);
3235        if (q <= p) return q;
3236        if (Integer.parseInt(substring(p, q)) > 255) return p;
3237        return q;
3238    }
3239
3240    // Scan an IPv4 address.
3241
//
3242
// If the strict argument is true then we require that the given
3243
// interval contain nothing besides an IPv4 address; if it is false
3244
// then we only require that it start with an IPv4 address.
3245
//
3246
// If the interval does not contain or start with (depending upon the
3247
// strict argument) a legal IPv4 address characters then we return -1
3248
// immediately; otherwise we insist that these characters parse as a
3249
// legal IPv4 address and throw an exception on failure.
3250
//
3251
// We assume that any string of decimal digits and dots must be an IPv4
3252
// address. It won't parse as a hostname anyway, so making that
3253
// assumption here allows more meaningful exceptions to be thrown.
3254
//
3255
private int scanIPv4Address(int start, int n, boolean strict)
3256        throws URISyntaxException JavaDoc
3257    {
3258        int p = start;
3259        int q;
3260        int m = scan(p, n, L_DIGIT | L_DOT, H_DIGIT | H_DOT);
3261        if ((m <= p) || (strict && (m != n)))
3262        return -1;
3263        for (;;) {
3264        // Per RFC2732: At most three digits per byte
3265
// Further constraint: Each element fits in a byte
3266
if ((q = scanByte(p, m)) <= p) break; p = q;
3267        if ((q = scan(p, m, '.')) <= p) break; p = q;
3268        if ((q = scanByte(p, m)) <= p) break; p = q;
3269        if ((q = scan(p, m, '.')) <= p) break; p = q;
3270        if ((q = scanByte(p, m)) <= p) break; p = q;
3271        if ((q = scan(p, m, '.')) <= p) break; p = q;
3272        if ((q = scanByte(p, m)) <= p) break; p = q;
3273        if (q < m) break;
3274        return q;
3275        }
3276        fail("Malformed IPv4 address", q);
3277        return -1;
3278    }
3279
3280    // Take an IPv4 address: Throw an exception if the given interval
3281
// contains anything except an IPv4 address
3282
//
3283
private int takeIPv4Address(int start, int n, String JavaDoc expected)
3284        throws URISyntaxException JavaDoc
3285    {
3286        int p = scanIPv4Address(start, n, true);
3287        if (p <= start)
3288        failExpecting(expected, start);
3289        return p;
3290    }
3291
3292    // Attempt to parse an IPv4 address, returning -1 on failure but
3293
// allowing the given interval to contain [:<characters>] after
3294
// the IPv4 address.
3295
//
3296
private int parseIPv4Address(int start, int n) {
3297        int p;
3298
3299        try {
3300            p = scanIPv4Address(start, n, false);
3301        } catch (URISyntaxException JavaDoc x) {
3302        return -1;
3303            } catch (NumberFormatException JavaDoc nfe) {
3304        return -1;
3305            }
3306
3307        if (p > start && p < n) {
3308            // IPv4 address is followed by something - check that
3309
// it's a ":" as this is the only valid character to
3310
// follow an address.
3311
if (charAt(p) != ':') {
3312            p = -1;
3313        }
3314        }
3315
3316        if (p > start)
3317        host = substring(start, p);
3318
3319        return p;
3320    }
3321
3322    // hostname = domainlabel [ "." ] | 1*( domainlabel "." ) toplabel [ "." ]
3323
// domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
3324
// toplabel = alpha | alpha *( alphanum | "-" ) alphanum
3325
//
3326
private int parseHostname(int start, int n)
3327        throws URISyntaxException JavaDoc
3328    {
3329        int p = start;
3330        int q;
3331        int l = -1; // Start of last parsed label
3332

3333        do {
3334        // domainlabel = alphanum [ *( alphanum | "-" ) alphanum ]
3335
q = scan(p, n, L_ALPHANUM, H_ALPHANUM);
3336        if (q <= p)
3337            break;
3338        l = p;
3339        if (q > p) {
3340            p = q;
3341            q = scan(p, n, L_ALPHANUM | L_DASH, H_ALPHANUM | H_DASH);
3342            if (q > p) {
3343            if (charAt(q - 1) == '-')
3344                fail("Illegal character in hostname", q - 1);
3345            p = q;
3346            }
3347        }
3348        q = scan(p, n, '.');
3349        if (q <= p)
3350            break;
3351        p = q;
3352        } while (p < n);
3353
3354        if ((p < n) && !at(p, n, ':'))
3355        fail("Illegal character in hostname", p);
3356
3357        if (l < 0)
3358        failExpecting("hostname", start);
3359
3360        // for a fully qualified hostname check that the rightmost
3361
// label starts with an alpha character.
3362
if (l > start && !match(charAt(l), L_ALPHA, H_ALPHA)) {
3363        fail("Illegal character in hostname", l);
3364        }
3365
3366        host = substring(start, p);
3367        return p;
3368    }
3369
3370
3371    // IPv6 address parsing, from RFC2373: IPv6 Addressing Architecture
3372
//
3373
// Bug: The grammar in RFC2373 Appendix B does not allow addresses of
3374
// the form ::12.34.56.78, which are clearly shown in the examples
3375
// earlier in the document. Here is the original grammar:
3376
//
3377
// IPv6address = hexpart [ ":" IPv4address ]
3378
// hexpart = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
3379
// hexseq = hex4 *( ":" hex4)
3380
// hex4 = 1*4HEXDIG
3381
//
3382
// We therefore use the following revised grammar:
3383
//
3384
// IPv6address = hexseq [ ":" IPv4address ]
3385
// | hexseq [ "::" [ hexpost ] ]
3386
// | "::" [ hexpost ]
3387
// hexpost = hexseq | hexseq ":" IPv4address | IPv4address
3388
// hexseq = hex4 *( ":" hex4)
3389
// hex4 = 1*4HEXDIG
3390
//
3391
// This covers all and only the following cases:
3392
//
3393
// hexseq
3394
// hexseq : IPv4address
3395
// hexseq ::
3396
// hexseq :: hexseq
3397
// hexseq :: hexseq : IPv4address
3398
// hexseq :: IPv4address
3399
// :: hexseq
3400
// :: hexseq : IPv4address
3401
// :: IPv4address
3402
// ::
3403
//
3404
// Additionally we constrain the IPv6 address as follows :-
3405
//
3406
// i. IPv6 addresses without compressed zeros should contain
3407
// exactly 16 bytes.
3408
//
3409
// ii. IPv6 addresses with compressed zeros should contain
3410
// less than 16 bytes.
3411

3412    private int ipv6byteCount = 0;
3413
3414    private int parseIPv6Reference(int start, int n)
3415        throws URISyntaxException JavaDoc
3416    {
3417        int p = start;
3418        int q;
3419        boolean compressedZeros = false;
3420
3421        q = scanHexSeq(p, n);
3422
3423        if (q > p) {
3424        p = q;
3425        if (at(p, n, "::")) {
3426            compressedZeros = true;
3427            p = scanHexPost(p + 2, n);
3428        } else if (at(p, n, ':')) {
3429            p = takeIPv4Address(p + 1, n, "IPv4 address");
3430            ipv6byteCount += 4;
3431        }
3432        } else if (at(p, n, "::")) {
3433        compressedZeros = true;
3434        p = scanHexPost(p + 2, n);
3435        }
3436        if (p < n)
3437        fail("Malformed IPv6 address", start);
3438        if (ipv6byteCount > 16)
3439        fail("IPv6 address too long", start);
3440        if (!compressedZeros && ipv6byteCount < 16)
3441        fail("IPv6 address too short", start);
3442        if (compressedZeros && ipv6byteCount == 16)
3443        fail("Malformed IPv6 address", start);
3444
3445        return p;
3446    }
3447
3448    private int scanHexPost(int start, int n)
3449        throws URISyntaxException JavaDoc
3450    {
3451        int p = start;
3452        int q;
3453
3454        if (p == n)
3455        return p;
3456
3457        q = scanHexSeq(p, n);
3458        if (q > p) {
3459        p = q;
3460        if (at(p, n, ':')) {
3461            p++;
3462            p = takeIPv4Address(p, n, "hex digits or IPv4 address");
3463            ipv6byteCount += 4;
3464        }
3465        } else {
3466        p = takeIPv4Address(p, n, "hex digits or IPv4 address");
3467        ipv6byteCount += 4;
3468        }
3469        return p;
3470    }
3471
3472    // Scan a hex sequence; return -1 if one could not be scanned
3473
//
3474
private int scanHexSeq(int start, int n)
3475        throws URISyntaxException JavaDoc
3476    {
3477        int p = start;
3478        int q;
3479
3480        q = scan(p, n, L_HEX, H_HEX);
3481        if (q <= p)
3482        return -1;
3483        if (at(q, n, '.')) // Beginning of IPv4 address
3484
return -1;
3485        if (q > p + 4)
3486                fail("IPv6 hexadecimal digit sequence too long", p);
3487        ipv6byteCount += 2;
3488        p = q;
3489        while (p < n) {
3490        if (!at(p, n, ':'))
3491            break;
3492        if (at(p + 1, n, ':'))
3493            break; // "::"
3494
p++;
3495        q = scan(p, n, L_HEX, H_HEX);
3496        if (q <= p)
3497            failExpecting("digits for an IPv6 address", p);
3498        if (at(q, n, '.')) { // Beginning of IPv4 address
3499
p--;
3500            break;
3501        }
3502        if (q > p + 4)
3503            fail("IPv6 hexadecimal digit sequence too long", p);
3504        ipv6byteCount += 2;
3505        p = q;
3506        }
3507
3508        return p;
3509    }
3510
3511    }
3512
3513}
3514
Popular Tags