Java > Open Source Codes > java > net > URI


1   /*
2    * @(#)URI.java 1.40 05/11/28
3    *
4    * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
5    * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
6    */
7   
8   package java.net;
9   
10  import java.io.IOException  ;
11  import java.io.InvalidObjectException  ;
12  import java.io.ObjectInputStream  ;
13  import java.io.ObjectOutputStream  ;
14  import java.io.Serializable  ;
15  import java.nio.ByteBuffer  ;
16  import java.nio.CharBuffer  ;
17  import java.nio.charset.CharsetDecoder  ;
18  import java.nio.charset.CharsetEncoder  ;
19  import java.nio.charset.CoderResult  ;
20  import java.nio.charset.CodingErrorAction  ;
21  import java.nio.charset.CharacterCodingException  ;
22  import sun.nio.cs.ThreadLocalCoders;
23  import sun.text.Normalizer;
24  
25  import java.lang.Character  ;     // for javadoc
26  import java.lang.NullPointerException  ;  // for javadoc
27  
28  
29  /**
30   * Represents a Uniform Resource Identifier (URI) reference.
31   *
32   * <p> Aside from some minor deviations noted below, an instance of this 
33   * class represents a URI reference as defined by
34   * <a HREF="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC&nbsp;2396: Uniform
35   * Resource Identifiers (URI): Generic Syntax</i></a>, amended by <a
36   * HREF="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC&nbsp;2732: Format for
37   * Literal IPv6 Addresses in URLs</i></a>. The Literal IPv6 address format
38   * also supports scope_ids. The syntax and usage of scope_ids is described
39   * <a HREF="Inet6Address.html#scoped">here</a>.
40   * This class provides constructors for creating URI instances from
41   * their components or by parsing their string forms, methods for accessing the
42   * various components of an instance, and methods for normalizing, resolving,
43   * and relativizing URI instances.  Instances of this class are immutable.
44   *
45   *
46   * <h4> URI syntax and components </h4>
47   *
48   * At the highest level a URI reference (hereinafter simply "URI") in string
49   * form has the syntax
50   *
51   * <blockquote>
52   * [<i>scheme</i><tt><b>:</b></tt>]<i>scheme-specific-part</i>[<tt><b>#</b></tt><i>fragment</i>]
53   * </blockquote>
54   *
55   * where square brackets [...] delineate optional components and the characters
56   * <tt><b>:</b></tt> and <tt><b>#</b></tt> stand for themselves.
57   *
58   * <p> An <i>absolute</i> URI specifies a scheme; a URI that is not absolute is
59   * said to be <i>relative</i>.  URIs are also classified according to whether
60   * they are <i>opaque</i> or <i>hierarchical</i>.
61   *
62   * <p> An <i>opaque</i> URI is an absolute URI whose scheme-specific part does
63   * not begin with a slash character (<tt>'/'</tt>).  Opaque URIs are not
64   * subject to further parsing.  Some examples of opaque URIs are:
65   *
66   * <blockquote><table cellpadding=0 cellspacing=0 summary="layout">
67   * <tr><td><tt>mailto:java-net@java.sun.com</tt></td></tr>
68   * <tr><td><tt>news:comp.lang.java</tt></td></tr>
69   * <tr><td><tt>urn:isbn:096139210x</tt></td></tr>
70   * </table></blockquote>
71   *
72   * <p> A <i>hierarchical</i> URI is either an absolute URI whose
73   * scheme-specific part begins with a slash character, or a relative URI, that
74   * is, a URI that does not specify a scheme.  Some examples of hierarchical
75   * URIs are:
76   *
77   * <blockquote>
78   * <tt>http://java.sun.com/j2se/1.3/</tt><br>
79   * <tt>docs/guide/collections/designfaq.html#28</tt><br>
80   * <tt>../../../demo/jfc/SwingSet2/src/SwingSet2.java</tt><br>
81   * <tt>file:///~/calendar</tt>
82   * </blockquote>
83   *
84   * <p> A hierarchical URI is subject to further parsing according to the syntax
85   *
86   * <blockquote>
87   * [<i>scheme</i><tt><b>:</b></tt>][<tt><b>//</b></tt><i>authority</i>][<i>path</i>][<tt><b>?</b></tt><i>query</i>][<tt><b>#</b></tt><i>fragment</i>]
88   * </blockquote>
89   *
90   * where the characters <tt><b>:</b></tt>, <tt><b>/</b></tt>,
91   * <tt><b>?</b></tt>, and <tt><b>#</b></tt> stand for themselves.  The
92   * scheme-specific part of a hierarchical URI consists of the characters
93   * between the scheme and fragment components.
94   *
95   * <p> The authority component of a hierarchical URI is, if specified, either
96   * <i>server-based</i> or <i>registry-based</i>.  A server-based authority
97   * parses according to the familiar syntax
98   *
99   * <blockquote>
100  * [<i>user-info</i><tt><b>@</b></tt>]<i>host</i>[<tt><b>:</b></tt><i>port</i>]
101  * </blockquote>
102  *
103  * where the characters <tt><b>@</b></tt> and <tt><b>:</b></tt> stand for
104  * themselves.  Nearly all URI schemes currently in use are server-based.  An
105  * authority component that does not parse in this way is considered to be
106  * registry-based.
107  *
108  * <p> The path component of a hierarchical URI is itself said to be absolute
109  * if it begins with a slash character (<tt>'/'</tt>); otherwise it is
110  * relative.  The path of a hierarchical URI that is either absolute or
111  * specifies an authority is always absolute.
112  *
113  * <p> All told, then, a URI instance has the following nine components:
114  *
115  * <blockquote><table summary="Describes the components of a URI:scheme,scheme-specific-part,authority,user-info,host,port,path,query,fragment">
116  * <tr><th><i>Component</i></th><th><i>Type</i></th></tr>
117  * <tr><td>scheme</td><td><tt>String</tt></td></tr>
118  * <tr><td>scheme-specific-part&nbsp;&nbsp;&nbsp;&nbsp;</td><td><tt>String</tt></td></tr>
119  * <tr><td>authority</td><td><tt>String</tt></td></tr>
120  * <tr><td>user-info</td><td><tt>String</tt></td></tr>
121  * <tr><td>host</td><td><tt>String</tt></td></tr>
122  * <tr><td>port</td><td><tt>int</tt></td></tr>
123  * <tr><td>path</td><td><tt>String</tt></td></tr>
124  * <tr><td>query</td><td><tt>String</tt></td></tr>
125  * <tr><td>fragment</td><td><tt>String</tt></td></tr>
126  * </table></blockquote>
127  *
128  * In a given instance any particular component is either <i>undefined</i> or
129  * <i>defined</i> with a distinct value.  Undefined string components are
130  * represented by <tt>null</tt>, while undefined integer components are
131  * represented by <tt>-1</tt>.  A string component may be defined to have the
132  * empty string as its value; this is not equivalent to that component being
133  * undefined.
134  *
135  * <p> Whether a particular component is or is not defined in an instance
136  * depends upon the type of the URI being represented.  An absolute URI has a
137  * scheme component.  An opaque URI has a scheme, a scheme-specific part, and
138  * possibly a fragment, but has no other components.  A hierarchical URI always
139  * has a path (though it may be empty) and a scheme-specific-part (which at
140  * least contains the path), and may have any of the other components.  If the
141  * authority component is present and is server-based then the host component
142  * will be defined and the user-information and port components may be defined.
143  *
144  *
145  * <h4> Operations on URI instances </h4>
146  *
147  * The key operations supported by this class are those of
148  * <i>normalization</i>, <i>resolution</i>, and <i>relativization</i>.
149  *
150  * <p> <i>Normalization</i> is the process of removing unnecessary <tt>"."</tt>
151  * and <tt>".."</tt> segments from the path component of a hierarchical URI.
152  * Each <tt>"."</tt> segment is simply removed.  A <tt>".."</tt> segment is
153  * removed only if it is preceded by a non-<tt>".."</tt> segment.
154  * Normalization has no effect upon opaque URIs.
155  *
156  * <p> <i>Resolution</i> is the process of resolving one URI against another,
157  * <i>base</i> URI.  The resulting URI is constructed from components of both
158  * URIs in the manner specified by RFC&nbsp;2396, taking components from the
159  * base URI for those not specified in the original.  For hierarchical URIs,
160  * the path of the original is resolved against the path of the base and then
161  * normalized.  The result, for example, of resolving
162  *
163  * <blockquote>
164  * <tt>docs/guide/collections/designfaq.html#28&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt>(1)
165  * </blockquote>
166  *
167  * against the base URI <tt>http://java.sun.com/j2se/1.3/</tt> is the result
168  * URI
169  *
170  * <blockquote>
171  * <tt>http://java.sun.com/j2se/1.3/docs/guide/collections/designfaq.html#28</tt>
172  * </blockquote>
173  *
174  * Resolving the relative URI
175  *
176  * <blockquote>
177  * <tt>../../../demo/jfc/SwingSet2/src/SwingSet2.java&nbsp;&nbsp;&nbsp;&nbsp;</tt>(2)
178  * </blockquote>
179  *
180  * against this result yields, in turn,
181  *
182  * <blockquote>
183  * <tt>http://java.sun.com/j2se/1.3/demo/jfc/SwingSet2/src/SwingSet2.java</tt>
184  * </blockquote>
185  *
186  * Resolution of both absolute and relative URIs, and of both absolute and
187  * relative paths in the case of hierarchical URIs, is supported.  Resolving
188  * the URI <tt>file:///~/calendar</tt> against any other URI simply yields the
189  * original URI, since it is absolute.  Resolving the relative URI (2) above
190  * against the relative base URI (1) yields the normalized, but still relative,
191  * URI
192  *
193  * <blockquote>
194  * <tt>demo/jfc/SwingSet2/src/SwingSet2.java</tt>
195  * </blockquote>
196  *
197  * <p> <i>Relativization</i>, finally, is the inverse of resolution: For any
198  * two normalized URIs <i>u</i> and&nbsp;<i>v</i>,
199  *
200  * <blockquote>
201  *   <i>u</i><tt>.relativize(</tt><i>u</i><tt>.resolve(</tt><i>v</i><tt>)).equals(</tt><i>v</i><tt>)</tt>&nbsp;&nbsp;and<br>
202  *   <i>u</i><tt>.resolve(</tt><i>u</i><tt>.relativize(</tt><i>v</i><tt>)).equals(</tt><i>v</i><tt>)</tt>&nbsp;&nbsp;.<br>
203  * </blockquote>
204  *
205  * This operation is often useful when constructing a document containing URIs
206  * that must be made relative to the base URI of the document wherever
207  * possible.  For example, relativizing the URI
208  *
209  * <blockquote>
210  * <tt>http://java.sun.com/j2se/1.3/docs/guide/index.html</tt>
211  * </blockquote>
212  *
213  * against the base URI
214  *
215  * <blockquote>
216  * <tt>http://java.sun.com/j2se/1.3</tt>
217  * </blockquote>
218  *
219  * yields the relative URI <tt>docs/guide/index.html</tt>.
220  *
221  *
222  * <h4> Character categories </h4>
223  *
224  * RFC&nbsp;2396 specifies precisely which characters are permitted in the
225  * various components of a URI reference.  The following categories, most of
226  * which are taken from that specification, are used below to describe these
227  * constraints:
228  *
229  * <blockquote><table cellspacing=2 summary="Describes categories alpha,digit,alphanum,unreserved,punct,reserved,escaped,and other">
230  *   <tr><th valign=top><i>alpha</i></th>
231  *       <td>The US-ASCII alphabetic characters,
232  *    <tt>'A'</tt>&nbsp;through&nbsp;<tt>'Z'</tt>
233  *    and <tt>'a'</tt>&nbsp;through&nbsp;<tt>'z'</tt></td></tr>
234  *   <tr><th valign=top><i>digit</i></th>
235  *       <td>The US-ASCII decimal digit characters,
236  *       <tt>'0'</tt>&nbsp;through&nbsp;<tt>'9'</tt></td></tr>
237  *   <tr><th valign=top><i>alphanum</i></th>
238  *       <td>All <i>alpha</i> and <i>digit</i> characters</td></tr>
239  *   <tr><th valign=top><i>unreserved</i>&nbsp;&nbsp;&nbsp;&nbsp;</th>
240  *       <td>All <i>alphanum</i> characters together with those in the string
241  *    <tt>"_-!.~'()*"</tt></td></tr>
242  *   <tr><th valign=top><i>punct</i></th>
243  *       <td>The characters in the string <tt>",;:$&+="</tt></td></tr>
244  *   <tr><th valign=top><i>reserved</i></th>
245  *       <td>All <i>punct</i> characters together with those in the string
246  *    <tt>"?/[]@"</tt></td></tr>
247  *   <tr><th valign=top><i>escaped</i></th>
248  *       <td>Escaped octets, that is, triplets consisting of the percent
249  *           character (<tt>'%'</tt>) followed by two hexadecimal digits
250  *           (<tt>'0'</tt>-<tt>'9'</tt>, <tt>'A'</tt>-<tt>'F'</tt>, and
251  *           <tt>'a'</tt>-<tt>'f'</tt>)</td></tr>
252  *   <tr><th valign=top><i>other</i></th>
253  *       <td>The Unicode characters that are not in the US-ASCII character set,
254  *           are not control characters (according to the {@link
255  *           java.lang.Character#isISOControl(char) Character.isISOControl}
256  *       method), and are not space characters (according to the {@link
257  *       java.lang.Character#isSpaceChar(char) Character.isSpaceChar}
258  *       method)&nbsp;&nbsp;<i>(<b>Deviation from RFC 2396</b>, which is
259  *       limited to US-ASCII)</i></td></tr>
260  * </table></blockquote>
261  *
262  * <p><a name="legal-chars"></a> The set of all legal URI characters consists of
263  * the <i>unreserved</i>, <i>reserved</i>, <i>escaped</i>, and <i>other</i>
264  * characters.
265  *
266  *
267  * <h4> Escaped octets, quotation, encoding, and decoding </h4>
268  *
269  * RFC 2396 allows escaped octets to appear in the user-info, path, query, and
270  * fragment components.  Escaping serves two purposes in URIs:
271  *
272  * <ul>
273  *
274  *   <li><p> To <i>encode</i> non-US-ASCII characters when a URI is required to
275  *   conform strictly to RFC&nbsp;2396 by not containing any <i>other</i>
276  *   characters.  </p></li>
277  *
278  *   <li><p> To <i>quote</i> characters that are otherwise illegal in a
279  *   component.  The user-info, path, query, and fragment components differ
280  *   slightly in terms of which characters are considered legal and illegal.
281  *   </p></li>
282  *
283  * </ul>
284  *
285  * These purposes are served in this class by three related operations:
286  *
287  * <ul>
288  *
289  *   <li><p><a name="encode"></a> A character is <i>encoded</i> by replacing it
290  *   with the sequence of escaped octets that represent that character in the
291  *   UTF-8 character set.  The Euro currency symbol (<tt>'&#92;u20AC'</tt>),
292  *   for example, is encoded as <tt>"%E2%82%AC"</tt>.  <i>(<b>Deviation from
293  *   RFC&nbsp;2396</b>, which does not specify any particular character
294  *   set.)</i> </p></li>
295  *
296  *   <li><p><a name="quote"></a> An illegal character is <i>quoted</i> simply by
297  *   encoding it.  The space character, for example, is quoted by replacing it
298  *   with <tt>"%20"</tt>.  UTF-8 contains US-ASCII, hence for US-ASCII
299  *   characters this transformation has exactly the effect required by
300  *   RFC&nbsp;2396. </p></li>
301  *
302  *   <li><p><a name="decode"></a>
303  *   A sequence of escaped octets is <i>decoded</i> by
304  *   replacing it with the sequence of characters that it represents in the
305  *   UTF-8 character set.  UTF-8 contains US-ASCII, hence decoding has the
306  *   effect of de-quoting any quoted US-ASCII characters as well as that of
307  *   decoding any encoded non-US-ASCII characters.  If a <a
308  *   HREF="../nio/charset/CharsetDecoder.html#ce">decoding error</a> occurs
309  *   when decoding the escaped octets then the erroneous octets are replaced by
310  *   <tt>'&#92;uFFFD'</tt>, the Unicode replacement character.  </p></li>
311  *
312  * </ul>
313  *
314  * These operations are exposed in the constructors and methods of this class
315  * as follows:
316  *
317  * <ul>
318  *
319  *   <li><p> The {@link #URI(java.lang.String) <code>single-argument
320  *   constructor</code>} requires any illegal characters in its argument to be
321  *   quoted and preserves any escaped octets and <i>other</i> characters that
322  *   are present.  </p></li>
323  *
324  *   <li><p> The {@link
325  *   #URI(java.lang.String,java.lang.String,java.lang.String,int,java.lang.String,java.lang.String,java.lang.String)
326  *   <code>multi-argument constructors</code>} quote illegal characters as
327  *   required by the components in which they appear.  The percent character
328  *   (<tt>'%'</tt>) is always quoted by these constructors.  Any <i>other</i>
329  *   characters are preserved.  </p></li>
330  *
331  *   <li><p> The {@link #getRawUserInfo() getRawUserInfo}, {@link #getRawPath()
332  *   getRawPath}, {@link #getRawQuery() getRawQuery}, {@link #getRawFragment()
333  *   getRawFragment}, {@link #getRawAuthority() getRawAuthority}, and {@link
334  *   #getRawSchemeSpecificPart() getRawSchemeSpecificPart} methods return the
335  *   values of their corresponding components in raw form, without interpreting
336  *   any escaped octets.  The strings returned by these methods may contain
337  *   both escaped octets and <i>other</i> characters, and will not contain any
338  *   illegal characters.  </p></li>
339  *
340  *   <li><p> The {@link #getUserInfo() getUserInfo}, {@link #getPath()
341  *   getPath}, {@link #getQuery() getQuery}, {@link #getFragment()
342  *   getFragment}, {@link #getAuthority() getAuthority}, and {@link
343  *   #getSchemeSpecificPart() getSchemeSpecificPart} methods decode any escaped
344  *   octets in their corresponding components.  The strings returned by these
345  *   methods may contain both <i>other</i> characters and illegal characters,
346  *   and will not contain any escaped octets.  </p></li>
347  *
348  *   <li><p> The {@link #toString() toString} method returns a URI string with
349  *   all necessary quotation but which may contain <i>other</i> characters.
350  *   </p></li>
351  *
352  *   <li><p> The {@link #toASCIIString() toASCIIString} method returns a fully
353  *   quoted and encoded URI string that does not contain any <i>other</i>
354  *   characters.  </p></li>
355  *
356  * </ul>
357  *
358  *
359  * <h4> Identities </h4>
360  *
361  * For any URI <i>u</i>, it is always the case that
362  *
363  * <blockquote>
364  * <tt>new URI(</tt><i>u</i><tt>.toString()).equals(</tt><i>u</i><tt>)</tt>&nbsp;.
365  * </blockquote>
366  *
367  * For any URI <i>u</i> that does not contain redundant syntax such as two
368  * slashes before an empty authority (as in <tt>file:///tmp/</tt>&nbsp;) or a
369  * colon following a host name but no port (as in
370  * <tt>http://java.sun.com:</tt>&nbsp;), and that does not encode characters
371  * except those that must be quoted, the following identities also hold:
372  *
373  * <blockquote>
374  * <tt>new URI(</tt><i>u</i><tt>.getScheme(),<br>
375  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getSchemeSpecificPart(),<br>
376  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getFragment())<br>
377  * .equals(</tt><i>u</i><tt>)</tt>
378  * </blockquote>
379  *
380  * in all cases,
381  *
382  * <blockquote>
383  * <tt>new URI(</tt><i>u</i><tt>.getScheme(),<br>
384  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getUserInfo(),&nbsp;</tt><i>u</i><tt>.getAuthority(),<br>
385  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getPath(),&nbsp;</tt><i>u</i><tt>.getQuery(),<br>
386  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getFragment())<br>
387  * .equals(</tt><i>u</i><tt>)</tt>
388  * </blockquote>
389  *
390  * if <i>u</i> is hierarchical, and
391  *
392  * <blockquote>
393  * <tt>new URI(</tt><i>u</i><tt>.getScheme(),<br>
394  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getUserInfo(),&nbsp;</tt><i>u</i><tt>.getHost(),&nbsp;</tt><i>u</i><tt>.getPort(),<br>
395  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getPath(),&nbsp;</tt><i>u</i><tt>.getQuery(),<br>
396  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getFragment())<br>
397  * .equals(</tt><i>u</i><tt>)</tt>
398  * </blockquote>
399  *
400  * if <i>u</i> is hierarchical and has either no authority or a server-based
401  * authority.
402  *
403  *
404  * <h4> URIs, URLs, and URNs </h4>
405  *
406  * A URI is a uniform resource <i>identifier</i> while a URL is a uniform
407  * resource <i>locator</i>.  Hence every URL is a URI, abstractly speaking, but
408  * not every URI is a URL.  This is because there is another subcategory of
409  * URIs, uniform resource <i>names</i> (URNs), which name resources but do not
410  * specify how to locate them.  The <tt>mailto</tt>, <tt>news</tt>, and
411  * <tt>isbn</tt> URIs shown above are examples of URNs.
412  *
413  * <p> The conceptual distinction between URIs and URLs is reflected in the
414  * differences between this class and the {@link URL} class.
415  *
416  * <p> An instance of this class represents a URI reference in the syntactic
417  * sense defined by RFC&nbsp;2396.  A URI may be either absolute or relative.
418  * A URI string is parsed according to the generic syntax without regard to the
419  * scheme, if any, that it specifies.  No lookup of the host, if any, is
420  * performed, and no scheme-dependent stream handler is constructed.  Equality,
421  * hashing, and comparison are defined strictly in terms of the character
422  * content of the instance.  In other words, a URI instance is little more than
423  * a structured string that supports the syntactic, scheme-independent
424  * operations of comparison, normalization, resolution, and relativization.
425  *
426  * <p> An instance of the {@link URL} class, by contrast, represents the
427  * syntactic components of a URL together with some of the information required
428  * to access the resource that it describes.  A URL must be absolute, that is,
429  * it must always specify a scheme.  A URL string is parsed according to its
430  * scheme.  A stream handler is always established for a URL, and in fact it is
431  * impossible to create a URL instance for a scheme for which no handler is
432  * available.  Equality and hashing depend upon both the scheme and the
433  * Internet address of the host, if any; comparison is not defined.  In other
434  * words, a URL is a structured string that supports the syntactic operation of
435  * resolution as well as the network I/O operations of looking up the host and
436  * opening a connection to the specified resource.
437  *
438  *
439  * @version 1.40, 05/11/28
440  * @author Mark Reinhold
441  * @since 1.4
442  *
443  * @see <a HREF="http://ietf.org/rfc/rfc2279.txt"><i>RFC&nbsp;2279: UTF-8, a
444  * transformation format of ISO 10646</i></a>, <br><a
445  * HREF="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC&nbsp;2373: IPv6 Addressing
446  * Architecture</i></a>, <br><a
447  * HREF="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC&nbsp;2396: Uniform
448  * Resource Identifiers (URI): Generic Syntax</i></a>, <br><a
449  * HREF="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC&nbsp;2732: Format for
450  * Literal IPv6 Addresses in URLs</i></a>, <br><a
451  * HREF="URISyntaxException.html">URISyntaxException</a>
452  */
453 
454 public final class URI
455     implements Comparable  <URI  >, Serializable  
456 {
457 
458     // Note: Comments containing the word "ASSERT" indicate places where a
459     // throw of an InternalError should be replaced by an appropriate assertion
460     // statement once asserts are enabled in the build.
461 
462     static final long serialVersionUID = -6052424284110960213L;  // Serialization version identifier for this Serializable class
463 
464 
465     // -- Properties and components of this instance --
466 
467     // Components of all URIs: [<scheme>:]<scheme-specific-part>[#<fragment>]
468     private transient String   scheme;        // null ==> relative URI
469     private transient String   fragment;      // null ==> undefined
470 
471     // Hierarchical URI components: [//<authority>]<path>[?<query>]
472     private transient String   authority;     // Registry or server
473 
474     // Server-based authority: [<userInfo>@]<host>[:<port>]
475     private transient String   userInfo;      // null ==> undefined
476     private transient String   host;      // null ==> registry-based
477     private transient int port = -1;        // -1 ==> undefined
478 
479     // Remaining components of hierarchical URIs
480     private transient String   path;      // null ==> opaque
481     private transient String   query;     // null ==> undefined
482 
483     // The remaining fields may be computed on demand
484 
485     private volatile transient String   schemeSpecificPart;
486     private volatile transient int hash;    // Zero ==> undefined
487 
488     private volatile transient String   decodedUserInfo = null;  // decoded* fields: presumably the decoded forms of the raw components above, null until first computed -- confirm in the getters
489     private volatile transient String   decodedAuthority = null;
490     private volatile transient String   decodedPath = null;
491     private volatile transient String   decodedQuery = null;
492     private volatile transient String   decodedFragment = null;
493     private volatile transient String   decodedSchemeSpecificPart = null;
494 
495     /**
496      * The string form of this URI.
497      *
498      * @serial
499      */
500     private volatile String   string;     // The only serializable field (every other instance field is transient)
501 
502 
503 
504     // -- Constructors and factories --
505 
506     private URI() { }               // Used internally; sets no components, so every field keeps its initial value
507 
508     /**
509      * Constructs a URI by parsing the given string.
510      *
511      * <p> This constructor parses the given string exactly as specified by the
512      * grammar in <a
513      * HREF="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
514      * Appendix&nbsp;A, <b><i>except for the following deviations:</i></b> </p>
515      *
516      * <ul type=disc>
517      *
518      *   <li><p> An empty authority component is permitted as long as it is
519      *   followed by a non-empty path, a query component, or a fragment
520      *   component.  This allows the parsing of URIs such as
521      *   <tt>"file:///foo/bar"</tt>, which seems to be the intent of
522      *   RFC&nbsp;2396 although the grammar does not permit it.  If the
523      *   authority component is empty then the user-information, host, and port
524      *   components are undefined. </p></li>
525      *
526      *   <li><p> Empty relative paths are permitted; this seems to be the
527      *   intent of RFC&nbsp;2396 although the grammar does not permit it.  The
528      *   primary consequence of this deviation is that a standalone fragment
529      *   such as <tt>"#foo"</tt> parses as a relative URI with an empty path
530      *   and the given fragment, and can be usefully <a
531      *   HREF="#resolve-frag">resolved</a> against a base URI. </p></li>
532      *
533      *   <li><p> IPv4 addresses in host components are parsed rigorously, as
534      *   specified by <a
535      *   HREF="http://www.ietf.org/rfc/rfc2732.txt">RFC&nbsp;2732</a>: Each
536      *   element of a dotted-quad address must contain no more than three
537      *   decimal digits.  Each element is further constrained to have a value
538      *   no greater than 255. </p></li>
539      *
540      *   <li> <p> Hostnames in host components that comprise only a single
541      *   domain label are permitted to start with an <i>alphanum</i> 
542      *   character. This seems to be the intent of <a
543      *   HREF="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>
544      *   section&nbsp;3.2.2 although the grammar does not permit it. The
545      *   consequence of this deviation is that the authority component of a
546      *   hierarchical URI such as <tt>s://123</tt> will parse as a server-based 
547      *   authority. </p></li>
548      *
549      *   <li><p> IPv6 addresses are permitted for the host component.  An IPv6
550      *   address must be enclosed in square brackets (<tt>'['</tt> and
551      *   <tt>']'</tt>) as specified by <a
552      *   HREF="http://www.ietf.org/rfc/rfc2732.txt">RFC&nbsp;2732</a>.  The
553      *   IPv6 address itself must parse according to <a
554      *   HREF="http://www.ietf.org/rfc/rfc2373.txt">RFC&nbsp;2373</a>.  IPv6
555      *   addresses are further constrained to describe no more than sixteen
556      *   bytes of address information, a constraint implicit in RFC&nbsp;2373
557      *   but not expressible in the grammar. </p></li>
558      *
559      *   <li><p> Characters in the <i>other</i> category are permitted wherever
560      *   RFC&nbsp;2396 permits <i>escaped</i> octets, that is, in the
561      *   user-information, path, query, and fragment components, as well as in
562      *   the authority component if the authority is registry-based.  This
563      *   allows URIs to contain Unicode characters beyond those in the US-ASCII
564      *   character set. </p></li>
565      *
566      * </ul>
567      *
568      * @param  str   The string to be parsed into a URI
569      *
570      * @throws  NullPointerException
571      *          If <tt>str</tt> is <tt>null</tt>
572      *
573      * @throws  URISyntaxException
574      *          If the given string violates RFC&nbsp;2396, as augmented
575      *          by the above deviations
576      */
577     public URI(String   str) throws URISyntaxException   {
578     new Parser(str).parse(false);  // All parsing is delegated to Parser (defined outside this view); false presumably means the authority need not be server-based -- confirm against Parser.parse
579     }
580 
581     /**
582      * Constructs a hierarchical URI from the given components.
583      *
584      * <p> If a scheme is given then the path, if also given, must either be
585      * empty or begin with a slash character (<tt>'/'</tt>).  Otherwise a
586      * component of the new URI may be left undefined by passing <tt>null</tt>
587      * for the corresponding parameter or, in the case of the <tt>port</tt>
588      * parameter, by passing <tt>-1</tt>.
589      *
590      * <p> This constructor first builds a URI string from the given components
591      * according to the rules specified in <a
592      * HREF="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
593      * section&nbsp;5.2, step&nbsp;7: </p>
594      *
595      * <ol>
596      *
597      *   <li><p> Initially, the result string is empty. </p></li>
598      *
599      *   <li><p> If a scheme is given then it is appended to the result,
600      *   followed by a colon character (<tt>':'</tt>).  </p></li>
601      *
602      *   <li><p> If user information, a host, or a port are given then the
603      *   string <tt>"//"</tt> is appended.  </p></li>
604      *
605      *   <li><p> If user information is given then it is appended, followed by
606      *   a commercial-at character (<tt>'@'</tt>).  Any character not in the
607      *   <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
608      *   categories is <a HREF="#quote">quoted</a>.  </p></li>
609      *
610      *   <li><p> If a host is given then it is appended.  If the host is a
611      *   literal IPv6 address but is not enclosed in square brackets
612      *   (<tt>'['</tt> and <tt>']'</tt>) then the square brackets are added.
613      *   </p></li>
614      *
615      *   <li><p> If a port number is given then a colon character
616      *   (<tt>':'</tt>) is appended, followed by the port number in decimal.
617      *   </p></li>
618      *
619      *   <li><p> If a path is given then it is appended.  Any character not in
620      *   the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
621      *   categories, and not equal to the slash character (<tt>'/'</tt>) or the
622      *   commercial-at character (<tt>'@'</tt>), is quoted.  </p></li>
623      *
624      *   <li><p> If a query is given then a question-mark character
625      *   (<tt>'?'</tt>) is appended, followed by the query.  Any character that
626      *   is not a <a HREF="#legal-chars">legal URI character</a> is quoted.
627      *   </p></li>
628      *
629      *   <li><p> Finally, if a fragment is given then a hash character
630      *   (<tt>'#'</tt>) is appended, followed by the fragment.  Any character
631      *   that is not a legal URI character is quoted.  </p></li>
632      *
633      * </ol>
634      *
635      * <p> The resulting URI string is then parsed as if by invoking the {@link
636      * #URI(String)} constructor and then invoking the {@link
637      * #parseServerAuthority()} method upon the result; this may cause a {@link
638      * URISyntaxException} to be thrown.  </p>
639      *
640      * @param   scheme    Scheme name
641      * @param   userInfo  User name and authorization information
642      * @param   host      Host name
643      * @param   port      Port number
644      * @param   path      Path
645      * @param   query     Query
646      * @param   fragment  Fragment
647      *
648      * @throws URISyntaxException
649      *         If both a scheme and a path are given but the path is relative,
650      *         if the URI string constructed from the given components violates
651      *         RFC&nbsp;2396, or if the authority component of the string is
652      *         present but cannot be parsed as a server-based authority
653      */
654     public URI(String   scheme,
655                String   userInfo, String   host, int port,
656                String   path, String   query, String   fragment)
657     throws URISyntaxException  
658     {
659     String   s = toString(scheme, null,
660                 null, userInfo, host, port,
661                 path, query, fragment);
662     checkPath(s, scheme, path);
663     new Parser(s).parse(true);
664     }
665 
666     /**
667      * Constructs a hierarchical URI from the given components.
668      *
669      * <p> If a scheme is given then the path, if also given, must either be
670      * empty or begin with a slash character (<tt>'/'</tt>).  Otherwise a
671      * component of the new URI may be left undefined by passing <tt>null</tt>
672      * for the corresponding parameter.
673      *
674      * <p> This constructor first builds a URI string from the given components
675      * according to the rules specified in <a
676      * HREF="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
677      * section&nbsp;5.2, step&nbsp;7: </p>
678      *
679      * <ol>
680      *
681      *   <li><p> Initially, the result string is empty.  </p></li>
682      *
683      *   <li><p> If a scheme is given then it is appended to the result,
684      *   followed by a colon character (<tt>':'</tt>).  </p></li>
685      *
686      *   <li><p> If an authority is given then the string <tt>"//"</tt> is
687      *   appended, followed by the authority.  If the authority contains a
688      *   literal IPv6 address then the address must be enclosed in square
689      *   brackets (<tt>'['</tt> and <tt>']'</tt>).  Any character not in the
690      *   <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
691      *   categories, and not equal to the commercial-at character
692      *   (<tt>'@'</tt>), is <a HREF="#quote">quoted</a>.  </p></li>
693      *
694      *   <li><p> If a path is given then it is appended.  Any character not in
695      *   the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
696      *   categories, and not equal to the slash character (<tt>'/'</tt>) or the
697      *   commercial-at character (<tt>'@'</tt>), is quoted.  </p></li>
698      *
699      *   <li><p> If a query is given then a question-mark character
700      *   (<tt>'?'</tt>) is appended, followed by the query.  Any character that
701      *   is not a <a HREF="#legal-chars">legal URI character</a> is quoted.
702      *   </p></li>
703      *
704      *   <li><p> Finally, if a fragment is given then a hash character
705      *   (<tt>'#'</tt>) is appended, followed by the fragment.  Any character
706      *   that is not a legal URI character is quoted.  </p></li>
707      *
708      * </ol>
709      *
710      * <p> The resulting URI string is then parsed as if by invoking the {@link
711      * #URI(String)} constructor and then invoking the {@link
712      * #parseServerAuthority()} method upon the result; this may cause a {@link
713      * URISyntaxException} to be thrown.  </p>
714      *
715      * @param   scheme     Scheme name
716      * @param   authority  Authority
717      * @param   path       Path
718      * @param   query      Query
719      * @param   fragment   Fragment
720      *
721      * @throws URISyntaxException
722      *         If both a scheme and a path are given but the path is relative,
723      *         if the URI string constructed from the given components violates
724      *         RFC&nbsp;2396, or if the authority component of the string is
725      *         present but cannot be parsed as a server-based authority
726      */
727     public URI(String   scheme,
728            String   authority,
729            String   path, String   query, String   fragment)
730     throws URISyntaxException  
731     {
732     String   s = toString(scheme, null,
733                 authority, null, null, -1,
734                 path, query, fragment);
735     checkPath(s, scheme, path);
736     new Parser(s).parse(false);
737     }
738 
739     /**
740      * Constructs a hierarchical URI from the given components.
741      *
742      * <p> A component may be left undefined by passing <tt>null</tt>.
743      *
744      * <p> This convenience constructor works as if by invoking the
745      * seven-argument constructor as follows:
746      *
747      * <blockquote><tt>
748      * new&nbsp;{@link #URI(String, String, String, int, String, String, String)
749      * URI}(scheme,&nbsp;null,&nbsp;host,&nbsp;-1,&nbsp;path,&nbsp;null,&nbsp;fragment);
750      * </tt></blockquote>
751      *
752      * @param   scheme    Scheme name
753      * @param   host      Host name
754      * @param   path      Path
755      * @param   fragment  Fragment
756      *
757      * @throws  URISyntaxException
758      *          If the URI string constructed from the given components
759      *          violates RFC&nbsp;2396
760      */
761     public URI(String scheme, String host, String path, String fragment)
762         throws URISyntaxException
763     {
764         this(scheme, null, host, -1, path, null, fragment);  // userInfo, port and query undefined
765     }
766 
767     /**
768      * Constructs a URI from the given components.
769      *
770      * <p> A component may be left undefined by passing <tt>null</tt>.
771      *
772      * <p> This constructor first builds a URI in string form using the given
773      * components as follows:  </p>
774      *
775      * <ol>
776      *
777      *   <li><p> Initially, the result string is empty.  </p></li>
778      *
779      *   <li><p> If a scheme is given then it is appended to the result,
780      *   followed by a colon character (<tt>':'</tt>).  </p></li>
781      *
782      *   <li><p> If a scheme-specific part is given then it is appended.  Any
783      *   character that is not a <a HREF="#legal-chars">legal URI character</a>
784      *   is <a HREF="#quote">quoted</a>.  </p></li>
785      *
786      *   <li><p> Finally, if a fragment is given then a hash character
787      *   (<tt>'#'</tt>) is appended to the string, followed by the fragment.
788      *   Any character that is not a legal URI character is quoted.  </p></li>
789      *
790      * </ol>
791      *
792      * <p> The resulting URI string is then parsed in order to create the new
793      * URI instance as if by invoking the {@link #URI(String)} constructor;
794      * this may cause a {@link URISyntaxException} to be thrown.  </p>
795      *
796      * @param   scheme    Scheme name
797      * @param   ssp       Scheme-specific part
798      * @param   fragment  Fragment
799      *
800      * @throws  URISyntaxException
801      *          If the URI string constructed from the given components
802      *          violates RFC&nbsp;2396
803      */
804     public URI(String   scheme, String   ssp, String   fragment)
805     throws URISyntaxException  
806     {
807     new Parser(toString(scheme, ssp,
808                 null, null, null, -1,
809                 null, null, fragment))
810         .parse(false);
811     }
812 
813     /**
814      * Creates a URI by parsing the given string.
815      *
816      * <p> This convenience factory method works as if by invoking the {@link
817      * #URI(String)} constructor; any {@link URISyntaxException} thrown by the
818      * constructor is caught and wrapped in a new {@link
819      * IllegalArgumentException} object, which is then thrown.
820      *
821      * <p> This method is provided for use in situations where it is known that
822      * the given string is a legal URI, for example for URI constants declared
823      * within a program, and so it would be considered a programming error
824      * for the string not to parse as such.  The constructors, which throw
825      * {@link URISyntaxException} directly, should be used in situations where a
826      * URI is being constructed from user input or from some other source that
827      * may be prone to errors.  </p>
828      *
829      * @param  str   The string to be parsed into a URI
830      * @return The new URI
831      *
832      * @throws  NullPointerException
833      *          If <tt>str</tt> is <tt>null</tt>
834      *
835      * @throws  IllegalArgumentException
836      *          If the given string violates RFC&nbsp;2396
837      */
838     public static URI create(String str) {
839         try {
840             return new URI(str);
841         } catch (URISyntaxException x) {
842             // Carry the parser's diagnostic in the message as well as in the
843             // cause, so the failure is readable without unwrapping the exception.
844             throw new IllegalArgumentException(x.getMessage(), x);
845         }
846     }
847 
848 
849     // -- Operations --
850 
851     /**
852      * Attempts to parse this URI's authority component, if defined, into
853      * user-information, host, and port components.
854      *
855      * <p> If this URI's authority component has already been recognized as
856      * being server-based then it will already have been parsed into
857      * user-information, host, and port components.  In this case, or if this
858      * URI has no authority component, this method simply returns this URI.
859      *
860      * <p> Otherwise this method attempts once more to parse the authority
861      * component into user-information, host, and port components, and throws
862      * an exception describing why the authority component could not be parsed
863      * in that way.
864      *
865      * <p> This method is provided because the generic URI syntax specified in
866      * <a HREF="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>
867      * cannot always distinguish a malformed server-based authority from a
868      * legitimate registry-based authority.  It must therefore treat some
869      * instances of the former as instances of the latter.  The authority
870      * component in the URI string <tt>"//foo:bar"</tt>, for example, is not a
871      * legal server-based authority but it is legal as a registry-based
872      * authority.
873      *
874      * <p> In many common situations, for example when working with URIs that are
875      * known to be either URNs or URLs, the hierarchical URIs being used will
876      * always be server-based.  They therefore must either be parsed as such or
877      * treated as an error.  In these cases a statement such as
878      *
879      * <blockquote>
880      * <tt>URI </tt><i>u</i><tt> = new URI(str).parseServerAuthority();</tt>
881      * </blockquote>
882      *
883      * <p> can be used to ensure that <i>u</i> always refers to a URI that, if
884      * it has an authority component, has a server-based authority with proper
885      * user-information, host, and port components.  Invoking this method also
886      * ensures that if the authority could not be parsed in that way then an
887      * appropriate diagnostic message can be issued based upon the exception
888      * that is thrown. </p>
889      *
890      * @return  A URI whose authority field has been parsed
891      *          as a server-based authority
892      *
893      * @throws  URISyntaxException
894      *          If the authority component of this URI is defined
895      *          but cannot be parsed as a server-based authority
896      *          according to RFC&nbsp;2396
897      */
898     public URI parseServerAuthority()
899         throws URISyntaxException
900     {
901         // Re-parse only when an authority exists but no host was extracted from
902         // it.  The error from the original parse is deliberately not cached:
903         // that would cost extra fields or a more obscure representation.
904         if ((host == null) && (authority != null)) {
905             defineString();
906             new Parser(string).parse(true);
907         }
908         return this;
909     }
910 
911     /**
912      * Normalizes this URI's path.
913      *
914      * <p> If this URI is opaque, or if its path is already in normal form,
915      * then this URI is returned.  Otherwise a new URI is constructed that is
916      * identical to this URI except that its path is computed by normalizing
917      * this URI's path in a manner consistent with <a
918      * HREF="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
919      * section&nbsp;5.2, step&nbsp;6, sub-steps&nbsp;c through&nbsp;f; that is:
920      * </p>
921      *
922      * <ol>
923      *
924      *   <li><p> All <tt>"."</tt> segments are removed. </p></li>
925      *
926      *   <li><p> If a <tt>".."</tt> segment is preceded by a non-<tt>".."</tt>
927      *   segment then both of these segments are removed.  This step is
928      *   repeated until it is no longer applicable. </p></li>
929      *
930      *   <li><p> If the path is relative, and if its first segment contains a
931      *   colon character (<tt>':'</tt>), then a <tt>"."</tt> segment is
932      *   prepended.  This prevents a relative URI with a path such as
933      *   <tt>"a:b/c/d"</tt> from later being re-parsed as an opaque URI with a
934      *   scheme of <tt>"a"</tt> and a scheme-specific part of <tt>"b/c/d"</tt>.
935      *   <b><i>(Deviation from RFC&nbsp;2396)</i></b> </p></li>
936      *
937      * </ol>
938      *
939      * <p> A normalized path will begin with one or more <tt>".."</tt> segments
940      * if there were insufficient non-<tt>".."</tt> segments preceding them to
941      * allow their removal.  A normalized path will begin with a <tt>"."</tt>
942      * segment if one was inserted by step 3 above.  Otherwise, a normalized
943      * path will not contain any <tt>"."</tt> or <tt>".."</tt> segments. </p>
944      *
945      * @return  A URI equivalent to this URI,
946      *          but whose path is in normal form
947      */
948     public URI normalize() {
949         return normalize(this);  // hand off to the one-argument normalize overload
950     }
951 
952     /**
953      * Resolves the given URI against this URI.
954      *
955      * <p> If the given URI is already absolute, or if this URI is opaque, then
956      * the given URI is returned.
957      *
958      * <p><a name="resolve-frag"></a> If the given URI's fragment component is
959      * defined, its path component is empty, and its scheme, authority, and
960      * query components are undefined, then a URI with the given fragment but
961      * with all other components equal to those of this URI is returned.  This
962      * allows a URI representing a standalone fragment reference, such as
963      * <tt>"#foo"</tt>, to be usefully resolved against a base URI.
964      *
965      * <p> Otherwise this method constructs a new hierarchical URI in a manner
966      * consistent with <a
967      * HREF="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
968      * section&nbsp;5.2; that is: </p>
969      *
970      * <ol>
971      *
972      *   <li><p> A new URI is constructed with this URI's scheme and the given
973      *   URI's query and fragment components. </p></li>
974      *
975      *   <li><p> If the given URI has an authority component then the new URI's
976      *   authority and path are taken from the given URI. </p></li>
977      *
978      *   <li><p> Otherwise the new URI's authority component is copied from
979      *   this URI, and its path is computed as follows: </p></li>
980      *
981      *   <ol type=a>
982      *
983      *     <li><p> If the given URI's path is absolute then the new URI's path
984      *     is taken from the given URI. </p></li>
985      *
986      *     <li><p> Otherwise the given URI's path is relative, and so the new
987      *     URI's path is computed by resolving the path of the given URI
988      *     against the path of this URI.  This is done by concatenating all but
989      *     the last segment of this URI's path, if any, with the given URI's
990      *     path and then normalizing the result as if by invoking the {@link
991      *     #normalize() normalize} method. </p></li>
992      *
993      *   </ol>
994      *
995      * </ol>
996      *
997      * <p> The result of this method is absolute if, and only if, either this
998      * URI is absolute or the given URI is absolute.  </p>
999      *
1000     * @param  uri  The URI to be resolved against this URI
1001     * @return The resulting URI
1002     *
1003     * @throws  NullPointerException
1004     *          If <tt>uri</tt> is <tt>null</tt>
1005     */
1006    public URI   resolve(URI   uri) {
1007    return resolve(this, uri);
1008    }
1009
1010    /**
1011     * Constructs a new URI by parsing the given string and then resolving it
1012     * against this URI.
1013     *
1014     * <p> This convenience method works as if invoking it were equivalent to
1015     * evaluating the expression <tt>{@link #resolve(java.net.URI)
1016     * resolve}(URI.{@link #create(String) create}(str))</tt>. </p>
1017     *
1018     * @param  str   The string to be parsed into a URI
1019     * @return The resulting URI
1020     *
1021     * @throws  NullPointerException
1022     *          If <tt>str</tt> is <tt>null</tt>
1023     *
1024     * @throws  IllegalArgumentException
1025     *          If the given string violates RFC&nbsp;2396
1026     */
1027    public URI   resolve(String   str) {
1028    return resolve(URI.create(str));
1029    }
1030
1031    /**
1032     * Relativizes the given URI against this URI.
1033     *
1034     * <p> The relativization of the given URI against this URI is computed as
1035     * follows: </p>
1036     *
1037     * <ol>
1038     *
1039     *   <li><p> If either this URI or the given URI is opaque, or if the
1040     *   scheme and authority components of the two URIs are not identical, or
1041     *   if the path of this URI is not a prefix of the path of the given URI,
1042     *   then the given URI is returned. </p></li>
1043     *
1044     *   <li><p> Otherwise a new relative hierarchical URI is constructed with
1045     *   query and fragment components taken from the given URI and with a path
1046     *   component computed by removing this URI's path from the beginning of
1047     *   the given URI's path. </p></li>
1048     *
1049     * </ol>
1050     *
1051     * @param  uri  The URI to be relativized against this URI
1052     * @return The resulting URI
1053     *
1054     * @throws  NullPointerException
1055     *          If <tt>uri</tt> is <tt>null</tt>
1056     */
1057    public URI   relativize(URI   uri) {
1058    return relativize(this, uri);
1059    }
1060
1061    /**
1062     * Constructs a URL from this URI.
1063     *
1064     * <p> This convenience method works as if invoking it were equivalent to
1065     * evaluating the expression <tt>new&nbsp;URL(this.toString())</tt> after
1066     * first checking that this URI is absolute. </p>
1067     *
1068     * @return  A URL constructed from this URI
1069     *
1070     * @throws  IllegalArgumentException
1071     *          If this URI is not absolute
1072     *
1073     * @throws  MalformedURLException
1074     *          If a protocol handler for the URL could not be found,
1075     *          or if some other error occurred while constructing the URL
1076     */
1077    public URL   toURL()
1078    throws MalformedURLException   {
1079    if (!isAbsolute())
1080        throw new IllegalArgumentException  ("URI is not absolute");
1081    return new URL  (toString());
1082    }
1083
1084    // -- Component access methods --
1085
1086    /**
1087     * Returns the scheme component of this URI.
1088     *
1089     * <p> The scheme component of a URI, if defined, only contains characters
1090     * in the <i>alphanum</i> category and in the string <tt>"-.+"</tt>.  A
1091     * scheme always starts with an <i>alpha</i> character. <p>
1092     *
1093     * The scheme component of a URI cannot contain escaped octets, hence this
1094     * method does not perform any decoding.
1095     *
1096     * @return  The scheme component of this URI,
1097     *          or <tt>null</tt> if the scheme is undefined
1098     */
1099    public String   getScheme() {
1100    return scheme;
1101    }
1102
1103    /**
1104     * Tells whether or not this URI is absolute.
1105     *
1106     * <p> A URI is absolute if, and only if, it has a scheme component. </p>
1107     *
1108     * @return  <tt>true</tt> if, and only if, this URI is absolute
1109     */
1110    public boolean isAbsolute() {
1111    return scheme != null;
1112    }
1113
1114    /**
1115     * Tells whether or not this URI is opaque.
1116     *
1117     * <p> A URI is opaque if, and only if, it is absolute and its
1118     * scheme-specific part does not begin with a slash character ('/').
1119     * An opaque URI has a scheme, a scheme-specific part, and possibly
1120     * a fragment; all other components are undefined. </p>
1121     *
1122     * @return  <tt>true</tt> if, and only if, this URI is opaque
1123     */
1124    public boolean isOpaque() {
1125        return path == null;
1126    }
1127
1128    /**
1129     * Returns the raw scheme-specific part of this URI.  The scheme-specific
1130     * part is never undefined, though it may be empty.
1131     *
1132     * <p> The scheme-specific part of a URI only contains legal URI
1133     * characters. </p>
1134     *
1135     * @return  The raw scheme-specific part of this URI
1136     *          (never <tt>null</tt>)
1137     */
1138    public String   getRawSchemeSpecificPart() {
1139    defineSchemeSpecificPart();
1140    return schemeSpecificPart;
1141    }
1142
1143    /**
1144     * Returns the decoded scheme-specific part of this URI.
1145     *
1146     * <p> The string returned by this method is equal to that returned by the
1147     * {@link #getRawSchemeSpecificPart() getRawSchemeSpecificPart} method
1148     * except that all sequences of escaped octets are <a
1149     * HREF="#decode">decoded</a>.  </p>
1150     *
1151     * @return  The decoded scheme-specific part of this URI
1152     *          (never <tt>null</tt>)
1153     */
1154    public String   getSchemeSpecificPart() {
1155    if (decodedSchemeSpecificPart == null)
1156        decodedSchemeSpecificPart = decode(getRawSchemeSpecificPart());
1157    return decodedSchemeSpecificPart;
1158    }
1159
1160    /**
1161     * Returns the raw authority component of this URI.
1162     *
1163     * <p> The authority component of a URI, if defined, only contains the
1164     * commercial-at character (<tt>'@'</tt>) and characters in the
1165     * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and <i>other</i>
1166     * categories.  If the authority is server-based then it is further
1167     * constrained to have valid user-information, host, and port
1168     * components. </p>
1169     *
1170     * @return  The raw authority component of this URI,
1171     *          or <tt>null</tt> if the authority is undefined
1172     */
1173    public String   getRawAuthority() {
1174    return authority;
1175    }
1176
1177    /**
1178     * Returns the decoded authority component of this URI.
1179     *
1180     * <p> The string returned by this method is equal to that returned by the
1181     * {@link #getRawAuthority() getRawAuthority} method except that all
1182     * sequences of escaped octets are <a HREF="#decode">decoded</a>.  </p>
1183     *
1184     * @return  The decoded authority component of this URI,
1185     *          or <tt>null</tt> if the authority is undefined
1186     */
1187    public String   getAuthority() {
1188    if (decodedAuthority == null)
1189        decodedAuthority = decode(authority);
1190    return decodedAuthority;
1191    }
1192
1193    /**
1194     * Returns the raw user-information component of this URI.
1195     *
1196     * <p> The user-information component of a URI, if defined, only contains
1197     * characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and
1198     * <i>other</i> categories. </p>
1199     *
1200     * @return  The raw user-information component of this URI,
1201     *          or <tt>null</tt> if the user information is undefined
1202     */
1203    public String   getRawUserInfo() {
1204    return userInfo;
1205    }
1206
1207    /**
1208     * Returns the decoded user-information component of this URI.
1209     *
1210     * <p> The string returned by this method is equal to that returned by the
1211     * {@link #getRawUserInfo() getRawUserInfo} method except that all
1212     * sequences of escaped octets are <a HREF="#decode">decoded</a>.  </p>
1213     *
1214     * @return  The decoded user-information component of this URI,
1215     *          or <tt>null</tt> if the user information is undefined
1216     */
1217    public String   getUserInfo() {
1218    if ((decodedUserInfo == null) && (userInfo != null))
1219        decodedUserInfo = decode(userInfo);
1220    return decodedUserInfo;
1221    }
1222
1223    /**
1224     * Returns the host component of this URI.
1225     *
1226     * <p> The host component of a URI, if defined, will have one of the
1227     * following forms: </p>
1228     *
1229     * <ul type=disc>
1230     *
1231     *   <li><p> A domain name consisting of one or more <i>labels</i>
1232     *   separated by period characters (<tt>'.'</tt>), optionally followed by
1233     *   a period character.  Each label consists of <i>alphanum</i> characters
1234     *   as well as hyphen characters (<tt>'-'</tt>), though hyphens never
1235     *   occur as the first or last characters in a label. The rightmost
1236     *   label of a domain name consisting of two or more labels, begins
1237     *   with an <i>alpha</i> character. </p></li>
1238     *
1239     *   <li><p> A dotted-quad IPv4 address of the form
1240     *   <i>digit</i><tt>+.</tt><i>digit</i><tt>+.</tt><i>digit</i><tt>+.</tt><i>digit</i><tt>+</tt>,
1241     *   where no <i>digit</i> sequence is longer than three characters and no
1242     *   sequence has a value larger than 255. </p></li>
1243     *
1244     *   <li><p> An IPv6 address enclosed in square brackets (<tt>'['</tt> and
1245     *   <tt>']'</tt>) and consisting of hexadecimal digits, colon characters
1246     *   (<tt>':'</tt>), and possibly an embedded IPv4 address.  The full
1247     *   syntax of IPv6 addresses is specified in <a
1248     *   HREF="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC&nbsp;2373: IPv6
1249     *   Addressing Architecture</i></a>.  </p></li>
1250     *
1251     * </ul>
1252     *
1253     * The host component of a URI cannot contain escaped octets, hence this
1254     * method does not perform any decoding.
1255     *
1256     * @return  The host component of this URI,
1257     *          or <tt>null</tt> if the host is undefined
1258     */
1259    public String   getHost() {
1260    return host;
1261    }
1262
1263    /**
1264     * Returns the port number of this URI.
1265     *
1266     * <p> The port component of a URI, if defined, is a non-negative
1267     * integer. </p>
1268     *
1269     * @return  The port component of this URI,
1270     *          or <tt>-1</tt> if the port is undefined
1271     */
1272    public int getPort() {
1273    return port;
1274    }
1275
1276    /**
1277     * Returns the raw path component of this URI.
1278     *
1279     * <p> The path component of a URI, if defined, only contains the slash
1280     * character (<tt>'/'</tt>), the commercial-at character (<tt>'@'</tt>),
1281     * and characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>,
1282     * and <i>other</i> categories. </p>
1283     *
1284     * @return  The path component of this URI,
1285     *          or <tt>null</tt> if the path is undefined
1286     */
1287    public String   getRawPath() {
1288    return path;
1289    }
1290
1291    /**
1292     * Returns the decoded path component of this URI.
1293     *
1294     * <p> The string returned by this method is equal to that returned by the
1295     * {@link #getRawPath() getRawPath} method except that all sequences of
1296     * escaped octets are <a HREF="#decode">decoded</a>.  </p>
1297     *
1298     * @return  The decoded path component of this URI,
1299     *          or <tt>null</tt> if the path is undefined
1300     */
1301    public String   getPath() {
1302    if ((decodedPath == null) && (path != null))
1303        decodedPath = decode(path);
1304    return decodedPath;
1305    }
1306
1307    /**
1308     * Returns the raw query component of this URI.
1309     *
1310     * <p> The query component of a URI, if defined, only contains legal URI
1311     * characters. </p>
1312     *
1313     * @return  The raw query component of this URI,
1314     *          or <tt>null</tt> if the query is undefined
1315     */
1316    public String   getRawQuery() {
1317    return query;
1318    }
1319
1320    /**
1321     * Returns the decoded query component of this URI.
1322     *
1323     * <p> The string returned by this method is equal to that returned by the
1324     * {@link #getRawQuery() getRawQuery} method except that all sequences of
1325     * escaped octets are <a HREF="#decode">decoded</a>.  </p>
1326     *
1327     * @return  The decoded query component of this URI,
1328     *          or <tt>null</tt> if the query is undefined
1329     */
1330    public String   getQuery() {
1331    if ((decodedQuery == null) && (query != null))
1332        decodedQuery = decode(query);
1333    return decodedQuery;
1334    }
1335
1336    /**
1337     * Returns the raw fragment component of this URI.
1338     *
1339     * <p> The fragment component of a URI, if defined, only contains legal URI
1340     * characters. </p>
1341     *
1342     * @return  The raw fragment component of this URI,
1343     *          or <tt>null</tt> if the fragment is undefined
1344     */
1345    public String   getRawFragment() {
1346    return fragment;
1347    }
1348
1349    /**
1350     * Returns the decoded fragment component of this URI.
1351     *
1352     * <p> The string returned by this method is equal to that returned by the
1353     * {@link #getRawFragment() getRawFragment} method except that all
1354     * sequences of escaped octets are <a HREF="#decode">decoded</a>.  </p>
1355     *
1356     * @return  The decoded fragment component of this URI,
1357     *          or <tt>null</tt> if the fragment is undefined
1358     */
1359    public String   getFragment() {
1360    if ((decodedFragment == null) && (fragment != null))
1361        decodedFragment = decode(fragment);
1362    return decodedFragment;
1363    }
1364
1365
1366    // -- Equality, comparison, hash code, toString, and serialization --
1367
1368    /**
1369     * Tests this URI for equality with another object.
1370     *
1371     * <p> If the given object is not a URI then this method immediately
1372     * returns <tt>false</tt>.
1373     *
1374     * <p> For two URIs to be considered equal requires that either both are
1375     * opaque or both are hierarchical.  Their schemes must either both be
1376     * undefined or else be equal without regard to case. Their fragments
1377     * must either both be undefined or else be equal.
1378     *
1379     * <p> For two opaque URIs to be considered equal, their scheme-specific
1380     * parts must be equal.
1381     *
1382     * <p> For two hierarchical URIs to be considered equal, their paths must
1383     * be equal and their queries must either both be undefined or else be
1384     * equal.  Their authorities must either both be undefined, or both be
1385     * registry-based, or both be server-based.  If their authorities are
1386     * defined and are registry-based, then they must be equal.  If their
1387     * authorities are defined and are server-based, then their hosts must be
1388     * equal without regard to case, their port numbers must be equal, and
1389     * their user-information components must be equal.
1390     *
1391     * <p> When testing the user-information, path, query, fragment, authority,
1392     * or scheme-specific parts of two URIs for equality, the raw forms rather
1393     * than the encoded forms of these components are compared and the
1394     * hexadecimal digits of escaped octets are compared without regard to
1395     * case.
1396     *
1397     * <p> This method satisfies the general contract of the {@link
1398     * java.lang.Object#equals(Object) Object.equals} method. </p>
1399     *
1400     * @param   ob   The object to which this object is to be compared
1401     *
1402     * @return  <tt>true</tt> if, and only if, the given object is a URI that
1403     *          is identical to this URI
1404     */
1405    public boolean equals(Object   ob) {
1406    if (ob == this)
1407        return true;
1408    if (!(ob instanceof URI  ))
1409        return false;
1410    URI   that = (URI  )ob;
1411    if (this.isOpaque() != that.isOpaque()) return false;
1412    if (!equalIgnoringCase(this.scheme, that.scheme)) return false;
1413    if (!equal(this.fragment, that.fragment)) return false;
1414
1415    // Opaque
1416    if (this.isOpaque())
1417        return equal(this.schemeSpecificPart, that.schemeSpecificPart);
1418
1419    // Hierarchical
1420    if (!equal(this.path, that.path)) return false;
1421    if (!equal(this.query, that.query)) return false;
1422
1423    // Authorities
1424    if (this.authority == that.authority) return true;
1425    if (this.host != null) {
1426        // Server-based
1427        if (!equal(this.userInfo, that.userInfo)) return false;
1428        if (!equalIgnoringCase(this.host, that.host)) return false;
1429        if (this.port != that.port) return false;
1430    } else if (this.authority != null) {
1431        // Registry-based
1432        if (!equal(this.authority, that.authority)) return false;
1433    } else if (this.authority != that.authority) {
1434        return false;
1435    }
1436
1437    return true;
1438    }
1439
1440    /**
1441     * Returns a hash-code value for this URI.  The hash code is based upon all
1442     * of the URI's components, and satisfies the general contract of the
1443     * {@link java.lang.Object#hashCode() Object.hashCode} method.
1444     *
1445     * @return  A hash-code value for this URI
1446     */
1447    public int hashCode() {
1448    if (hash != 0)
1449        return hash;
1450    int h = hashIgnoringCase(0, scheme);
1451    h = hash(h, fragment);
1452    if (isOpaque()) {
1453        h = hash(h, schemeSpecificPart);
1454    } else {
1455        h = hash(h, path);
1456        h = hash(h, query);
1457        if (host != null) {
1458        h = hash(h, userInfo);
1459        h = hashIgnoringCase(h, host);
1460        h += 1949 * port;
1461        } else {
1462        h = hash(h, authority);
1463        }
1464    }
1465    hash = h;
1466    return h;
1467    }
1468
1469    /**
1470     * Compares this URI to another object, which must be a URI.
1471     *
1472     * <p> When comparing corresponding components of two URIs, if one
1473     * component is undefined but the other is defined then the first is
1474     * considered to be less than the second.  Unless otherwise noted, string
1475     * components are ordered according to their natural, case-sensitive
1476     * ordering as defined by the {@link java.lang.String#compareTo(Object)
1477     * String.compareTo} method.  String components that are subject to
1478     * encoding are compared by comparing their raw forms rather than their
1479     * encoded forms.
1480     *
1481     * <p> The ordering of URIs is defined as follows: </p>
1482     *
1483     * <ul type=disc>
1484     *
1485     *   <li><p> Two URIs with different schemes are ordered according the
1486     *   ordering of their schemes, without regard to case. </p></li>
1487     *
1488     *   <li><p> A hierarchical URI is considered to be less than an opaque URI
1489     *   with an identical scheme. </p></li>
1490     *
1491     *   <li><p> Two opaque URIs with identical schemes are ordered according
1492     *   to the ordering of their scheme-specific parts. </p></li>
1493     *
1494     *   <li><p> Two opaque URIs with identical schemes and scheme-specific
1495     *   parts are ordered according to the ordering of their
1496     *   fragments. </p></li>
1497     *
1498     *   <li><p> Two hierarchical URIs with identical schemes are ordered
1499     *   according to the ordering of their authority components: </p></li>
1500     *
1501     *   <ul type=disc>
1502     *
1503     *     <li><p> If both authority components are server-based then the URIs
1504     *     are ordered according to their user-information components; if these
1505     *     components are identical then the URIs are ordered according to the
1506     *     ordering of their hosts, without regard to case; if the hosts are
1507     *     identical then the URIs are ordered according to the ordering of
1508     *     their ports. </p></li>
1509     *
1510     *     <li><p> If one or both authority components are registry-based then
1511     *     the URIs are ordered according to the ordering of their authority
1512     *     components. </p></li>
1513     *
1514     *   </ul>
1515     *
1516     *   <li><p> Finally, two hierarchical URIs with identical schemes and
1517     *   authority components are ordered according to the ordering of their
1518     *   paths; if their paths are identical then they are ordered according to
1519     *   the ordering of their queries; if the queries are identical then they
1520     *   are ordered according to the order of their fragments. </p></li>
1521     *
1522     * </ul>
1523     *
1524     * <p> This method satisfies the general contract of the {@link
1525     * java.lang.Comparable#compareTo(Object) Comparable.compareTo}
1526     * method. </p>
1527     *
1528     * @param   ob
1529     *          The object to which this URI is to be compared
1530     *
1531     * @return  A negative integer, zero, or a positive integer as this URI is
1532     *          less than, equal to, or greater than the given URI
1533     *
1534     * @throws  ClassCastException
1535     *          If the given object is not a URI
1536     */
1537    public int compareTo(URI   that) {
1538    int c;
1539
1540    if ((c = compareIgnoringCase(this.scheme, that.scheme)) != 0)
1541        return c;
1542
1543    if (this.isOpaque()) {
1544        if (that.isOpaque()) {
1545        // Both opaque
1546        if ((c = compare(this.schemeSpecificPart,
1547                 that.schemeSpecificPart)) != 0)
1548            return c;
1549        return compare(this.fragment, that.fragment);
1550        }
1551        return +1;          // Opaque > hierarchical
1552    } else if (that.isOpaque()) {
1553        return -1;          // Hierarchical < opaque
1554    }
1555
1556    // Hierarchical
1557    if ((this.host != null) && (that.host != null)) {
1558        // Both server-based
1559        if ((c = compare(this.userInfo, that.userInfo)) != 0)
1560        return c;
1561        if ((c = compareIgnoringCase(this.host, that.host)) != 0)
1562        return c;
1563        if ((c = this.port - that.port) != 0)
1564        return c;
1565    } else {
1566        // If one or both authorities are registry-based then we simply
1567        // compare them in the usual, case-sensitive way.  If one is
1568        // registry-based and one is server-based then the strings are
1569        // guaranteed to be unequal, hence the comparison will never return
1570        // zero and the compareTo and equals methods will remain
1571        // consistent.
1572        if ((c = compare(this.authority, that.authority)) != 0) return c;
1573    }
1574
1575    if ((c = compare(this.path, that.path)) != 0) return c;
1576    if ((c = compare(this.query, that.query)) != 0) return c;
1577    return compare(this.fragment, that.fragment);
1578    }
1579
1580    /**
1581     * Returns the content of this URI as a string.
1582     *
1583     * <p> If this URI was created by invoking one of the constructors in this
1584     * class then a string equivalent to the original input string, or to the
1585     * string computed from the originally-given components, as appropriate, is
1586     * returned.  Otherwise this URI was created by normalization, resolution,
1587     * or relativization, and so a string is constructed from this URI's
1588     * components according to the rules specified in <a
1589     * HREF="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
1590     * section&nbsp;5.2, step&nbsp;7. </p>
1591     *
1592     * @return  The string form of this URI
1593     */
1594    public String   toString() {
1595    defineString();
1596    return string;
1597    }
1598
1599    /**
1600     * Returns the content of this URI as a US-ASCII string.
1601     *
1602     * <p> If this URI does not contain any characters in the <i>other</i>
1603     * category then an invocation of this method will return the same value as
1604     * an invocation of the {@link #toString() toString} method.  Otherwise
1605     * this method works as if by invoking that method and then <a
1606     * HREF="#encode">encoding</a> the result.  </p>
1607     *
1608     * @return  The string form of this URI, encoded as needed
1609     *          so that it only contains characters in the US-ASCII
1610     *          charset
1611     */
1612    public String   toASCIIString() {
1613    defineString();
1614    return encode(string);
1615    }
1616
1617
1618    // -- Serialization support --
1619
1620    /**
1621     * Saves the content of this URI to the given serial stream.
1622     *
1623     * <p> The only serializable field of a URI instance is its <tt>string</tt>
1624     * field.  That field is given a value, if it does not have one already,
1625     * and then the {@link java.io.ObjectOutputStream#defaultWriteObject()}
1626     * method of the given object-output stream is invoked. </p>
1627     *
1628     * @param  os  The object-output stream to which this object
1629     *             is to be written
1630     */
1631    private void writeObject(ObjectOutputStream   os)
1632    throws IOException  
1633    {
1634    defineString();
1635    os.defaultWriteObject();    // Writes the string field only
1636    }
1637
1638    /**
1639     * Reconstitutes a URI from the given serial stream.
1640     *
1641     * <p> The {@link java.io.ObjectInputStream#defaultReadObject()} method is
1642     * invoked to read the value of the <tt>string</tt> field.  The result is
1643     * then parsed in the usual way.
1644     *
1645     * @param  is  The object-input stream from which this object
1646     *             is being read
1647     */
1648    private void readObject(ObjectInputStream   is)
1649    throws ClassNotFoundException  , IOException  
1650    {
1651    port = -1;          // Argh
1652    is.defaultReadObject();
1653    try {
1654        new Parser(string).parse(false);
1655    } catch (URISyntaxException   x) {
1656        IOException   y = new InvalidObjectException  ("Invalid URI");
1657        y.initCause(x);
1658        throw y;
1659    }
1660    }
1661
1662
1663    // -- End of public methods --
1664
1665
1666    // -- Utility methods for string-field comparison and hashing --
1667
1668    // These methods return appropriate values for null string arguments,
1669    // thereby simplifying the equals, hashCode, and compareTo methods.
1670    //
1671    // The case-ignoring methods should only be applied to strings whose
1672    // characters are all known to be US-ASCII.  Because of this restriction,
1673    // these methods are faster than the similar methods in the String class.
1674
1675    // US-ASCII only
1676    private static int toLower(char c) {
1677    if ((c >= 'A') && (c <= 'Z'))
1678        return c + ('a' - 'A');
1679    return c;
1680    }
1681
1682    private static boolean equal(String   s, String   t) {
1683    if (s == t) return true;
1684    if ((s != null) && (t != null)) {
1685        if (s.length() != t.length())
1686        return false;
1687        if (s.indexOf('%') < 0)
1688        return s.equals(t);
1689        int n = s.length();
1690        for (int i = 0; i < n;) {
1691        char c = s.charAt(i);
1692        char d = t.charAt(i);
1693        if (c != '%') {
1694            if (c != d)
1695            return false;
1696            i++;
1697            continue;
1698        }
1699        i++;
1700        if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1701            return false;
1702        i++;
1703        if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1704            return false;
1705        i++;
1706        }
1707        return true;
1708    }
1709    return false;
1710    }
1711
1712    // US-ASCII only
1713    private static boolean equalIgnoringCase(String   s, String   t) {
1714    if (s == t) return true;
1715    if ((s != null) && (t != null)) {
1716        int n = s.length();
1717        if (t.length() != n)
1718        return false;
1719        for (int i = 0; i < n; i++) {
1720        if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1721            return false;
1722        }
1723        return true;
1724    }
1725    return false;
1726    }
1727
1728    private static int hash(int hash, String   s) {
1729    if (s == null) return hash;
1730    return hash * 127 + s.hashCode();
1731    }
1732
1733    // US-ASCII only
1734    private static int hashIgnoringCase(int hash, String   s) {
1735    if (s == null) return hash;
1736    int h = hash;
1737    int n = s.length();
1738    for (int i = 0; i < n; i++)
1739        h = 31 * h + toLower(s.charAt(i));
1740    return h;
1741    }
1742
1743    private static int compare(String   s, String   t) {
1744    if (s == t) return 0;
1745    if (s != null) {
1746        if (t != null)
1747        return s.compareTo(t);
1748        else
1749        return +1;
1750    } else {
1751        return -1;
1752    }
1753    }
1754
1755    // US-ASCII only
1756    private static int compareIgnoringCase(String   s, String   t) {
1757    if (s == t) return 0;
1758    if (s != null) {
1759        if (t != null) {
1760        int sn = s.length();
1761        int tn = t.length();
1762        int n = sn < tn ? sn : tn;
1763        for (int i = 0; i < n; i++) {
1764            int c = toLower(s.charAt(i)) - toLower(t.charAt(i));
1765            if (c != 0)
1766            return c;
1767        }
1768        return sn - tn;
1769        }
1770        return +1;
1771    } else {
1772        return -1;
1773    }
1774    }
1775
1776
1777    // -- String construction --
1778
1779    // If a scheme is given then the path, if given, must be absolute
1780    //
1781    private static void checkPath(String   s, String   scheme, String   path)
1782    throws URISyntaxException  
1783    {
1784    if (scheme != null) {
1785        if ((path != null)
1786        && ((path.length() > 0) && (path.charAt(0) != '/')))
1787        throw new URISyntaxException  (s,
1788                         "Relative path in absolute URI");
1789    }
1790    }
1791
1792    private void appendAuthority(StringBuffer   sb,
1793                 String   authority,
1794                 String   userInfo,
1795                 String   host,
1796                 int port)
1797    {
1798    if (host != null) {
1799        sb.append("//");
1800        if (userInfo != null) {
1801        sb.append(quote(userInfo, L_USERINFO, H_USERINFO));
1802        sb.append('@');
1803        }
1804        boolean needBrackets = ((host.indexOf(':') >= 0)
1805                    && !host.startsWith("[")
1806                    && !host.endsWith("]"));
1807        if (needBrackets) sb.append('[');
1808        sb.append(host);
1809        if (needBrackets) sb.append(']');
1810        if (port != -1) {
1811        sb.append(':');
1812        sb.append(port);
1813        }
1814    } else if (authority != null) {
1815        sb.append("//");
1816        if (authority.startsWith("[")) {
1817        int end = authority.indexOf("]");
1818        if (end != -1 && authority.indexOf(":")!=-1) {
1819            String   doquote, dontquote;
1820            if (end == authority.length()) {
1821            dontquote = authority;
1822            doquote = "";
1823            } else {
1824                dontquote = authority.substring(0,end+1);
1825            doquote = authority.substring(end+1);
1826            }
1827            sb.append (dontquote);
1828                sb.append(quote(doquote, 
1829                L_REG_NAME | L_SERVER,
1830                H_REG_NAME | H_SERVER));
1831        }
1832        } else {
1833            sb.append(quote(authority,
1834                L_REG_NAME | L_SERVER,
1835                H_REG_NAME | H_SERVER));
1836        }
1837    }
1838    }
1839
1840    private void appendSchemeSpecificPart(StringBuffer   sb,
1841                      String   opaquePart,
1842                      String   authority,
1843                      String   userInfo,
1844                      String   host,
1845                      int port,
1846                      String   path,
1847                      String   query)
1848    {
1849    if (opaquePart != null) {
1850        /* check if SSP begins with an IPv6 address
1851         * because we must not quote a literal IPv6 address
1852         */
1853        if (opaquePart.startsWith("//[")) {
1854        int end =  opaquePart.indexOf("]");
1855        if (end != -1 && opaquePart.indexOf(":")!=-1) {
1856            String   doquote, dontquote;
1857            if (end == opaquePart.length()) {
1858            dontquote = opaquePart;
1859            doquote = "";
1860            } else {
1861                dontquote = opaquePart.substring(0,end+1);
1862            doquote = opaquePart.substring(end+1);
1863            }
1864            sb.append (dontquote);
1865                sb.append(quote(doquote, L_URIC, H_URIC));
1866        }
1867        } else {
1868            sb.append(quote(opaquePart, L_URIC, H_URIC));
1869        }
1870    } else {
1871        appendAuthority(sb, authority, userInfo, host, port);
1872        if (path != null) 
1873        sb.append(quote(path, L_PATH, H_PATH));
1874        if (query != null) {
1875        sb.append('?');
1876        sb.append(quote(query, L_URIC, H_URIC));
1877        }
1878    }
1879    }
1880
1881    private void appendFragment(StringBuffer   sb, String   fragment) {
1882    if (fragment != null) {
1883        sb.append('#');
1884        sb.append(quote(fragment, L_URIC, H_URIC));
1885    }
1886    }
1887
1888    
1889    //
1890    // Note for maintainer: sun.net.www.ParseUtil.createURI(...) clones
1891    // this method and all necessary auxiliary code to fix 6274990-2127017.
1892    // Any change made here should be propagated to sun.net.www.ParseUtil.
1893    // The requirement only applies to 5.0 update release.
1894    //
1895    private String   toString(String   scheme,
1896                String   opaquePart,
1897                String   authority,
1898                String   userInfo,
1899                String   host,
1900                int port,
1901                String   path,
1902                String   query,
1903                String   fragment)
1904    {
1905    StringBuffer   sb = new StringBuffer  ();
1906    if (scheme != null) {
1907        sb.append(scheme);
1908        sb.append(':');
1909    }
1910    appendSchemeSpecificPart(sb, opaquePart,
1911                 authority, userInfo, host, port,
1912                 path, query);
1913    appendFragment(sb, fragment);
1914    return sb.toString();
1915    }
1916
1917    private void defineSchemeSpecificPart() {
1918    if (schemeSpecificPart != null) return;
1919    StringBuffer   sb = new StringBuffer  ();
1920    appendSchemeSpecificPart(sb, null, getAuthority(), getUserInfo(),
1921                 host, port, getPath(), getQuery());
1922    if (sb.length() == 0) return;
1923    schemeSpecificPart = sb.toString();
1924    }
1925
1926    private void defineString() {
1927    if (string != null) return;
1928
1929    StringBuffer   sb = new StringBuffer  ();
1930        if (scheme != null) {
1931            sb.append(scheme);
1932            sb.append(':');
1933        }
1934    if (isOpaque()) {
1935            sb.append(schemeSpecificPart);
1936        } else {
1937        if (host != null) {
1938                sb.append("//");
1939                if (userInfo != null) {
1940                    sb.append(userInfo);
1941                    sb.append('@');
1942                }
1943                boolean needBrackets = ((host.indexOf(':') >= 0)
1944                                    && !host.startsWith("[")
1945                                    && !host.endsWith("]"));
1946                if (needBrackets) sb.append('[');
1947                sb.append(host);
1948                if (needBrackets) sb.append(']');
1949                if (port != -1) {
1950                    sb.append(':');
1951                    sb.append(port);
1952                }
1953            } else if (authority != null) {
1954                sb.append("//");
1955                sb.append(authority);
1956        }
1957            if (path != null)
1958                sb.append(path);
1959            if (query != null) {
1960                sb.append('?');
1961                sb.append(query);
1962            }
1963        }
1964    if (fragment != null) {
1965            sb.append('#');
1966            sb.append(fragment);
1967    }
1968    string = sb.toString();
1969    }
1970
1971
1972    // -- Normalization, resolution, and relativization --
1973
1974    // RFC2396 5.2 (6)
1975    private static String   resolvePath(String   base, String   child,
1976                      boolean absolute)
1977    {
1978        int i = base.lastIndexOf('/');
1979    int cn = child.length();
1980    String   path = "";
1981
1982    if (cn == 0) {
1983        // 5.2 (6a)
1984        if (i >= 0)
1985        path = base.substring(0, i + 1);
1986    } else {
1987        StringBuffer   sb = new StringBuffer  (base.length() + cn);
1988        // 5.2 (6a)
1989        if (i >= 0)
1990        sb.append(base.substring(0, i + 1));
1991        // 5.2 (6b)
1992        sb.append(child);
1993        path = sb.toString();
1994    }
1995
1996    // 5.2 (6c-f)
1997    String   np = normalize(path);
1998
1999    // 5.2 (6g): If the result is absolute but the path begins with "../",
2000    // then we simply leave the path as-is
2001
2002    return np;
2003    }
2004
2005    // RFC2396 5.2
2006    private static URI   resolve(URI   base, URI   child) {
2007    // check if child if opaque first so that NPE is thrown 
2008    // if child is null.
2009    if (child.isOpaque() || base.isOpaque())
2010        return child;
2011
2012    // 5.2 (2): Reference to current document (lone fragment)
2013    if ((child.scheme == null) && (child.authority == null)
2014        && child.path.equals("") && (child.fragment != null)
2015        && (child.query == null)) {
2016        if ((base.fragment != null)
2017        && child.fragment.equals(base.fragment)) {
2018        return base;
2019        }
2020        URI   ru = new URI  ();
2021        ru.scheme = base.scheme;
2022        ru.authority = base.authority;
2023        ru.userInfo = base.userInfo;
2024        ru.host = base.host;
2025        ru.port = base.port;
2026        ru.path = base.path;
2027        ru.fragment = child.fragment;
2028        ru.query = base.query;
2029        return ru;
2030    }
2031
2032    // 5.2 (3): Child is absolute
2033    if (child.scheme != null)
2034        return child;
2035
2036    URI   ru = new URI  ();     // Resolved URI
2037    ru.scheme = base.scheme;
2038    ru.query = child.query;
2039    ru.fragment = child.fragment;
2040
2041    // 5.2 (4): Authority
2042    if (child.authority == null) {
2043        ru.authority = base.authority;
2044        ru.host = base.host;
2045        ru.userInfo = base.userInfo;
2046        ru.port = base.port;
2047
2048        String   cp = (child.path == null) ? "" : child.path;
2049        if ((cp.length() > 0) && (cp.charAt(0) == '/')) {
2050        // 5.2 (5): Child path is absolute
2051        ru.path = child.path;
2052        } else {
2053        // 5.2 (6): Resolve relative path
2054        ru.path = resolvePath(base.path, cp, base.isAbsolute());
2055        }
2056    } else {
2057        ru.authority = child.authority;
2058        ru.host = child.host;
2059        ru.userInfo = child.userInfo;
2060        ru.host = child.host;
2061        ru.port = child.port;
2062        ru.path = child.path;
2063    }
2064
2065    // 5.2 (7): Recombine (nothing to do here)
2066    return ru;
2067    }
2068
2069    // If the given URI's path is normal then return the URI;
2070    // o.w., return a new URI containing the normalized path.
2071    //
2072    private static URI   normalize(URI   u) {
2073    if (u.isOpaque() || (u.path == null) || (u.path.length() == 0))
2074        return u;
2075
2076    String   np = normalize(u.path);
2077    if (np == u.path)
2078        return u;
2079
2080    URI   v = new URI  ();
2081    v.scheme = u.scheme;
2082    v.fragment = u.fragment;
2083    v.authority = u.authority;
2084    v.userInfo = u.userInfo;
2085    v.host = u.host;
2086    v.port = u.port;
2087    v.path = np;
2088    v.query = u.query;
2089    return v;
2090    }
2091
2092    // If both URIs are hierarchical, their scheme and authority components are
2093    // identical, and the base path is a prefix of the child's path, then
2094    // return a relative URI that, when resolved against the base, yields the
2095    // child; otherwise, return the child.
2096    //
2097    private static URI   relativize(URI   base, URI   child) {
2098    // check if child if opaque first so that NPE is thrown 
2099        // if child is null.
2100    if (child.isOpaque() || base.isOpaque())
2101        return child;
2102    if (!equalIgnoringCase(base.scheme, child.scheme)
2103        || !equal(base.authority, child.authority))
2104        return child;
2105
2106    String   bp = normalize(base.path);
2107    String   cp = normalize(child.path);
2108    if (!bp.equals(cp)) {
2109        if (!bp.endsWith("/"))
2110        bp = bp + "/";
2111        if (!cp.startsWith(bp))
2112        return child;
2113    }
2114
2115    URI   v = new URI  ();
2116    v.path = cp.substring(bp.length());
2117    v.query = child.query;
2118    v.fragment = child.fragment;
2119    return v;
2120    }
2121
2122
2123
2124    // -- Path normalization --
2125
2126    // The following algorithm for path normalization avoids the creation of a
2127    // string object for each segment, as well as the use of a string buffer to
2128    // compute the final result, by using a single char array and editing it in
2129    // place.  The array is first split into segments, replacing each slash
2130    // with '\0' and creating a segment-index array, each element of which is
2131    // the index of the first char in the corresponding segment.  We then walk
2132    // through both arrays, removing ".", "..", and other segments as necessary
2133    // by setting their entries in the index array to -1.  Finally, the two
2134    // arrays are used to rejoin the segments and compute the final result.
2135    //
2136    // This code is based upon src/solaris/native/java/io/canonicalize_md.c
2137
2138
2139    // Check the given path to see if it might need normalization.  A path
2140    // might need normalization if it contains duplicate slashes, a "."
2141    // segment, or a ".." segment.  Return -1 if no further normalization is
2142    // possible, otherwise return the number of segments found.
2143    //
2144    // This method takes a string argument rather than a char array so that
2145    // this test can be performed without invoking path.toCharArray().
2146    //
2147    static private int needsNormalization(String   path) {
2148    boolean normal = true;
2149    int ns = 0;         // Number of segments
2150    int end = path.length() - 1;    // Index of last char in path
2151    int p = 0;          // Index of next char in path
2152
2153    // Skip initial slashes
2154    while (p <= end) {
2155        if (path.charAt(p) != '/') break;
2156        p++;
2157    }
2158    if (p > 1) normal = false;
2159
2160    // Scan segments
2161    while (p <= end) {
2162
2163        // Looking at "." or ".." ?
2164        if ((path.charAt(p) == '.')
2165        && ((p == end)
2166            || ((path.charAt(p + 1) == '/')
2167            || ((path.charAt(p + 1) == '.')
2168                && ((p + 1 == end)
2169                || (path.charAt(p + 2) == '/')))))) {
2170        normal = false;
2171        }
2172        ns++;
2173
2174        // Find beginning of next segment
2175        while (p <= end) {
2176        if (path.charAt(p++) != '/')
2177            continue;
2178
2179        // Skip redundant slashes
2180        while (p <= end) {
2181            if (path.charAt(p) != '/') break;
2182            normal = false;
2183            p++;
2184        }
2185
2186        break;
2187        }
2188    }
2189
2190    return normal ? -1 : ns;
2191    }
2192
2193
2194    // Split the given path into segments, replacing slashes with nulls and
2195    // filling in the given segment-index array.
2196    //
2197    // Preconditions:
2198    //   segs.length == Number of segments in path
2199    //
2200    // Postconditions:
2201    //   All slashes in path replaced by '\0'
2202    //   segs[i] == Index of first char in segment i (0 <= i < segs.length)
2203    //
2204    static private void split(char[] path, int[] segs) {
2205    int end = path.length - 1;  // Index of last char in path
2206    int p = 0;          // Index of next char in path
2207    int i = 0;          // Index of current segment
2208
2209    // Skip initial slashes
2210    while (p <= end) {
2211        if (path[p] != '/') break;
2212        path[p] = '\0';
2213        p++;
2214    }
2215
2216    while (p <= end) {
2217
2218        // Note start of segment
2219        segs[i++] = p++;
2220
2221        // Find beginning of next segment
2222        while (p <= end) {
2223        if (path[p++] != '/')
2224            continue;
2225        path[p - 1] = '\0';
2226
2227        // Skip redundant slashes
2228        while (p <= end) {
2229            if (path[p] != '/') break;
2230            path[p++] = '\0';
2231        }
2232        break;
2233        }
2234    }
2235
2236    if (i != segs.length)
2237        throw new InternalError  ();  // ASSERT
2238    }
2239
2240
2241    // Join the segments in the given path according to the given segment-index
2242    // array, ignoring those segments whose index entries have been set to -1,
2243    // and inserting slashes as needed.  Return the length of the resulting
2244    // path.
2245    //
2246    // Preconditions:
2247    //   segs[i] == -1 implies segment i is to be ignored
2248    //   path computed by split, as above, with '\0' having replaced '/'
2249    //
2250    // Postconditions:
2251    //   path[0] .. path[return value] == Resulting path
2252    //
2253    static private int join(char[] path, int[] segs) {
2254    int ns = segs.length;       // Number of segments
2255    int end = path.length - 1;  // Index of last char in path
2256    int p = 0;          // Index of next path char to write
2257
2258    if (path[p] == '\0') {
2259        // Restore initial slash for absolute paths
2260        path[p++] = '/';
2261    }
2262
2263    for (int i = 0; i < ns; i++) {
2264        int q = segs[i];        // Current segment
2265        if (q == -1)
2266        // Ignore this segment
2267        continue;
2268
2269        if (p == q) {
2270        // We're already at this segment, so just skip to its end
2271        while ((p <= end) && (path[p] != '\0'))
2272            p++;
2273        if (p <= end) {
2274            // Preserve trailing slash
2275            path[p++] = '/';
2276        }
2277        } else if (p < q) {
2278        // Copy q down to p
2279        while ((q <= end) && (path[q] != '\0'))
2280            path[p++] = path[q++];
2281        if (q <= end) {
2282            // Preserve trailing slash
2283            path[p++] = '/';
2284        }
2285        } else
2286        throw new InternalError  (); // ASSERT false
2287    }
2288
2289    return p;
2290    }
2291
2292
2293    // Remove "." segments from the given path, and remove segment pairs
2294    // consisting of a non-".." segment followed by a ".." segment.
2295    //
2296    private static void removeDots(char[] path, int[] segs) {
2297    int ns = segs.length;
2298    int end = path.length - 1;
2299
2300    for (int i = 0; i < ns; i++) {
2301        int dots = 0;       // Number of dots found (0, 1, or 2)
2302
2303        // Find next occurrence of "." or ".."
2304        do {
2305        int p = segs[i];
2306        if (path[p] == '.') {
2307            if (p == end) {
2308            dots = 1;
2309            break;
2310            } else if (path[p + 1] == '\0') {
2311            dots = 1;
2312            break;
2313            } else if ((path[p + 1] == '.')
2314                   && ((p + 1 == end)
2315                   || (path[p + 2] == '\0'))) {
2316            dots = 2;
2317            break;
2318            }
2319        }
2320        i++;
2321        } while (i < ns);
2322        if ((i > ns) || (dots == 0))
2323        break;
2324
2325        if (dots == 1) {
2326        // Remove this occurrence of "."
2327        segs[i] = -1;
2328        } else {
2329        // If there is a preceding non-".." segment, remove both that
2330        // segment and this occurrence of ".."; otherwise, leave this
2331        // ".." segment as-is.
2332        int j;
2333        for (j = i - 1; j >= 0; j--) {
2334            if (segs[j] != -1) break;
2335        }
2336        if (j >= 0) {
2337            int q = segs[j];
2338            if (!((path[q] == '.')
2339              && (path[q + 1] == '.')
2340              && (path[q + 2] == '\0'))) {
2341            segs[i] = -1;
2342            segs[j] = -1;
2343            }
2344        }
2345        }
2346    }
2347    }
2348
2349
2350    // DEVIATION: If the normalized path is relative, and if the first
2351    // segment could be parsed as a scheme name, then prepend a "." segment
2352    //
2353    private static void maybeAddLeadingDot(char[] path, int[] segs) {
2354
2355    if (path[0] == '\0')
2356        // The path is absolute
2357        return;
2358
2359    int ns = segs.length;
2360    int f = 0;          // Index of first segment
2361    while (f < ns) {
2362        if (segs[f] >= 0)
2363        break;
2364        f++;
2365    }
2366    if ((f >= ns) || (f == 0))
2367        // The path is empty, or else the original first segment survived,
2368        // in which case we already know that no leading "." is needed
2369        return;
2370
2371    int p = segs[f];
2372    while ((p < path.length) && (path[p] != ':') && (path[p] != '\0')) p++;
2373    if (p >= path.length || path[p] == '\0')
2374        // No colon in first segment, so no "." needed
2375        return;
2376
2377    // At this point we know that the first segment is unused,
2378    // hence we can insert a "." segment at that position
2379    path[0] = '.';
2380    path[1] = '\0';
2381    segs[0] = 0;
2382    }
2383
2384
2385    // Normalize the given path string.  A normal path string has no empty
2386    // segments (i.e., occurrences of "//"), no segments equal to ".", and no
2387    // segments equal to ".." that are preceded by a segment not equal to "..".
2388    // In contrast to Unix-style pathname normalization, for URI paths we
2389    // always retain trailing slashes.
2390    //
2391    private static String   normalize(String   ps) {
2392
2393    // Does this path need normalization?
2394    int ns = needsNormalization(ps);    // Number of segments
2395    if (ns < 0)
2396        // Nope -- just return it
2397        return ps;
2398
2399    char[] path = ps.toCharArray();     // Path in char-array form
2400
2401    // Split path into segments
2402    int[] segs = new int[ns];       // Segment-index array
2403    split(path, segs);
2404
2405    // Remove dots
2406    removeDots(path, segs);
2407
2408    // Prevent scheme-name confusion
2409    maybeAddLeadingDot(path, segs);
2410
2411    // Join the remaining segments and return the result
2412    String   s = new String  (path, 0, join(path, segs));
2413    if (s.equals(ps)) {
2414        // string was already normalized
2415        return ps;
2416    }
2417    return s;
2418    }
2419
2420
2421
2422    // -- Character classes for parsing --
2423
2424    // RFC2396 precisely specifies which characters in the US-ASCII charset are
2425    // permissible in the various components of a URI reference.  We here
2426    // define a set of mask pairs to aid in enforcing these restrictions.  Each
2427    // mask pair consists of two longs, a low mask and a high mask.  Taken
2428    // together they represent a 128-bit mask, where bit i is set iff the
2429    // character with value i is permitted.
2430    //
2431    // This approach is more efficient than sequentially searching arrays of
2432    // permitted characters.  It could be made still more efficient by
2433    // precompiling the mask information so that a character's presence in a
2434    // given mask could be determined by a single table lookup.
2435
2436    // Compute the low-order mask for the characters in the given string
2437    private static long lowMask(String   chars) {
2438    int n = chars.length();
2439    long m = 0;
2440    for (int i = 0; i < n; i++) {
2441        char c = chars.charAt(i);
2442        if (c < 64)
2443        m |= (1L << c);
2444    }
2445    return m;
2446    }
2447
2448    // Compute the high-order mask for the characters in the given string
2449    private static long highMask(String   chars) {
2450    int n = chars.length();
2451    long m = 0;
2452    for (int i = 0; i < n; i++) {
2453        char c = chars.charAt(i);
2454        if ((c >= 64) && (c < 128))
2455        m |= (1L << (c - 64));
2456    }
2457    return m;
2458    }
2459
2460    // Compute a low-order mask for the characters
2461    // between first and last, inclusive
2462    private static long lowMask(char first, char last) {
2463    long m = 0;
2464    int f = Math.max(Math.min(first, 63), 0);
2465    int l = Math.max(Math.min(last, 63), 0);
2466    for (int i = f; i <= l; i++)
2467        m |= 1L << i;
2468    return m;
2469    }
2470
2471    // Compute a high-order mask for the characters
2472    // between first and last, inclusive
2473    private static long highMask(char first, char last) {
2474    long m = 0;
2475    int f = Math.max(Math.min(first, 127), 64) - 64;
2476    int l = Math.max(Math.min(last, 127), 64) - 64;
2477    for (int i = f; i <= l; i++)
2478        m |= 1L << i;
2479    return m;
2480    }
2481
2482    // Tell whether the given character is permitted by the given mask pair
2483    private static boolean match(char c, long lowMask, long highMask) {
2484    if (c < 64)
2485        return ((1L << c) & lowMask) != 0;
2486    if (c < 128)
2487        return ((1L << (c - 64)) & highMask) != 0;
2488    return false;
2489    }
2490
    // Character-class masks, in reverse order from RFC2396 because
    // initializers for static fields cannot make forward references.
    //
    // Every class is a pair of constants: L_xxx covers characters 0..63 and
    // H_xxx covers characters 64..127.  Where a class is built from a string
    // literal, the same literal is passed to both lowMask and highMask; each
    // helper keeps only the characters that fall in its own half.

    // digit    = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
    //            "8" | "9"
    private static final long L_DIGIT = lowMask('0', '9');
    private static final long H_DIGIT = 0L;

    // upalpha  = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
    //            "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
    //            "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
    private static final long L_UPALPHA = 0L;
    private static final long H_UPALPHA = highMask('A', 'Z');

    // lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" |
    //            "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" |
    //            "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
    private static final long L_LOWALPHA = 0L;
    private static final long H_LOWALPHA = highMask('a', 'z');

    // alpha         = lowalpha | upalpha
    private static final long L_ALPHA = L_LOWALPHA | L_UPALPHA;
    private static final long H_ALPHA = H_LOWALPHA | H_UPALPHA;

    // alphanum      = alpha | digit
    private static final long L_ALPHANUM = L_DIGIT | L_ALPHA;
    private static final long H_ALPHANUM = H_DIGIT | H_ALPHA;

    // hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
    //                         "a" | "b" | "c" | "d" | "e" | "f"
    private static final long L_HEX = L_DIGIT;
    private static final long H_HEX = highMask('A', 'F') | highMask('a', 'f');

    // mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
    //                 "(" | ")"
    private static final long L_MARK = lowMask("-_.!~*'()");
    private static final long H_MARK = highMask("-_.!~*'()");

    // unreserved    = alphanum | mark
    private static final long L_UNRESERVED = L_ALPHANUM | L_MARK;
    private static final long H_UNRESERVED = H_ALPHANUM | H_MARK;

    // reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
    //                 "$" | "," | "[" | "]"
    // Added per RFC2732: "[", "]"
    private static final long L_RESERVED = lowMask(";/?:@&=+$,[]");
    private static final long H_RESERVED = highMask(";/?:@&=+$,[]");

    // The zero'th bit is used to indicate that escape pairs and non-US-ASCII
    // characters are allowed; this is handled by the scanEscape method below.
    // Code that needs to know whether a mask permits escapes tests it with
    // (lowMask & L_ESCAPED) != 0.
    private static final long L_ESCAPED = 1L;
    private static final long H_ESCAPED = 0L;

    // uric          = reserved | unreserved | escaped
    private static final long L_URIC = L_RESERVED | L_UNRESERVED | L_ESCAPED;
    private static final long H_URIC = H_RESERVED | H_UNRESERVED | H_ESCAPED;

    // pchar         = unreserved | escaped |
    //                 ":" | "@" | "&" | "=" | "+" | "$" | ","
    private static final long L_PCHAR
        = L_UNRESERVED | L_ESCAPED | lowMask(":@&=+$,");
    private static final long H_PCHAR
        = H_UNRESERVED | H_ESCAPED | highMask(":@&=+$,");

    // All valid path characters: pchar plus the parameter separator ";"
    // and the segment separator "/"
    private static final long L_PATH = L_PCHAR | lowMask(";/");
    private static final long H_PATH = H_PCHAR | highMask(";/");

    // Dash, for use in domainlabel and toplabel
    private static final long L_DASH = lowMask("-");
    private static final long H_DASH = highMask("-");

    // Dot, for use in hostnames
    private static final long L_DOT = lowMask(".");
    private static final long H_DOT = highMask(".");

    // userinfo      = *( unreserved | escaped |
    //                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
    private static final long L_USERINFO
        = L_UNRESERVED | L_ESCAPED | lowMask(";:&=+$,");
    private static final long H_USERINFO
        = H_UNRESERVED | H_ESCAPED | highMask(";:&=+$,");

    // reg_name      = 1*( unreserved | escaped | "$" | "," |
    //                     ";" | ":" | "@" | "&" | "=" | "+" )
    private static final long L_REG_NAME
        = L_UNRESERVED | L_ESCAPED | lowMask("$,;:@&=+");
    private static final long H_REG_NAME
        = H_UNRESERVED | H_ESCAPED | highMask("$,;:@&=+");

    // All valid characters for server-based authorities
    private static final long L_SERVER
        = L_USERINFO | L_ALPHANUM | L_DASH | lowMask(".:@[]");
    private static final long H_SERVER
        = H_USERINFO | H_ALPHANUM | H_DASH | highMask(".:@[]");

    // Special case of server authority that represents an IPv6 address
    // In this case, a % does not signify an escape sequence
    private static final long L_SERVER_PERCENT
        = L_SERVER | lowMask("%");
    private static final long H_SERVER_PERCENT
        = H_SERVER | highMask("%");

    // Left bracket alone
    private static final long L_LEFT_BRACKET = lowMask("[");
    private static final long H_LEFT_BRACKET = highMask("[");

    // scheme        = alpha *( alpha | digit | "+" | "-" | "." )
    // (this mask covers the characters after the first; the leading alpha
    // is checked separately against L_ALPHA/H_ALPHA by the parser)
    private static final long L_SCHEME = L_ALPHA | L_DIGIT | lowMask("+-.");
    private static final long H_SCHEME = H_ALPHA | H_DIGIT | highMask("+-.");

    // uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
    //                 "&" | "=" | "+" | "$" | ","
    private static final long L_URIC_NO_SLASH
        = L_UNRESERVED | L_ESCAPED | lowMask(";?:@&=+$,");
    private static final long H_URIC_NO_SLASH
        = H_UNRESERVED | H_ESCAPED | highMask(";?:@&=+$,");
2606
2607
2608    // -- Escaping and encoding --
2609
2610    private final static char[] hexDigits = {
2611    '0', '1', '2', '3', '4', '5', '6', '7',
2612    '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
2613    };
2614
2615    private static void appendEscape(StringBuffer   sb, byte b) {
2616    sb.append('%');
2617    sb.append(hexDigits[(b >> 4) & 0x0f]);
2618    sb.append(hexDigits[(b >> 0) & 0x0f]);
2619    }
2620
2621    private static void appendEncoded(StringBuffer   sb, char c) {
2622    ByteBuffer   bb = null;
2623    try {
2624        bb = ThreadLocalCoders.encoderFor("UTF-8")
2625        .encode(CharBuffer.wrap("" + c));
2626    } catch (CharacterCodingException   x) {
2627        assert false;
2628    }
2629    while (bb.hasRemaining()) {
2630        int b = bb.get() & 0xff;
2631        if (b >= 0x80)
2632        appendEscape(sb, (byte)b);
2633        else
2634        sb.append((char)b);
2635    }
2636    }
2637
2638    // Quote any characters in s that are not permitted
2639    // by the given mask pair
2640    //
2641    private static String   quote(String   s, long lowMask, long highMask) {
2642    int n = s.length();
2643    StringBuffer   sb = null;
2644    boolean allowNonASCII = ((lowMask & L_ESCAPED) != 0);
2645    for (int i = 0; i < s.length(); i++) {
2646        char c = s.charAt(i);
2647        if (c < '\u0080') {
2648        if (!match(c, lowMask, highMask)) {
2649            if (sb == null) {
2650            sb = new StringBuffer  ();
2651            sb.append(s.substring(0, i));
2652            }
2653            appendEscape(sb, (byte)c);
2654        } else {
2655            if (sb != null)
2656            sb.append(c);
2657        }
2658        } else if (allowNonASCII
2659               && (Character.isSpaceChar(c)
2660               || Character.isISOControl(c))) {
2661        if (sb == null) {
2662            sb = new StringBuffer  ();
2663            sb.append(s.substring(0, i));
2664        }
2665        appendEncoded(sb, c);
2666        } else {
2667        if (sb != null)
2668            sb.append(c);
2669        }
2670    }
2671    return (sb == null) ? s : sb.toString();
2672    }
2673
2674    // Encodes all characters >= \u0080 into escaped, normalized UTF-8 octets,
2675    // assuming that s is otherwise legal
2676    //
2677    private static String   encode(String   s) {
2678    int n = s.length();
2679    if (n == 0)
2680        return s;
2681
2682    // First check whether we actually need to encode
2683    for (int i = 0;;) {
2684        if (s.charAt(i) >= '\u0080')
2685        break;
2686        if (++i >= n)
2687        return s;
2688    }
2689
2690    String   ns = Normalizer.normalize(s, Normalizer.COMPOSE, 0);
2691    ByteBuffer   bb = null;
2692    try {
2693        bb = ThreadLocalCoders.encoderFor("UTF-8")
2694        .encode(CharBuffer.wrap(ns));
2695    } catch (CharacterCodingException   x) {
2696        assert false;
2697    }
2698
2699    StringBuffer   sb = new StringBuffer  ();
2700    while (bb.hasRemaining()) {
2701        int b = bb.get() & 0xff;
2702        if (b >= 0x80)
2703        appendEscape(sb, (byte)b);
2704        else
2705        sb.append((char)b);
2706    }
2707    return sb.toString();
2708    }
2709
2710    private static int decode(char c) {
2711    if ((c >= '0') && (c <= '9'))
2712        return c - '0';
2713    if ((c >= 'a') && (c <= 'f'))
2714        return c - 'a' + 10;
2715    if ((c >= 'A') && (c <= 'F'))
2716        return c - 'A' + 10;
2717    assert false;
2718    return -1;
2719    }
2720
2721    private static byte decode(char c1, char c2) {
2722    return (byte)(  ((decode(c1) & 0xf) << 4)
2723              | ((decode(c2) & 0xf) << 0));
2724    }
2725
2726    // Evaluates all escapes in s, applying UTF-8 decoding if needed.  Assumes
2727    // that escapes are well-formed syntactically, i.e., of the form %XX.  If a
2728    // sequence of escaped octets is not valid UTF-8 then the erroneous octets
2729    // are replaced with '\uFFFD'.
2730    // Exception: any "%" found between "[]" is left alone. It is an IPv6 literal
2731    //            with a scope_id
2732    //
2733    private static String   decode(String   s) {
2734    if (s == null)
2735        return s;
2736    int n = s.length();
2737    if (n == 0)
2738        return s;
2739    if (s.indexOf('%') < 0)
2740        return s;
2741
2742    byte[] ba = new byte[n];
2743    StringBuffer   sb = new StringBuffer  (n);
2744    ByteBuffer   bb = ByteBuffer.allocate(n);
2745    CharBuffer   cb = CharBuffer.allocate(n);
2746    CharsetDecoder   dec = ThreadLocalCoders.decoderFor("UTF-8")
2747        .onMalformedInput(CodingErrorAction.REPLACE)
2748        .onUnmappableCharacter(CodingErrorAction.REPLACE);
2749
2750    // This is not horribly efficient, but it will do for now
2751    char c = s.charAt(0);
2752        boolean betweenBrackets = false;
2753
2754    for (int i = 0; i < n;) {
2755        assert c == s.charAt(i);    // Loop invariant
2756        if (c == '[') {
2757        betweenBrackets = true;
2758        } else if (betweenBrackets && c == ']') {
2759        betweenBrackets = false;
2760        }
2761        if (c != '%' || betweenBrackets) {
2762        sb.append(c);
2763        if (++i >= n)
2764            break;
2765        c = s.charAt(i);
2766        continue;
2767        }
2768        bb.clear();
2769        int ui = i;
2770        for (;;) {
2771        assert (n - i >= 2);
2772        bb.put(decode(s.charAt(++i), s.charAt(++i)));
2773        if (++i >= n)
2774            break;
2775        c = s.charAt(i);
2776        if (c != '%')
2777            break;
2778        }
2779        bb.flip();
2780        cb.clear();
2781        dec.reset();
2782        CoderResult   cr = dec.decode(bb, cb, true);
2783        assert cr.isUnderflow();
2784        cr = dec.flush(cb);
2785        assert cr.isUnderflow();
2786        sb.append(cb.flip().toString());
2787    }
2788
2789    return sb.toString();
2790    }
2791
2792
2793    // -- Parsing --
2794
2795    // For convenience we wrap the input URI string in a new instance of the
2796    // following internal class.  This saves always having to pass the input
2797    // string as an argument to each internal scan/parse method.
2798
2799    private class Parser {
2800
2801    private String   input;       // URI input string
2802    private boolean requireServerAuthority = false;
2803
2804    Parser(String   s) {
2805        input = s;
2806        string = s;
2807    }
2808
2809    // -- Methods for throwing URISyntaxException in various ways --
2810
2811    private void fail(String   reason) throws URISyntaxException   {
2812        throw new URISyntaxException  (input, reason);
2813    }
2814
2815    private void fail(String   reason, int p) throws URISyntaxException   {
2816        throw new URISyntaxException  (input, reason, p);
2817    }
2818
2819    private void failExpecting(String   expected, int p)
2820        throws URISyntaxException  
2821    {
2822        fail("Expected " + expected, p);
2823    }
2824
2825    private void failExpecting(String   expected, String   prior, int p)
2826        throws URISyntaxException  
2827    {
2828        fail("Expected " + expected + " following " + prior, p);
2829    }
2830
2831
2832    // -- Simple access to the input string --
2833
2834    // Return a substring of the input string
2835    //
2836    private String   substring(int start, int end) {
2837        return input.substring(start, end);
2838    }
2839
2840    // Return the char at position p,
2841    // assuming that p < input.length()
2842    //
2843    private char charAt(int p) {
2844        return input.charAt(p);
2845    }
2846
2847    // Tells whether start < end and, if so, whether charAt(start) == c
2848    //
2849    private boolean at(int start, int end, char c) {
2850        return (start < end) && (charAt(start) == c);
2851    }
2852
2853    // Tells whether start + s.length() < end and, if so,
2854    // whether the chars at the start position match s exactly
2855    //
2856    private boolean at(int start, int end, String   s) {
2857        int p = start;
2858        int sn = s.length();
2859        if (sn > end - p)
2860        return false;
2861        int i = 0;
2862        while (i < sn) {
2863        if (charAt(p++) != s.charAt(i)) {
2864            break;
2865        }
2866        i++;
2867        }
2868        return (i == sn);
2869    }
2870
2871
2872    // -- Scanning --
2873
2874    // The various scan and parse methods that follow use a uniform
2875    // convention of taking the current start position and end index as
2876    // their first two arguments.  The start is inclusive while the end is
2877    // exclusive, just as in the String class, i.e., a start/end pair
    // denotes the half-open interval [start, end) of the input string.
2879    //
2880    // These methods never proceed past the end position.  They may return
2881    // -1 to indicate outright failure, but more often they simply return
2882    // the position of the first char after the last char scanned.  Thus
2883    // a typical idiom is
2884    //
2885    //     int p = start;
2886    //     int q = scan(p, end, ...);
2887    //     if (q > p)
2888    //         // We scanned something
2889    //         ...;
2890    //     else if (q == p)
2891    //         // We scanned nothing
2892    //         ...;
2893    //     else if (q == -1)
2894    //         // Something went wrong
2895    //         ...;
2896
2897
2898    // Scan a specific char: If the char at the given start position is
2899    // equal to c, return the index of the next char; otherwise, return the
2900    // start position.
2901    //
2902    private int scan(int start, int end, char c) {
2903        if ((start < end) && (charAt(start) == c))
2904        return start + 1;
2905        return start;
2906    }
2907
2908    // Scan forward from the given start position.  Stop at the first char
2909    // in the err string (in which case -1 is returned), or the first char
2910    // in the stop string (in which case the index of the preceding char is
2911    // returned), or the end of the input string (in which case the length
2912    // of the input string is returned).  May return the start position if
2913    // nothing matches.
2914    //
2915    private int scan(int start, int end, String   err, String   stop) {
2916        int p = start;
2917        while (p < end) {
2918        char c = charAt(p);
2919        if (err.indexOf(c) >= 0)
2920            return -1;
2921        if (stop.indexOf(c) >= 0)
2922            break;
2923        p++;
2924        }
2925        return p;
2926    }
2927
2928    // Scan a potential escape sequence, starting at the given position,
2929    // with the given first char (i.e., charAt(start) == c).
2930    //
2931    // This method assumes that if escapes are allowed then visible
2932    // non-US-ASCII chars are also allowed.
2933    //
2934    private int scanEscape(int start, int n, char first)
2935        throws URISyntaxException  
2936    {
2937        int p = start;
2938        char c = first;
2939        if (c == '%') {
2940        // Process escape pair
2941        if ((p + 3 <= n)
2942            && match(charAt(p + 1), L_HEX, H_HEX)
2943            && match(charAt(p + 2), L_HEX, H_HEX)) {
2944            return p + 3;
2945        }
2946        fail("Malformed escape pair", p);
2947        } else if ((c > 128)
2948               && !Character.isSpaceChar(c)
2949               && !Character.isISOControl(c)) {
2950        // Allow unescaped but visible non-US-ASCII chars
2951        return p + 1;
2952        }
2953        return p;
2954    }
2955
2956    // Scan chars that match the given mask pair
2957    //
2958    private int scan(int start, int n, long lowMask, long highMask)
2959        throws URISyntaxException  
2960    {
2961        int p = start;
2962        while (p < n) {
2963        char c = charAt(p);
2964        if (match(c, lowMask, highMask)) {
2965            p++;
2966            continue;
2967        }
2968        if ((lowMask & L_ESCAPED) != 0) {
2969            int q = scanEscape(p, n, c);
2970            if (q > p) {
2971            p = q;
2972            continue;
2973            }
2974        }
2975        break;
2976        }
2977        return p;
2978    }
2979
2980    // Check that each of the chars in [start, end) matches the given mask
2981    //
2982    private void checkChars(int start, int end,
2983                long lowMask, long highMask,
2984                String   what)
2985        throws URISyntaxException  
2986    {
2987        int p = scan(start, end, lowMask, highMask);
2988        if (p < end)
2989        fail("Illegal character in " + what, p);
2990    }
2991
2992    // Check that the char at position p matches the given mask
2993    //
2994    private void checkChar(int p,
2995                   long lowMask, long highMask,
2996                   String   what)
2997        throws URISyntaxException  
2998    {
2999        checkChars(p, p + 1, lowMask, highMask, what);
3000    }
3001
3002
3003    // -- Parsing --
3004
3005    // [<scheme>:]<scheme-specific-part>[#<fragment>]
3006    //
3007    void parse(boolean rsa) throws URISyntaxException   {
3008        requireServerAuthority = rsa;
3009        int ssp;            // Start of scheme-specific part
3010        int n = input.length();
3011        int p = scan(0, n, "/?#", ":");
3012        if ((p >= 0) && at(p, n, ':')) {
3013        if (p == 0)
3014            failExpecting("scheme name", 0);
3015        checkChar(0, L_ALPHA, H_ALPHA, "scheme name");
3016        checkChars(1, p, L_SCHEME, H_SCHEME, "scheme name");
3017        scheme = substring(0, p);
3018        p++;            // Skip ':'
3019        ssp = p;
3020        if (at(p, n, '/')) {
3021            p = parseHierarchical(p, n);
3022        } else {
3023            int q = scan(p, n, "", "#");
3024            if (q <= p)
3025            failExpecting("scheme-specific part", p);
3026            checkChars(p, q, L_URIC, H_URIC, "opaque part");
3027            p = q;
3028        }
3029        } else {
3030        ssp = 0;
3031        p = parseHierarchical(0, n);
3032        }
3033        schemeSpecificPart = substring(ssp, p);
3034        if (at(p, n, '#')) {
3035        checkChars(p + 1, n, L_URIC, H_URIC, "fragment");
3036        fragment = substring(p + 1, n);
3037        p = n;
3038        }
3039        if (p < n)
3040        fail("end of URI", p);
3041    }
3042
3043    // [//authority]<path>[?<query>]
3044    //
3045    // DEVIATION from RFC2396: We allow an empty authority component as
3046    // long as it's followed by a non-empty path, query component, or
3047    // fragment component.  This is so that URIs such as "file:///foo/bar"
3048    // will parse.  This seems to be the intent of RFC2396, though the
3049    // grammar does not permit it.  If the authority is empty then the
3050    // userInfo, host, and port components are undefined.
3051    //
3052    // DEVIATION from RFC2396: We allow empty relative paths.  This seems
3053    // to be the intent of RFC2396, but the grammar does not permit it.
3054    // The primary consequence of this deviation is that "#f" parses as a
3055    // relative URI with an empty path.
3056    //
3057    private int parseHierarchical(int start, int n)
3058        throws URISyntaxException  
3059    {
3060        int p = start;
3061        if (at(p, n, '/') && at(p + 1, n, '/')) {
3062        p += 2;
3063        int q = scan(p, n, "", "/?#");
3064        if (q > p) {
3065            p = parseAuthority(p, q);
3066        } else if (q < n) {
3067            // DEVIATION: Allow empty authority prior to non-empty 
3068            // path, query component or fragment identifier
3069        } else
3070            failExpecting("authority", p);
3071        }
3072        int q = scan(p, n, "", "?#"); // DEVIATION: May be empty
3073        checkChars(p, q, L_PATH, H_PATH, "path");
3074        path = substring(p, q);
3075        p = q;
3076        if (at(p, n, '?')) {
3077        p++;
3078        q = scan(p, n, "", "#");
3079        checkChars(p, q, L_URIC, H_URIC, "query");
3080        query = substring(p, q);
3081        p = q;
3082        }
3083        return p;
3084    }
3085
3086    // authority     = server | reg_name
3087    //
3088    // Ambiguity: An authority that is a registry name rather than a server
3089    // might have a prefix that parses as a server.  We use the fact that
3090    // the authority component is always followed by '/' or the end of the
3091    // input string to resolve this: If the complete authority did not
3092    // parse as a server then we try to parse it as a registry name.
3093    //
    // Parse the authority occupying [start, n) of the input.  On success the
    // authority field is set to that substring and n is returned; for a
    // server-based authority parseServer is responsible for the userInfo,
    // host and port components.  Throws URISyntaxException if the text is
    // acceptable neither as a server nor as a registry name, or if it fails
    // to parse as a server while requireServerAuthority is set.
    private int parseAuthority(int start, int n)
        throws URISyntaxException
    {
        int p = start;
        int q = p;                      // How far the server parse got
        URISyntaxException ex = null;   // Failure of the server parse, if any

        boolean serverChars;            // All chars legal for a server?
        boolean regChars;               // All chars legal for a reg_name?

        // NOTE(review): scan(p, n, "", "]") returns n when no ']' is
        // present, so this test appears to succeed for every authority
        // that does not begin with ']' rather than only for those holding
        // an IPv6 literal -- confirm the intent.
        if (scan(p, n, "", "]") > p) {
            // contains a literal IPv6 address, therefore % is allowed
            serverChars = (scan(p, n, L_SERVER_PERCENT, H_SERVER_PERCENT) == n);
        } else {
            serverChars = (scan(p, n, L_SERVER, H_SERVER) == n);
        }
        regChars = (scan(p, n, L_REG_NAME, H_REG_NAME) == n);

        if (regChars && !serverChars) {
            // Must be a registry-based authority
            authority = substring(p, n);
            return n;
        }

        if (serverChars) {
            // Might be (probably is) a server-based authority, so attempt
            // to parse it as such.  If the attempt fails, try to treat it
            // as a registry-based authority.
            try {
                q = parseServer(p, n);
                if (q < n)
                    failExpecting("end of authority", q);
                authority = substring(p, n);
            } catch (URISyntaxException x) {
                // Undo results of failed parse
                userInfo = null;
                host = null;
                port = -1;
                if (requireServerAuthority) {
                    // If we're insisting upon a server-based authority,
                    // then just re-throw the exception
                    throw x;
                } else {
                    // Save the exception in case it doesn't parse as a
                    // registry either
                    ex = x;
                    q = p;
                }
            }
        }

        // q < n here means the authority has not been accepted as a server:
        // either the server parse was not attempted (q is still start) or
        // it failed (q was reset to start in the catch block above)
        if (q < n) {
            if (regChars) {
                // Registry-based authority
                authority = substring(p, n);
            } else if (ex != null) {
                // Re-throw exception; it was probably due to
                // a malformed IPv6 address
                throw ex;
            } else {
                fail("Illegal character in authority", q);
            }
        }

        return n;
    }
3160
3161
3162    // [<userinfo>@]<host>[:<port>]
3163    //
3164    private int parseServer(int start, int n)
3165        throws URISyntaxException  
3166    {
3167        int p = start;
3168        int q;
3169
3170        // userinfo
3171        q = scan(p, n, "/?#", "@");
3172        if ((q >= p) && at(q, n, '@')) {
3173        checkChars(p, q, L_USERINFO, H_USERINFO, "user info");
3174        userInfo = substring(p, q);
3175        p = q + 1;      // Skip '@'
3176        }
3177
3178        // hostname, IPv4 address, or IPv6 address
3179        if (at(p, n, '[')) {
3180        // DEVIATION from RFC2396: Support IPv6 addresses, per RFC2732
3181        p++;
3182        q = scan(p, n, "/?#", "]");
3183        if ((q > p) && at(q, n, ']')) {
3184            // look for a "%" scope id
3185            int r = scan (p, q, "", "%");
3186            if (r > p) {
3187                parseIPv6Reference(p, r);
3188            if (r+1 == q) {
3189                fail ("scope id expected");
3190            }
3191            checkChars (r+1, q, L_ALPHANUM, H_ALPHANUM, 
3192                        "scope id");
3193            } else {
3194                parseIPv6Reference(p, q);
3195            }
3196                host = substring(p-1, q+1);
3197            p = q + 1;
3198        } else {
3199            failExpecting("closing bracket for IPv6 address", q);
3200        }
3201        } else {
3202        q = parseIPv4Address(p, n);
3203        if (q <= p)
3204            q = parseHostname(p, n);
3205        p = q;
3206        }
3207
3208        // port
3209        if (at(p, n, ':')) {
3210        p++;
3211        q = scan(p, n, "", "/");
3212        if (q > p) {
3213            checkChars(p, q, L_DIGIT, H_DIGIT, "port number");
3214            try {
3215            port = Integer.parseInt(substring(p, q));
3216            } catch (NumberFormatException   x) {
3217            fail("Malformed port number", p);
3218            }
3219            p = q;
3220        }
3221        }
3222        if (p < n)
3223        failExpecting("port number", p);
3224
3225        return p;
3226    }
3227
3228    // Scan a string of decimal digits whose value fits in a byte
3229    //
3230    private int scanByte(int start, int n)
3231        throws URISyntaxException  
3232    {
3233        int p = start;
3234        int q = scan(p, n, L_DIGIT, H_DIGIT);
3235        if (q <= p) return q;
3236        if (Integer.parseInt(substring(p, q)) > 255) return p;
3237        return q;
3238    }
3239
3240    // Scan an IPv4 address.
3241    //
3242    // If the strict argument is true then we require that the given
3243    // interval contain nothing besides an IPv4 address; if it is false
3244    // then we only require that it start with an IPv4 address.
3245    //
3246    // If the interval does not contain or start with (depending upon the
3247    // strict argument) a legal IPv4 address characters then we return -1
3248    // immediately; otherwise we insist that these characters parse as a
3249    // legal IPv4 address and throw an exception on failure.
3250    //
3251    // We assume that any string of decimal digits and dots must be an IPv4
3252    // address.  It won't parse as a hostname anyway, so making that
3253    // assumption here allows more meaningful exceptions to be thrown.
3254    //
3255    private int scanIPv4Address(int start, int n, boolean strict)
3256        throws URISyntaxException  
3257    {
3258        int p = start;
3259        int q;
3260        int m = scan(p, n, L_DIGIT | L_DOT, H_DIGIT | H_DOT);
3261        if ((m <= p) || (strict && (m != n)))
3262        return -1;
3263        for (;;) {
3264        // Per RFC2732: At most three digits per byte
3265        // Further constraint: Each element fits in a byte
3266        if ((q = scanByte(p, m)) <= p) break;   p = q;
3267        if ((q = scan(p, m, '.')) <= p) break;  p = q;
3268        if ((q = scanByte(p, m)) <= p) break;   p = q;
3269        if ((q = scan(p, m, '.')) <= p) break;  p = q;
3270        if ((q = scanByte(p, m)) <= p) break;   p = q;
3271        if ((q = scan(p, m, '.')) <= p) break;  p = q;
3272        if ((q = scanByte(p, m)) <= p) break;   p = q;
3273        if (q < m) break;
3274        return q;
3275        }
3276        fail("Malformed IPv4 address", q);
3277        return -1;
3278    }
3279
3280    // Take an IPv4 address: Throw an exception if the given interval
3281    // contains anything except an IPv4 address
3282    //
3283    private int takeIPv4Address(int start, int n, String   expected)
3284        throws URISyntaxException  
3285    {
3286        int p = scanIPv4Address(start, n, true);
3287        if (p <= start)
3288        failExpecting(expected, start);
3289        return p;
3290    }
3291
3292    // Attempt to parse an IPv4 address, returning -1 on failure but
3293    // allowing the given interval to contain [:<characters>] after
3294    // the IPv4 address.
3295    //
3296    private int parseIPv4Address(int start, int n) {
3297        int p;
3298
3299        try {
3300            p = scanIPv4Address(start, n, false);
3301        } catch (URISyntaxException   x) {
3302        return -1;
3303            } catch (NumberFormatException   nfe) {
3304        return -1;
3305            }
3306
3307        if (p > start && p < n) {
3308            // IPv4 address is followed by something - check that
3309        // it's a ":" as this is the only valid character to
3310        // follow an address.
3311        if (charAt(p) != ':') {
3312            p = -1;
3313        }
3314        }
3315
3316        if (p > start)
3317        host = substring(start, p);
3318
3319        return p;
3320    }
3321
3322    // hostname      = domainlabel [ "." ] | 1*( domainlabel "." ) toplabel [ "." ] 
3323    // domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
3324        // toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
3325    //
3326    private int parseHostname(int start, int n)
3327        throws URISyntaxException  
3328    {
3329        int p = start;
3330        int q;
3331        int l = -1;         // Start of last parsed label
3332
3333        do {
3334        // domainlabel = alphanum [ *( alphanum | "-" ) alphanum ]
3335        q = scan(p, n, L_ALPHANUM, H_ALPHANUM);
3336        if (q <= p)
3337            break;
3338        l = p;
3339        if (q > p) {
3340            p = q;
3341            q = scan(p, n, L_ALPHANUM | L_DASH, H_ALPHANUM | H_DASH);
3342            if (q > p) {
3343            if (charAt(q - 1) == '-')
3344                fail("Illegal character in hostname", q - 1);
3345            p = q;
3346            }
3347        }
3348        q = scan(p, n, '.');
3349        if (q <= p)
3350            break;
3351        p = q;
3352        } while (p < n);
3353
3354        if ((p < n) && !at(p, n, ':'))
3355        fail("Illegal character in hostname", p);
3356
3357        if (l < 0)
3358        failExpecting("hostname", start);
3359
3360        // for a fully qualified hostname check that the rightmost
3361        // label starts with an alpha character.
3362        if (l > start && !match(charAt(l), L_ALPHA, H_ALPHA)) {
3363        fail("Illegal character in hostname", l);
3364        }
3365
3366        host = substring(start, p);
3367        return p;
3368    }
3369
3370
3371    // IPv6 address parsing, from RFC2373: IPv6 Addressing Architecture
3372    //
3373    // Bug: The grammar in RFC2373 Appendix B does not allow addresses of
3374    // the form ::12.34.56.78, which are clearly shown in the examples
3375    // earlier in the document.  Here is the original grammar:
3376    //
3377    //   IPv6address = hexpart [ ":" IPv4address ]
3378    //   hexpart     = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
3379    //   hexseq      = hex4 *( ":" hex4)
3380    //   hex4        = 1*4HEXDIG
3381    //
3382    // We therefore use the following revised grammar:
3383    //
3384    //   IPv6address = hexseq [ ":" IPv4address ]
3385    //                 | hexseq [ "::" [ hexpost ] ]
3386    //                 | "::" [ hexpost ]
3387    //   hexpost     = hexseq | hexseq ":" IPv4address | IPv4address
3388    //   hexseq      = hex4 *( ":" hex4)
3389    //   hex4        = 1*4HEXDIG
3390    //
3391    // This covers all and only the following cases:
3392    //
3393    //   hexseq
3394    //   hexseq : IPv4address
3395    //   hexseq ::
3396    //   hexseq :: hexseq
3397    //   hexseq :: hexseq : IPv4address
3398    //   hexseq :: IPv4address
3399    //   :: hexseq
3400    //   :: hexseq : IPv4address
3401    //   :: IPv4address
3402    //   ::
3403    //
3404    // Additionally we constrain the IPv6 address as follows :-
3405    //
3406    //  i.  IPv6 addresses without compressed zeros should contain
3407    //      exactly 16 bytes.
3408    //
3409    //  ii. IPv6 addresses with compressed zeros should contain
3410    //      less than 16 bytes.
3411
3412    private int ipv6byteCount = 0;
3413
3414    private int parseIPv6Reference(int start, int n)
3415        throws URISyntaxException  
3416    {
3417        int p = start;
3418        int q;
3419        boolean compressedZeros = false;
3420
3421        q = scanHexSeq(p, n);
3422
3423        if (q > p) {
3424        p = q;
3425        if (at(p, n, "::")) {
3426            compressedZeros = true;
3427            p = scanHexPost(p + 2, n);
3428        } else if (at(p, n, ':')) {
3429            p = takeIPv4Address(p + 1,  n, "IPv4 address");
3430            ipv6byteCount += 4;
3431        }
3432        } else if (at(p, n, "::")) {
3433        compressedZeros = true;
3434        p = scanHexPost(p + 2, n);
3435        }
3436        if (p < n)
3437        fail("Malformed IPv6 address", start);
3438        if (ipv6byteCount > 16)
3439        fail("IPv6 address too long", start);
3440        if (!compressedZeros && ipv6byteCount < 16) 
3441        fail("IPv6 address too short", start);
3442        if (compressedZeros && ipv6byteCount == 16)
3443        fail("Malformed IPv6 address", start);
3444
3445        return p;
3446    }
3447
3448    private int scanHexPost(int start, int n)
3449        throws URISyntaxException  
3450    {
3451        int p = start;
3452        int q;
3453
3454        if (p == n)
3455        return p;
3456
3457        q = scanHexSeq(p, n);
3458        if (q > p) {
3459        p = q;
3460        if (at(p, n, ':')) {
3461            p++;
3462            p = takeIPv4Address(p, n, "hex digits or IPv4 address");
3463            ipv6byteCount += 4;
3464        }
3465        } else {
3466        p = takeIPv4Address(p, n, "hex digits or IPv4 address");
3467        ipv6byteCount += 4;
3468        }
3469        return p;
3470    }
3471
3472    // Scan a hex sequence; return -1 if one could not be scanned
3473    //
3474    private int scanHexSeq(int start, int n)
3475        throws URISyntaxException  
3476    {
3477        int p = start;
3478        int q;
3479
3480        q = scan(p, n, L_HEX, H_HEX);
3481        if (q <= p)
3482        return -1;
3483        if (at(q, n, '.'))      // Beginning of IPv4 address
3484        return -1;
3485        if (q > p + 4)
3486                fail("IPv6 hexadecimal digit sequence too long", p);
3487        ipv6byteCount += 2;
3488        p = q;
3489        while (p < n) {
3490        if (!at(p, n, ':'))
3491            break;
3492        if (at(p + 1, n, ':'))
3493            break;      // "::"
3494        p++;
3495        q = scan(p, n, L_HEX, H_HEX);
3496        if (q <= p)
3497            failExpecting("digits for an IPv6 address", p);
3498        if (at(q, n, '.')) {    // Beginning of IPv4 address
3499            p--;
3500            break;
3501        }
3502        if (q > p + 4)
3503            fail("IPv6 hexadecimal digit sequence too long", p);
3504        ipv6byteCount += 2;
3505        p = q;
3506        }
3507
3508        return p;
3509    }
3510
3511    }
3512
3513}
3514
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Free Books Free Magazines
Popular Tags