1 /*
2  * Copyright (C) 2014 The Android Open Source Project
3  * Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved.
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This code is free software; you can redistribute it and/or modify it
7  * under the terms of the GNU General Public License version 2 only, as
8  * published by the Free Software Foundation.  Oracle designates this
9  * particular file as subject to the "Classpath" exception as provided
10  * by Oracle in the LICENSE file that accompanied this code.
11  *
12  * This code is distributed in the hope that it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15  * version 2 for more details (a copy is included in the LICENSE file that
16  * accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License version
19  * 2 along with this work; if not, write to the Free Software Foundation,
20  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
21  *
22  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
23  * or visit www.oracle.com if you need additional information or have any
24  * questions.
25  */
26 
27 package java.net;
28 
29 import java.io.IOException;
30 import java.io.InvalidObjectException;
31 import java.io.ObjectInputStream;
32 import java.io.ObjectOutputStream;
33 import java.io.Serializable;
34 import java.nio.ByteBuffer;
35 import java.nio.CharBuffer;
36 import java.nio.charset.CharsetDecoder;
37 import java.nio.charset.CoderResult;
38 import java.nio.charset.CodingErrorAction;
39 import java.nio.charset.CharacterCodingException;
40 import java.text.Normalizer;
41 import sun.nio.cs.ThreadLocalCoders;
42 
43 import java.lang.Character;             // for javadoc
44 import java.lang.NullPointerException;  // for javadoc
45 
46 
47 // Android-changed: Reformat @see links.
48 /**
49  * Represents a Uniform Resource Identifier (URI) reference.
50  *
51  * <p> Aside from some minor deviations noted below, an instance of this
52  * class represents a URI reference as defined by
53  * <a href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC&nbsp;2396: Uniform
54  * Resource Identifiers (URI): Generic Syntax</i></a>, amended by <a
55  * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC&nbsp;2732: Format for
56  * Literal IPv6 Addresses in URLs</i></a>. The Literal IPv6 address format
57  * also supports scope_ids. The syntax and usage of scope_ids is described
58  * <a href="Inet6Address.html#scoped">here</a>.
59  * This class provides constructors for creating URI instances from
60  * their components or by parsing their string forms, methods for accessing the
61  * various components of an instance, and methods for normalizing, resolving,
62  * and relativizing URI instances.  Instances of this class are immutable.
63  *
64  *
65  * <h3> URI syntax and components </h3>
66  *
67  * At the highest level a URI reference (hereinafter simply "URI") in string
68  * form has the syntax
69  *
70  * <blockquote>
71  * [<i>scheme</i><b>{@code :}</b>]<i>scheme-specific-part</i>[<b>{@code #}</b><i>fragment</i>]
72  * </blockquote>
73  *
74  * where square brackets [...] delineate optional components and the characters
75  * <b>{@code :}</b> and <b>{@code #}</b> stand for themselves.
76  *
77  * <p> An <i>absolute</i> URI specifies a scheme; a URI that is not absolute is
78  * said to be <i>relative</i>.  URIs are also classified according to whether
79  * they are <i>opaque</i> or <i>hierarchical</i>.
80  *
81  * <p> An <i>opaque</i> URI is an absolute URI whose scheme-specific part does
82  * not begin with a slash character ({@code '/'}).  Opaque URIs are not
83  * subject to further parsing.  Some examples of opaque URIs are:
84  *
85  * <blockquote><table cellpadding=0 cellspacing=0 summary="layout">
86  * <tr><td>{@code mailto:java-net@java.sun.com}</td></tr>
87  * <tr><td>{@code news:comp.lang.java}</td></tr>
88  * <tr><td>{@code urn:isbn:096139210x}</td></tr>
89  * </table></blockquote>
90  *
91  * <p> A <i>hierarchical</i> URI is either an absolute URI whose
92  * scheme-specific part begins with a slash character, or a relative URI, that
93  * is, a URI that does not specify a scheme.  Some examples of hierarchical
94  * URIs are:
95  *
96  * <blockquote>
97  * {@code http://java.sun.com/j2se/1.3/}<br>
98  * {@code docs/guide/collections/designfaq.html#28}<br>
99  * {@code ../../../demo/jfc/SwingSet2/src/SwingSet2.java}<br>
100  * {@code file:///~/calendar}
101  * </blockquote>
102  *
103  * <p> A hierarchical URI is subject to further parsing according to the syntax
104  *
105  * <blockquote>
106  * [<i>scheme</i><b>{@code :}</b>][<b>{@code //}</b><i>authority</i>][<i>path</i>][<b>{@code ?}</b><i>query</i>][<b>{@code #}</b><i>fragment</i>]
107  * </blockquote>
108  *
109  * where the characters <b>{@code :}</b>, <b>{@code /}</b>,
110  * <b>{@code ?}</b>, and <b>{@code #}</b> stand for themselves.  The
111  * scheme-specific part of a hierarchical URI consists of the characters
112  * between the scheme and fragment components.
113  *
114  * <p> The authority component of a hierarchical URI is, if specified, either
115  * <i>server-based</i> or <i>registry-based</i>.  A server-based authority
116  * parses according to the familiar syntax
117  *
118  * <blockquote>
119  * [<i>user-info</i><b>{@code @}</b>]<i>host</i>[<b>{@code :}</b><i>port</i>]
120  * </blockquote>
121  *
122  * where the characters <b>{@code @}</b> and <b>{@code :}</b> stand for
123  * themselves.  Nearly all URI schemes currently in use are server-based.  An
124  * authority component that does not parse in this way is considered to be
125  * registry-based.
126  *
127  * <p> The path component of a hierarchical URI is itself said to be absolute
128  * if it begins with a slash character ({@code '/'}); otherwise it is
129  * relative.  The path of a hierarchical URI that is either absolute or
130  * specifies an authority is always absolute.
131  *
132  * <p> All told, then, a URI instance has the following nine components:
133  *
134  * <blockquote><table summary="Describes the components of a URI:scheme,scheme-specific-part,authority,user-info,host,port,path,query,fragment">
135  * <tr><th><i>Component</i></th><th><i>Type</i></th></tr>
136  * <tr><td>scheme</td><td>{@code String}</td></tr>
137  * <tr><td>scheme-specific-part&nbsp;&nbsp;&nbsp;&nbsp;</td><td>{@code String}</td></tr>
138  * <tr><td>authority</td><td>{@code String}</td></tr>
139  * <tr><td>user-info</td><td>{@code String}</td></tr>
140  * <tr><td>host</td><td>{@code String}</td></tr>
141  * <tr><td>port</td><td>{@code int}</td></tr>
142  * <tr><td>path</td><td>{@code String}</td></tr>
143  * <tr><td>query</td><td>{@code String}</td></tr>
144  * <tr><td>fragment</td><td>{@code String}</td></tr>
145  * </table></blockquote>
146  *
147  * In a given instance any particular component is either <i>undefined</i> or
148  * <i>defined</i> with a distinct value.  Undefined string components are
149  * represented by {@code null}, while undefined integer components are
150  * represented by {@code -1}.  A string component may be defined to have the
151  * empty string as its value; this is not equivalent to that component being
152  * undefined.
153  *
154  * <p> Whether a particular component is or is not defined in an instance
155  * depends upon the type of the URI being represented.  An absolute URI has a
156  * scheme component.  An opaque URI has a scheme, a scheme-specific part, and
157  * possibly a fragment, but has no other components.  A hierarchical URI always
158  * has a path (though it may be empty) and a scheme-specific-part (which at
159  * least contains the path), and may have any of the other components.  If the
160  * authority component is present and is server-based then the host component
161  * will be defined and the user-information and port components may be defined.
162  *
163  *
164  * <h4> Operations on URI instances </h4>
165  *
166  * The key operations supported by this class are those of
167  * <i>normalization</i>, <i>resolution</i>, and <i>relativization</i>.
168  *
169  * <p> <i>Normalization</i> is the process of removing unnecessary {@code "."}
170  * and {@code ".."} segments from the path component of a hierarchical URI.
171  * Each {@code "."} segment is simply removed.  A {@code ".."} segment is
172  * removed only if it is preceded by a non-{@code ".."} segment.
173  * Normalization has no effect upon opaque URIs.
174  *
175  * <p> <i>Resolution</i> is the process of resolving one URI against another,
176  * <i>base</i> URI.  The resulting URI is constructed from components of both
177  * URIs in the manner specified by RFC&nbsp;2396, taking components from the
178  * base URI for those not specified in the original.  For hierarchical URIs,
179  * the path of the original is resolved against the path of the base and then
180  * normalized.  The result, for example, of resolving
181  *
182  * <blockquote>
183  * {@code docs/guide/collections/designfaq.html#28}
184  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
185  * &nbsp;&nbsp;&nbsp;&nbsp;(1)
186  * </blockquote>
187  *
188  * against the base URI {@code http://java.sun.com/j2se/1.3/} is the result
189  * URI
190  *
191  * <blockquote>
192  * {@code http://java.sun.com/j2se/1.3/docs/guide/collections/designfaq.html#28}
193  * </blockquote>
194  *
195  * Resolving the relative URI
196  *
197  * <blockquote>
198  * {@code ../../../demo/jfc/SwingSet2/src/SwingSet2.java}&nbsp;&nbsp;&nbsp;&nbsp;(2)
199  * </blockquote>
200  *
201  * against this result yields, in turn,
202  *
203  * <blockquote>
204  * {@code http://java.sun.com/j2se/1.3/demo/jfc/SwingSet2/src/SwingSet2.java}
205  * </blockquote>
206  *
207  * Resolution of both absolute and relative URIs, and of both absolute and
208  * relative paths in the case of hierarchical URIs, is supported.  Resolving
209  * the URI {@code file:///~/calendar} against any other URI simply yields the
210  * original URI, since it is absolute.  Resolving the relative URI (2) above
211  * against the relative base URI (1) yields the normalized, but still relative,
212  * URI
213  *
214  * <blockquote>
215  * {@code demo/jfc/SwingSet2/src/SwingSet2.java}
216  * </blockquote>
217  *
218  * <p> <i>Relativization</i>, finally, is the inverse of resolution: For any
219  * two normalized URIs <i>u</i> and&nbsp;<i>v</i>,
220  *
221  * <blockquote>
222  *   <i>u</i>{@code .relativize(}<i>u</i>{@code .resolve(}<i>v</i>{@code )).equals(}<i>v</i>{@code )}&nbsp;&nbsp;and<br>
223  *   <i>u</i>{@code .resolve(}<i>u</i>{@code .relativize(}<i>v</i>{@code )).equals(}<i>v</i>{@code )}&nbsp;&nbsp;.<br>
224  * </blockquote>
225  *
226  * This operation is often useful when constructing a document containing URIs
227  * that must be made relative to the base URI of the document wherever
228  * possible.  For example, relativizing the URI
229  *
230  * <blockquote>
231  * {@code http://java.sun.com/j2se/1.3/docs/guide/index.html}
232  * </blockquote>
233  *
234  * against the base URI
235  *
236  * <blockquote>
237  * {@code http://java.sun.com/j2se/1.3}
238  * </blockquote>
239  *
240  * yields the relative URI {@code docs/guide/index.html}.
241  *
242  *
243  * <h4> Character categories </h4>
244  *
245  * RFC&nbsp;2396 specifies precisely which characters are permitted in the
246  * various components of a URI reference.  The following categories, most of
247  * which are taken from that specification, are used below to describe these
248  * constraints:
249  *
250  * <blockquote><table cellspacing=2 summary="Describes categories alpha,digit,alphanum,unreserved,punct,reserved,escaped,and other">
251  *   <tr><th valign=top><i>alpha</i></th>
252  *       <td>The US-ASCII alphabetic characters,
253  *        {@code 'A'}&nbsp;through&nbsp;{@code 'Z'}
254  *        and {@code 'a'}&nbsp;through&nbsp;{@code 'z'}</td></tr>
255  *   <tr><th valign=top><i>digit</i></th>
256  *       <td>The US-ASCII decimal digit characters,
257  *       {@code '0'}&nbsp;through&nbsp;{@code '9'}</td></tr>
258  *   <tr><th valign=top><i>alphanum</i></th>
259  *       <td>All <i>alpha</i> and <i>digit</i> characters</td></tr>
260  *   <tr><th valign=top><i>unreserved</i>&nbsp;&nbsp;&nbsp;&nbsp;</th>
261  *       <td>All <i>alphanum</i> characters together with those in the string
262  *        {@code "_-!.~'()*"}</td></tr>
263  *   <tr><th valign=top><i>punct</i></th>
264  *       <td>The characters in the string {@code ",;:$&+="}</td></tr>
265  *   <tr><th valign=top><i>reserved</i></th>
266  *       <td>All <i>punct</i> characters together with those in the string
267  *        {@code "?/[]@"}</td></tr>
268  *   <tr><th valign=top><i>escaped</i></th>
269  *       <td>Escaped octets, that is, triplets consisting of the percent
270  *           character ({@code '%'}) followed by two hexadecimal digits
271  *           ({@code '0'}-{@code '9'}, {@code 'A'}-{@code 'F'}, and
272  *           {@code 'a'}-{@code 'f'})</td></tr>
273  *   <tr><th valign=top><i>other</i></th>
274  *       <td>The Unicode characters that are not in the US-ASCII character set,
275  *           are not control characters (according to the {@link
276  *           java.lang.Character#isISOControl(char) Character.isISOControl}
277  *           method), and are not space characters (according to the {@link
278  *           java.lang.Character#isSpaceChar(char) Character.isSpaceChar}
279  *           method)&nbsp;&nbsp;<i>(<b>Deviation from RFC 2396</b>, which is
280  *           limited to US-ASCII)</i></td></tr>
281  * </table></blockquote>
282  *
283  * <p><a name="legal-chars"></a> The set of all legal URI characters consists of
284  * the <i>unreserved</i>, <i>reserved</i>, <i>escaped</i>, and <i>other</i>
285  * characters.
286  *
287  *
288  * <h4> Escaped octets, quotation, encoding, and decoding </h4>
289  *
290  * RFC 2396 allows escaped octets to appear in the user-info, path, query, and
291  * fragment components.  Escaping serves two purposes in URIs:
292  *
293  * <ul>
294  *
295  *   <li><p> To <i>encode</i> non-US-ASCII characters when a URI is required to
296  *   conform strictly to RFC&nbsp;2396 by not containing any <i>other</i>
297  *   characters.  </p></li>
298  *
299  *   <li><p> To <i>quote</i> characters that are otherwise illegal in a
300  *   component.  The user-info, path, query, and fragment components differ
301  *   slightly in terms of which characters are considered legal and illegal.
302  *   </p></li>
303  *
304  * </ul>
305  *
306  * These purposes are served in this class by three related operations:
307  *
308  * <ul>
309  *
310  *   <li><p><a name="encode"></a> A character is <i>encoded</i> by replacing it
311  *   with the sequence of escaped octets that represent that character in the
312  *   UTF-8 character set.  The Euro currency symbol ({@code '\u005Cu20AC'}),
313  *   for example, is encoded as {@code "%E2%82%AC"}.  <i>(<b>Deviation from
314  *   RFC&nbsp;2396</b>, which does not specify any particular character
315  *   set.)</i> </p></li>
316  *
317  *   <li><p><a name="quote"></a> An illegal character is <i>quoted</i> simply by
318  *   encoding it.  The space character, for example, is quoted by replacing it
319  *   with {@code "%20"}.  UTF-8 contains US-ASCII, hence for US-ASCII
320  *   characters this transformation has exactly the effect required by
321  *   RFC&nbsp;2396. </p></li>
322  *
323  *   <li><p><a name="decode"></a>
324  *   A sequence of escaped octets is <i>decoded</i> by
325  *   replacing it with the sequence of characters that it represents in the
326  *   UTF-8 character set.  UTF-8 contains US-ASCII, hence decoding has the
327  *   effect of de-quoting any quoted US-ASCII characters as well as that of
328  *   decoding any encoded non-US-ASCII characters.  If a <a
329  *   href="../nio/charset/CharsetDecoder.html#ce">decoding error</a> occurs
330  *   when decoding the escaped octets then the erroneous octets are replaced by
331  *   {@code '\u005CuFFFD'}, the Unicode replacement character.  </p></li>
332  *
333  * </ul>
334  *
335  * These operations are exposed in the constructors and methods of this class
336  * as follows:
337  *
338  * <ul>
339  *
340  *   <li><p> The {@linkplain #URI(java.lang.String) single-argument
341  *   constructor} requires any illegal characters in its argument to be
342  *   quoted and preserves any escaped octets and <i>other</i> characters that
343  *   are present.  </p></li>
344  *
345  *   <li><p> The {@linkplain
346  *   #URI(java.lang.String,java.lang.String,java.lang.String,int,java.lang.String,java.lang.String,java.lang.String)
347  *   multi-argument constructors} quote illegal characters as
348  *   required by the components in which they appear.  The percent character
349  *   ({@code '%'}) is always quoted by these constructors.  Any <i>other</i>
350  *   characters are preserved.  </p></li>
351  *
352  *   <li><p> The {@link #getRawUserInfo() getRawUserInfo}, {@link #getRawPath()
353  *   getRawPath}, {@link #getRawQuery() getRawQuery}, {@link #getRawFragment()
354  *   getRawFragment}, {@link #getRawAuthority() getRawAuthority}, and {@link
355  *   #getRawSchemeSpecificPart() getRawSchemeSpecificPart} methods return the
356  *   values of their corresponding components in raw form, without interpreting
357  *   any escaped octets.  The strings returned by these methods may contain
358  *   both escaped octets and <i>other</i> characters, and will not contain any
359  *   illegal characters.  </p></li>
360  *
361  *   <li><p> The {@link #getUserInfo() getUserInfo}, {@link #getPath()
362  *   getPath}, {@link #getQuery() getQuery}, {@link #getFragment()
363  *   getFragment}, {@link #getAuthority() getAuthority}, and {@link
364  *   #getSchemeSpecificPart() getSchemeSpecificPart} methods decode any escaped
365  *   octets in their corresponding components.  The strings returned by these
366  *   methods may contain both <i>other</i> characters and illegal characters,
367  *   and will not contain any escaped octets.  </p></li>
368  *
369  *   <li><p> The {@link #toString() toString} method returns a URI string with
370  *   all necessary quotation but which may contain <i>other</i> characters.
371  *   </p></li>
372  *
373  *   <li><p> The {@link #toASCIIString() toASCIIString} method returns a fully
374  *   quoted and encoded URI string that does not contain any <i>other</i>
375  *   characters.  </p></li>
376  *
377  * </ul>
378  *
379  *
380  * <h4> Identities </h4>
381  *
382  * For any URI <i>u</i>, it is always the case that
383  *
384  * <blockquote>
385  * {@code new URI(}<i>u</i>{@code .toString()).equals(}<i>u</i>{@code )}&nbsp;.
386  * </blockquote>
387  *
388  * For any URI <i>u</i> that does not contain redundant syntax such as two
389  * slashes before an empty authority (as in {@code file:///tmp/}&nbsp;) or a
390  * colon following a host name but no port (as in
391  * {@code http://java.sun.com:}&nbsp;), and that does not encode characters
392  * except those that must be quoted, the following identities also hold:
393  * <pre>
394  *     new URI(<i>u</i>.getScheme(),
395  *             <i>u</i>.getSchemeSpecificPart(),
396  *             <i>u</i>.getFragment())
397  *     .equals(<i>u</i>)</pre>
398  * in all cases,
399  * <pre>
400  *     new URI(<i>u</i>.getScheme(),
401  *             <i>u</i>.getUserInfo(), <i>u</i>.getAuthority(),
402  *             <i>u</i>.getPath(), <i>u</i>.getQuery(),
403  *             <i>u</i>.getFragment())
404  *     .equals(<i>u</i>)</pre>
405  * if <i>u</i> is hierarchical, and
406  * <pre>
407  *     new URI(<i>u</i>.getScheme(),
408  *             <i>u</i>.getUserInfo(), <i>u</i>.getHost(), <i>u</i>.getPort(),
409  *             <i>u</i>.getPath(), <i>u</i>.getQuery(),
410  *             <i>u</i>.getFragment())
411  *     .equals(<i>u</i>)</pre>
412  * if <i>u</i> is hierarchical and has either no authority or a server-based
413  * authority.
414  *
415  *
416  * <h4> URIs, URLs, and URNs </h4>
417  *
418  * A URI is a uniform resource <i>identifier</i> while a URL is a uniform
419  * resource <i>locator</i>.  Hence every URL is a URI, abstractly speaking, but
420  * not every URI is a URL.  This is because there is another subcategory of
421  * URIs, uniform resource <i>names</i> (URNs), which name resources but do not
422  * specify how to locate them.  The {@code mailto}, {@code news}, and
423  * {@code isbn} URIs shown above are examples of URNs.
424  *
425  * <p> The conceptual distinction between URIs and URLs is reflected in the
426  * differences between this class and the {@link URL} class.
427  *
428  * <p> An instance of this class represents a URI reference in the syntactic
429  * sense defined by RFC&nbsp;2396.  A URI may be either absolute or relative.
430  * A URI string is parsed according to the generic syntax without regard to the
431  * scheme, if any, that it specifies.  No lookup of the host, if any, is
432  * performed, and no scheme-dependent stream handler is constructed.  Equality,
433  * hashing, and comparison are defined strictly in terms of the character
434  * content of the instance.  In other words, a URI instance is little more than
435  * a structured string that supports the syntactic, scheme-independent
436  * operations of comparison, normalization, resolution, and relativization.
437  *
438  * <p> An instance of the {@link URL} class, by contrast, represents the
439  * syntactic components of a URL together with some of the information required
440  * to access the resource that it describes.  A URL must be absolute, that is,
441  * it must always specify a scheme.  A URL string is parsed according to its
442  * scheme.  A stream handler is always established for a URL, and in fact it is
443  * impossible to create a URL instance for a scheme for which no handler is
444  * available.  Equality and hashing depend upon both the scheme and the
445  * Internet address of the host, if any; comparison is not defined.  In other
446  * words, a URL is a structured string that supports the syntactic operation of
447  * resolution as well as the network I/O operations of looking up the host and
448  * opening a connection to the specified resource.
449  *
450  *
451  * @author Mark Reinhold
452  * @since 1.4
453  *
454  * @see <a href="http://www.ietf.org/rfc/rfc2279.txt">RFC&nbsp;2279: UTF-8, a transformation format of ISO 10646</a>
455  * @see <a href="http://www.ietf.org/rfc/rfc2373.txt">RFC&nbsp;2373: IPv6 Addressing Architecture</a>
456  * @see <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396: Uniform Resource Identifiers (URI): Generic Syntax</a>
457  * @see <a href="http://www.ietf.org/rfc/rfc2732.txt">RFC&nbsp;2732: Format for Literal IPv6 Addresses in URLs</a>
458  */
459 
460 public final class URI
461     implements Comparable<URI>, Serializable
462 {
463 
464     // Note: Comments containing the word "ASSERT" indicate places where a
465     // throw of an InternalError should be replaced by an appropriate assertion
466     // statement once asserts are enabled in the build.
467 
468     static final long serialVersionUID = -6052424284110960213L;
469 
470 
471     // -- Properties and components of this instance --
472 
473     // Components of all URIs: [<scheme>:]<scheme-specific-part>[#<fragment>]
474     private transient String scheme;            // null ==> relative URI
475     private transient String fragment;
476 
477     // Hierarchical URI components: [//<authority>]<path>[?<query>]
478     private transient String authority;         // Registry or server
479 
480     // Server-based authority: [<userInfo>@]<host>[:<port>]
481     private transient String userInfo;
482     private transient String host;              // null ==> registry-based
483     private transient int port = -1;            // -1 ==> undefined
484 
485     // Remaining components of hierarchical URIs
486     private transient String path;              // null ==> opaque
487     private transient String query;
488 
489     // The remaining fields may be computed on demand
490 
491     private volatile transient String schemeSpecificPart;
492     private volatile transient int hash;        // Zero ==> undefined
493 
494     private volatile transient String decodedUserInfo = null;
495     private volatile transient String decodedAuthority = null;
496     private volatile transient String decodedPath = null;
497     private volatile transient String decodedQuery = null;
498     private volatile transient String decodedFragment = null;
499     private volatile transient String decodedSchemeSpecificPart = null;
500 
501     /**
502      * The string form of this URI.
503      *
504      * @serial
505      */
506     private volatile String string;             // The only serializable field
507 
508 
509 
510     // -- Constructors and factories --
511 
URI()512     private URI() { }                           // Used internally
513 
514     /**
515      * Constructs a URI by parsing the given string.
516      *
517      * <p> This constructor parses the given string exactly as specified by the
518      * grammar in <a
519      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
520      * Appendix&nbsp;A, <b><i>except for the following deviations:</i></b> </p>
521      *
522      * <ul>
523      *
524      *   <li><p> An empty authority component is permitted as long as it is
525      *   followed by a non-empty path, a query component, or a fragment
526      *   component.  This allows the parsing of URIs such as
527      *   {@code "file:///foo/bar"}, which seems to be the intent of
528      *   RFC&nbsp;2396 although the grammar does not permit it.  If the
529      *   authority component is empty then the user-information, host, and port
530      *   components are undefined. </p></li>
531      *
532      *   <li><p> Empty relative paths are permitted; this seems to be the
533      *   intent of RFC&nbsp;2396 although the grammar does not permit it.  The
534      *   primary consequence of this deviation is that a standalone fragment
535      *   such as {@code "#foo"} parses as a relative URI with an empty path
536      *   and the given fragment, and can be usefully <a
537      *   href="#resolve-frag">resolved</a> against a base URI. </p></li>
538      *
539      *   <li><p> IPv4 addresses in host components are parsed rigorously, as
540      *   specified by <a
541      *   href="http://www.ietf.org/rfc/rfc2732.txt">RFC&nbsp;2732</a>: Each
542      *   element of a dotted-quad address must contain no more than three
543      *   decimal digits.  Each element is further constrained to have a value
544      *   no greater than 255. </p></li>
545      *
546      *   <li> <p> Hostnames in host components that comprise only a single
547      *   domain label are permitted to start with an <i>alphanum</i>
548      *   character. This seems to be the intent of <a
549      *   href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>
550      *   section&nbsp;3.2.2 although the grammar does not permit it. The
551      *   consequence of this deviation is that the authority component of a
552      *   hierarchical URI such as {@code s://123}, will parse as a server-based
553      *   authority. </p></li>
554      *
555      *   <li><p> IPv6 addresses are permitted for the host component.  An IPv6
556      *   address must be enclosed in square brackets ({@code '['} and
557      *   {@code ']'}) as specified by <a
558      *   href="http://www.ietf.org/rfc/rfc2732.txt">RFC&nbsp;2732</a>.  The
559      *   IPv6 address itself must parse according to <a
560      *   href="http://www.ietf.org/rfc/rfc2373.txt">RFC&nbsp;2373</a>.  IPv6
561      *   addresses are further constrained to describe no more than sixteen
562      *   bytes of address information, a constraint implicit in RFC&nbsp;2373
563      *   but not expressible in the grammar. </p></li>
564      *
565      *   <li><p> Characters in the <i>other</i> category are permitted wherever
566      *   RFC&nbsp;2396 permits <i>escaped</i> octets, that is, in the
567      *   user-information, path, query, and fragment components, as well as in
568      *   the authority component if the authority is registry-based.  This
569      *   allows URIs to contain Unicode characters beyond those in the US-ASCII
570      *   character set. </p></li>
571      *
572      * </ul>
573      *
574      * @param  str   The string to be parsed into a URI
575      *
576      * @throws  NullPointerException
577      *          If {@code str} is {@code null}
578      *
579      * @throws  URISyntaxException
580      *          If the given string violates RFC&nbsp;2396, as augmented
581      *          by the above deviations
582      */
URI(String str)583     public URI(String str) throws URISyntaxException {
584         new Parser(str).parse(false);
585     }
586 
587     /**
588      * Constructs a hierarchical URI from the given components.
589      *
590      * <p> If a scheme is given then the path, if also given, must either be
591      * empty or begin with a slash character ({@code '/'}).  Otherwise a
592      * component of the new URI may be left undefined by passing {@code null}
593      * for the corresponding parameter or, in the case of the {@code port}
594      * parameter, by passing {@code -1}.
595      *
596      * <p> This constructor first builds a URI string from the given components
597      * according to the rules specified in <a
598      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
599      * section&nbsp;5.2, step&nbsp;7: </p>
600      *
601      * <ol>
602      *
603      *   <li><p> Initially, the result string is empty. </p></li>
604      *
605      *   <li><p> If a scheme is given then it is appended to the result,
606      *   followed by a colon character ({@code ':'}).  </p></li>
607      *
608      *   <li><p> If user information, a host, or a port are given then the
609      *   string {@code "//"} is appended.  </p></li>
610      *
611      *   <li><p> If user information is given then it is appended, followed by
612      *   a commercial-at character ({@code '@'}).  Any character not in the
613      *   <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
614      *   categories is <a href="#quote">quoted</a>.  </p></li>
615      *
616      *   <li><p> If a host is given then it is appended.  If the host is a
617      *   literal IPv6 address but is not enclosed in square brackets
618      *   ({@code '['} and {@code ']'}) then the square brackets are added.
619      *   </p></li>
620      *
621      *   <li><p> If a port number is given then a colon character
622      *   ({@code ':'}) is appended, followed by the port number in decimal.
623      *   </p></li>
624      *
625      *   <li><p> If a path is given then it is appended.  Any character not in
626      *   the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
627      *   categories, and not equal to the slash character ({@code '/'}) or the
628      *   commercial-at character ({@code '@'}), is quoted.  </p></li>
629      *
630      *   <li><p> If a query is given then a question-mark character
631      *   ({@code '?'}) is appended, followed by the query.  Any character that
632      *   is not a <a href="#legal-chars">legal URI character</a> is quoted.
633      *   </p></li>
634      *
635      *   <li><p> Finally, if a fragment is given then a hash character
636      *   ({@code '#'}) is appended, followed by the fragment.  Any character
637      *   that is not a legal URI character is quoted.  </p></li>
638      *
639      * </ol>
640      *
641      * <p> The resulting URI string is then parsed as if by invoking the {@link
642      * #URI(String)} constructor and then invoking the {@link
643      * #parseServerAuthority()} method upon the result; this may cause a {@link
644      * URISyntaxException} to be thrown.  </p>
645      *
646      * @param   scheme    Scheme name
647      * @param   userInfo  User name and authorization information
648      * @param   host      Host name
649      * @param   port      Port number
650      * @param   path      Path
651      * @param   query     Query
652      * @param   fragment  Fragment
653      *
654      * @throws URISyntaxException
655      *         If both a scheme and a path are given but the path is relative,
656      *         if the URI string constructed from the given components violates
657      *         RFC&nbsp;2396, or if the authority component of the string is
658      *         present but cannot be parsed as a server-based authority
659      */
URI(String scheme, String userInfo, String host, int port, String path, String query, String fragment)660     public URI(String scheme,
661                String userInfo, String host, int port,
662                String path, String query, String fragment)
663         throws URISyntaxException
664     {
665         String s = toString(scheme, null,
666                             null, userInfo, host, port,
667                             path, query, fragment);
668         checkPath(s, scheme, path);
669         new Parser(s).parse(true);
670     }
671 
672     /**
673      * Constructs a hierarchical URI from the given components.
674      *
675      * <p> If a scheme is given then the path, if also given, must either be
676      * empty or begin with a slash character ({@code '/'}).  Otherwise a
677      * component of the new URI may be left undefined by passing {@code null}
678      * for the corresponding parameter.
679      *
680      * <p> This constructor first builds a URI string from the given components
681      * according to the rules specified in <a
682      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
683      * section&nbsp;5.2, step&nbsp;7: </p>
684      *
685      * <ol>
686      *
687      *   <li><p> Initially, the result string is empty.  </p></li>
688      *
689      *   <li><p> If a scheme is given then it is appended to the result,
690      *   followed by a colon character ({@code ':'}).  </p></li>
691      *
692      *   <li><p> If an authority is given then the string {@code "//"} is
693      *   appended, followed by the authority.  If the authority contains a
694      *   literal IPv6 address then the address must be enclosed in square
695      *   brackets ({@code '['} and {@code ']'}).  Any character not in the
696      *   <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
697      *   categories, and not equal to the commercial-at character
698      *   ({@code '@'}), is <a href="#quote">quoted</a>.  </p></li>
699      *
700      *   <li><p> If a path is given then it is appended.  Any character not in
701      *   the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
702      *   categories, and not equal to the slash character ({@code '/'}) or the
703      *   commercial-at character ({@code '@'}), is quoted.  </p></li>
704      *
705      *   <li><p> If a query is given then a question-mark character
706      *   ({@code '?'}) is appended, followed by the query.  Any character that
707      *   is not a <a href="#legal-chars">legal URI character</a> is quoted.
708      *   </p></li>
709      *
710      *   <li><p> Finally, if a fragment is given then a hash character
711      *   ({@code '#'}) is appended, followed by the fragment.  Any character
712      *   that is not a legal URI character is quoted.  </p></li>
713      *
714      * </ol>
715      *
716      * <p> The resulting URI string is then parsed as if by invoking the {@link
717      * #URI(String)} constructor and then invoking the {@link
718      * #parseServerAuthority()} method upon the result; this may cause a {@link
719      * URISyntaxException} to be thrown.  </p>
720      *
721      * @param   scheme     Scheme name
722      * @param   authority  Authority
723      * @param   path       Path
724      * @param   query      Query
725      * @param   fragment   Fragment
726      *
727      * @throws URISyntaxException
728      *         If both a scheme and a path are given but the path is relative,
729      *         if the URI string constructed from the given components violates
730      *         RFC&nbsp;2396, or if the authority component of the string is
731      *         present but cannot be parsed as a server-based authority
732      */
URI(String scheme, String authority, String path, String query, String fragment)733     public URI(String scheme,
734                String authority,
735                String path, String query, String fragment)
736         throws URISyntaxException
737     {
738         String s = toString(scheme, null,
739                             authority, null, null, -1,
740                             path, query, fragment);
741         checkPath(s, scheme, path);
742         new Parser(s).parse(false);
743     }
744 
745     /**
746      * Constructs a hierarchical URI from the given components.
747      *
748      * <p> A component may be left undefined by passing {@code null}.
749      *
750      * <p> This convenience constructor works as if by invoking the
751      * seven-argument constructor as follows:
752      *
753      * <blockquote>
754      * {@code new} {@link #URI(String, String, String, int, String, String, String)
755      * URI}{@code (scheme, null, host, -1, path, null, fragment);}
756      * </blockquote>
757      *
758      * @param   scheme    Scheme name
759      * @param   host      Host name
760      * @param   path      Path
761      * @param   fragment  Fragment
762      *
763      * @throws  URISyntaxException
764      *          If the URI string constructed from the given components
765      *          violates RFC&nbsp;2396
766      */
URI(String scheme, String host, String path, String fragment)767     public URI(String scheme, String host, String path, String fragment)
768         throws URISyntaxException
769     {
770         this(scheme, null, host, -1, path, null, fragment);
771     }
772 
773     /**
774      * Constructs a URI from the given components.
775      *
776      * <p> A component may be left undefined by passing {@code null}.
777      *
778      * <p> This constructor first builds a URI in string form using the given
779      * components as follows:  </p>
780      *
781      * <ol>
782      *
783      *   <li><p> Initially, the result string is empty.  </p></li>
784      *
785      *   <li><p> If a scheme is given then it is appended to the result,
786      *   followed by a colon character ({@code ':'}).  </p></li>
787      *
788      *   <li><p> If a scheme-specific part is given then it is appended.  Any
789      *   character that is not a <a href="#legal-chars">legal URI character</a>
790      *   is <a href="#quote">quoted</a>.  </p></li>
791      *
792      *   <li><p> Finally, if a fragment is given then a hash character
793      *   ({@code '#'}) is appended to the string, followed by the fragment.
794      *   Any character that is not a legal URI character is quoted.  </p></li>
795      *
796      * </ol>
797      *
798      * <p> The resulting URI string is then parsed in order to create the new
799      * URI instance as if by invoking the {@link #URI(String)} constructor;
800      * this may cause a {@link URISyntaxException} to be thrown.  </p>
801      *
802      * @param   scheme    Scheme name
803      * @param   ssp       Scheme-specific part
804      * @param   fragment  Fragment
805      *
806      * @throws  URISyntaxException
807      *          If the URI string constructed from the given components
808      *          violates RFC&nbsp;2396
809      */
URI(String scheme, String ssp, String fragment)810     public URI(String scheme, String ssp, String fragment)
811         throws URISyntaxException
812     {
813         new Parser(toString(scheme, ssp,
814                             null, null, null, -1,
815                             null, null, fragment))
816             .parse(false);
817     }
818 
819     /**
820      * Creates a URI by parsing the given string.
821      *
822      * <p> This convenience factory method works as if by invoking the {@link
823      * #URI(String)} constructor; any {@link URISyntaxException} thrown by the
824      * constructor is caught and wrapped in a new {@link
825      * IllegalArgumentException} object, which is then thrown.
826      *
     * <p> This method is provided for use in situations where it is known that
     * the given string is a legal URI, for example for URI constants declared
     * within a program, and so it would be considered a programming error
     * for the string not to parse as such.  The constructors, which throw
     * {@link URISyntaxException} directly, should be used in situations where
     * a URI is being constructed from user input or from some other source
     * that may be prone to errors.  </p>
834      *
835      * @param  str   The string to be parsed into a URI
836      * @return The new URI
837      *
838      * @throws  NullPointerException
839      *          If {@code str} is {@code null}
840      *
841      * @throws  IllegalArgumentException
842      *          If the given string violates RFC&nbsp;2396
843      */
create(String str)844     public static URI create(String str) {
845         try {
846             return new URI(str);
847         } catch (URISyntaxException x) {
848             throw new IllegalArgumentException(x.getMessage(), x);
849         }
850     }
851 
852 
853     // -- Operations --
854 
855     /**
856      * Attempts to parse this URI's authority component, if defined, into
857      * user-information, host, and port components.
858      *
859      * <p> If this URI's authority component has already been recognized as
860      * being server-based then it will already have been parsed into
861      * user-information, host, and port components.  In this case, or if this
862      * URI has no authority component, this method simply returns this URI.
863      *
864      * <p> Otherwise this method attempts once more to parse the authority
865      * component into user-information, host, and port components, and throws
866      * an exception describing why the authority component could not be parsed
867      * in that way.
868      *
869      * <p> This method is provided because the generic URI syntax specified in
870      * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>
871      * cannot always distinguish a malformed server-based authority from a
872      * legitimate registry-based authority.  It must therefore treat some
873      * instances of the former as instances of the latter.  The authority
874      * component in the URI string {@code "//foo:bar"}, for example, is not a
875      * legal server-based authority but it is legal as a registry-based
876      * authority.
877      *
     * <p> In many common situations, for example when working with URIs that
     * are known to be either URNs or URLs, the hierarchical URIs being used
     * will always be server-based.  They therefore must either be parsed as
     * such or treated as an error.  In these cases a statement such as
882      *
883      * <blockquote>
884      * {@code URI }<i>u</i>{@code  = new URI(str).parseServerAuthority();}
885      * </blockquote>
886      *
887      * <p> can be used to ensure that <i>u</i> always refers to a URI that, if
888      * it has an authority component, has a server-based authority with proper
889      * user-information, host, and port components.  Invoking this method also
890      * ensures that if the authority could not be parsed in that way then an
891      * appropriate diagnostic message can be issued based upon the exception
892      * that is thrown. </p>
893      *
894      * @return  A URI whose authority field has been parsed
895      *          as a server-based authority
896      *
897      * @throws  URISyntaxException
898      *          If the authority component of this URI is defined
899      *          but cannot be parsed as a server-based authority
900      *          according to RFC&nbsp;2396
901      */
parseServerAuthority()902     public URI parseServerAuthority()
903         throws URISyntaxException
904     {
905         // We could be clever and cache the error message and index from the
906         // exception thrown during the original parse, but that would require
907         // either more fields or a more-obscure representation.
908         if ((host != null) || (authority == null))
909             return this;
910         defineString();
911         new Parser(string).parse(true);
912         return this;
913     }
914 
915     /**
916      * Normalizes this URI's path.
917      *
918      * <p> If this URI is opaque, or if its path is already in normal form,
919      * then this URI is returned.  Otherwise a new URI is constructed that is
920      * identical to this URI except that its path is computed by normalizing
921      * this URI's path in a manner consistent with <a
922      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
923      * section&nbsp;5.2, step&nbsp;6, sub-steps&nbsp;c through&nbsp;f; that is:
924      * </p>
925      *
926      * <ol>
927      *
928      *   <li><p> All {@code "."} segments are removed. </p></li>
929      *
930      *   <li><p> If a {@code ".."} segment is preceded by a non-{@code ".."}
931      *   segment then both of these segments are removed.  This step is
932      *   repeated until it is no longer applicable. </p></li>
933      *
934      *   <li><p> If the path is relative, and if its first segment contains a
935      *   colon character ({@code ':'}), then a {@code "."} segment is
936      *   prepended.  This prevents a relative URI with a path such as
937      *   {@code "a:b/c/d"} from later being re-parsed as an opaque URI with a
938      *   scheme of {@code "a"} and a scheme-specific part of {@code "b/c/d"}.
939      *   <b><i>(Deviation from RFC&nbsp;2396)</i></b> </p></li>
940      *
941      * </ol>
942      *
943      * <p> A normalized path will begin with one or more {@code ".."} segments
944      * if there were insufficient non-{@code ".."} segments preceding them to
945      * allow their removal.  A normalized path will begin with a {@code "."}
946      * segment if one was inserted by step 3 above.  Otherwise, a normalized
947      * path will not contain any {@code "."} or {@code ".."} segments. </p>
948      *
949      * @return  A URI equivalent to this URI,
950      *          but whose path is in normal form
951      */
normalize()952     public URI normalize() {
953         return normalize(this);
954     }
955 
956     /**
957      * Resolves the given URI against this URI.
958      *
959      * <p> If the given URI is already absolute, or if this URI is opaque, then
960      * the given URI is returned.
961      *
962      * <p><a name="resolve-frag"></a> If the given URI's fragment component is
963      * defined, its path component is empty, and its scheme, authority, and
964      * query components are undefined, then a URI with the given fragment but
965      * with all other components equal to those of this URI is returned.  This
966      * allows a URI representing a standalone fragment reference, such as
967      * {@code "#foo"}, to be usefully resolved against a base URI.
968      *
969      * <p> Otherwise this method constructs a new hierarchical URI in a manner
970      * consistent with <a
971      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
972      * section&nbsp;5.2; that is: </p>
973      *
974      * <ol>
975      *
976      *   <li><p> A new URI is constructed with this URI's scheme and the given
977      *   URI's query and fragment components. </p></li>
978      *
979      *   <li><p> If the given URI has an authority component then the new URI's
980      *   authority and path are taken from the given URI. </p></li>
981      *
982      *   <li><p> Otherwise the new URI's authority component is copied from
983      *   this URI, and its path is computed as follows: </p>
984      *
985      *   <ol>
986      *
987      *     <li><p> If the given URI's path is absolute then the new URI's path
988      *     is taken from the given URI. </p></li>
989      *
990      *     <li><p> Otherwise the given URI's path is relative, and so the new
991      *     URI's path is computed by resolving the path of the given URI
992      *     against the path of this URI.  This is done by concatenating all but
993      *     the last segment of this URI's path, if any, with the given URI's
994      *     path and then normalizing the result as if by invoking the {@link
995      *     #normalize() normalize} method. </p></li>
996      *
997      *   </ol></li>
998      *
999      * </ol>
1000      *
1001      * <p> The result of this method is absolute if, and only if, either this
1002      * URI is absolute or the given URI is absolute.  </p>
1003      *
1004      * @param  uri  The URI to be resolved against this URI
1005      * @return The resulting URI
1006      *
1007      * @throws  NullPointerException
1008      *          If {@code uri} is {@code null}
1009      */
resolve(URI uri)1010     public URI resolve(URI uri) {
1011         return resolve(this, uri);
1012     }
1013 
1014     /**
1015      * Constructs a new URI by parsing the given string and then resolving it
1016      * against this URI.
1017      *
1018      * <p> This convenience method works as if invoking it were equivalent to
1019      * evaluating the expression {@link #resolve(java.net.URI)
1020      * resolve}{@code (URI.}{@link #create(String) create}{@code (str))}. </p>
1021      *
1022      * @param  str   The string to be parsed into a URI
1023      * @return The resulting URI
1024      *
1025      * @throws  NullPointerException
1026      *          If {@code str} is {@code null}
1027      *
1028      * @throws  IllegalArgumentException
1029      *          If the given string violates RFC&nbsp;2396
1030      */
resolve(String str)1031     public URI resolve(String str) {
1032         return resolve(URI.create(str));
1033     }
1034 
1035     /**
1036      * Relativizes the given URI against this URI.
1037      *
1038      * <p> The relativization of the given URI against this URI is computed as
1039      * follows: </p>
1040      *
1041      * <ol>
1042      *
1043      *   <li><p> If either this URI or the given URI are opaque, or if the
1044      *   scheme and authority components of the two URIs are not identical, or
1045      *   if the path of this URI is not a prefix of the path of the given URI,
1046      *   then the given URI is returned. </p></li>
1047      *
1048      *   <li><p> Otherwise a new relative hierarchical URI is constructed with
1049      *   query and fragment components taken from the given URI and with a path
1050      *   component computed by removing this URI's path from the beginning of
1051      *   the given URI's path. </p></li>
1052      *
1053      * </ol>
1054      *
1055      * @param  uri  The URI to be relativized against this URI
1056      * @return The resulting URI
1057      *
1058      * @throws  NullPointerException
1059      *          If {@code uri} is {@code null}
1060      */
relativize(URI uri)1061     public URI relativize(URI uri) {
1062         return relativize(this, uri);
1063     }
1064 
1065     /**
1066      * Constructs a URL from this URI.
1067      *
1068      * <p> This convenience method works as if invoking it were equivalent to
1069      * evaluating the expression {@code new URL(this.toString())} after
1070      * first checking that this URI is absolute. </p>
1071      *
1072      * @return  A URL constructed from this URI
1073      *
1074      * @throws  IllegalArgumentException
     *          If this URI is not absolute
1076      *
1077      * @throws  MalformedURLException
1078      *          If a protocol handler for the URL could not be found,
1079      *          or if some other error occurred while constructing the URL
1080      */
toURL()1081     public URL toURL()
1082         throws MalformedURLException {
1083         if (!isAbsolute())
1084             throw new IllegalArgumentException("URI is not absolute");
1085         return new URL(toString());
1086     }
1087 
1088     // -- Component access methods --
1089 
1090     /**
1091      * Returns the scheme component of this URI.
1092      *
     * <p> The scheme component of a URI, if defined, only contains characters
     * in the <i>alphanum</i> category and in the string {@code "-.+"}.  A
     * scheme always starts with an <i>alpha</i> character. </p>
     *
     * <p> The scheme component of a URI cannot contain escaped octets, hence
     * this method does not perform any decoding. </p>
1099      *
1100      * @return  The scheme component of this URI,
1101      *          or {@code null} if the scheme is undefined
1102      */
getScheme()1103     public String getScheme() {
1104         return scheme;
1105     }
1106 
1107     /**
1108      * Tells whether or not this URI is absolute.
1109      *
1110      * <p> A URI is absolute if, and only if, it has a scheme component. </p>
1111      *
1112      * @return  {@code true} if, and only if, this URI is absolute
1113      */
isAbsolute()1114     public boolean isAbsolute() {
1115         return scheme != null;
1116     }
1117 
1118     /**
1119      * Tells whether or not this URI is opaque.
1120      *
1121      * <p> A URI is opaque if, and only if, it is absolute and its
1122      * scheme-specific part does not begin with a slash character ('/').
1123      * An opaque URI has a scheme, a scheme-specific part, and possibly
1124      * a fragment; all other components are undefined. </p>
1125      *
1126      * @return  {@code true} if, and only if, this URI is opaque
1127      */
isOpaque()1128     public boolean isOpaque() {
1129         return path == null;
1130     }
1131 
1132     /**
1133      * Returns the raw scheme-specific part of this URI.  The scheme-specific
1134      * part is never undefined, though it may be empty.
1135      *
1136      * <p> The scheme-specific part of a URI only contains legal URI
1137      * characters. </p>
1138      *
1139      * @return  The raw scheme-specific part of this URI
1140      *          (never {@code null})
1141      */
getRawSchemeSpecificPart()1142     public String getRawSchemeSpecificPart() {
1143         defineSchemeSpecificPart();
1144         return schemeSpecificPart;
1145     }
1146 
1147     /**
1148      * Returns the decoded scheme-specific part of this URI.
1149      *
1150      * <p> The string returned by this method is equal to that returned by the
1151      * {@link #getRawSchemeSpecificPart() getRawSchemeSpecificPart} method
1152      * except that all sequences of escaped octets are <a
1153      * href="#decode">decoded</a>.  </p>
1154      *
1155      * @return  The decoded scheme-specific part of this URI
1156      *          (never {@code null})
1157      */
getSchemeSpecificPart()1158     public String getSchemeSpecificPart() {
1159         if (decodedSchemeSpecificPart == null)
1160             decodedSchemeSpecificPart = decode(getRawSchemeSpecificPart());
1161         return decodedSchemeSpecificPart;
1162     }
1163 
1164     /**
1165      * Returns the raw authority component of this URI.
1166      *
1167      * <p> The authority component of a URI, if defined, only contains the
1168      * commercial-at character ({@code '@'}) and characters in the
1169      * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and <i>other</i>
1170      * categories.  If the authority is server-based then it is further
1171      * constrained to have valid user-information, host, and port
1172      * components. </p>
1173      *
1174      * @return  The raw authority component of this URI,
1175      *          or {@code null} if the authority is undefined
1176      */
getRawAuthority()1177     public String getRawAuthority() {
1178         return authority;
1179     }
1180 
1181     /**
1182      * Returns the decoded authority component of this URI.
1183      *
1184      * <p> The string returned by this method is equal to that returned by the
1185      * {@link #getRawAuthority() getRawAuthority} method except that all
1186      * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>
1187      *
1188      * @return  The decoded authority component of this URI,
1189      *          or {@code null} if the authority is undefined
1190      */
getAuthority()1191     public String getAuthority() {
1192         if (decodedAuthority == null)
1193             decodedAuthority = decode(authority);
1194         return decodedAuthority;
1195     }
1196 
1197     /**
1198      * Returns the raw user-information component of this URI.
1199      *
1200      * <p> The user-information component of a URI, if defined, only contains
1201      * characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and
1202      * <i>other</i> categories. </p>
1203      *
1204      * @return  The raw user-information component of this URI,
1205      *          or {@code null} if the user information is undefined
1206      */
getRawUserInfo()1207     public String getRawUserInfo() {
1208         return userInfo;
1209     }
1210 
1211     /**
1212      * Returns the decoded user-information component of this URI.
1213      *
1214      * <p> The string returned by this method is equal to that returned by the
1215      * {@link #getRawUserInfo() getRawUserInfo} method except that all
1216      * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>
1217      *
1218      * @return  The decoded user-information component of this URI,
1219      *          or {@code null} if the user information is undefined
1220      */
getUserInfo()1221     public String getUserInfo() {
1222         if ((decodedUserInfo == null) && (userInfo != null))
1223             decodedUserInfo = decode(userInfo);
1224         return decodedUserInfo;
1225     }
1226 
1227     /**
1228      * Returns the host component of this URI.
1229      *
1230      * <p> The host component of a URI, if defined, will have one of the
1231      * following forms: </p>
1232      *
1233      * <ul>
1234      *
1235      *   <li><p> A domain name consisting of one or more <i>labels</i>
1236      *   separated by period characters ({@code '.'}), optionally followed by
1237      *   a period character.  Each label consists of <i>alphanum</i> characters
1238      *   as well as hyphen characters ({@code '-'}), though hyphens never
1239      *   occur as the first or last characters in a label. The rightmost
1240      *   label of a domain name consisting of two or more labels, begins
     *   with an <i>alpha</i> character. </p></li>
1242      *
1243      *   <li><p> A dotted-quad IPv4 address of the form
1244      *   <i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +},
1245      *   where no <i>digit</i> sequence is longer than three characters and no
1246      *   sequence has a value larger than 255. </p></li>
1247      *
1248      *   <li><p> An IPv6 address enclosed in square brackets ({@code '['} and
1249      *   {@code ']'}) and consisting of hexadecimal digits, colon characters
1250      *   ({@code ':'}), and possibly an embedded IPv4 address.  The full
1251      *   syntax of IPv6 addresses is specified in <a
1252      *   href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC&nbsp;2373: IPv6
1253      *   Addressing Architecture</i></a>.  </p></li>
1254      *
1255      * </ul>
1256      *
1257      * The host component of a URI cannot contain escaped octets, hence this
1258      * method does not perform any decoding.
1259      *
1260      * @return  The host component of this URI,
1261      *          or {@code null} if the host is undefined
1262      */
getHost()1263     public String getHost() {
1264         return host;
1265     }
1266 
1267     /**
1268      * Returns the port number of this URI.
1269      *
1270      * <p> The port component of a URI, if defined, is a non-negative
1271      * integer. </p>
1272      *
1273      * @return  The port component of this URI,
1274      *          or {@code -1} if the port is undefined
1275      */
getPort()1276     public int getPort() {
1277         return port;
1278     }
1279 
1280     /**
1281      * Returns the raw path component of this URI.
1282      *
1283      * <p> The path component of a URI, if defined, only contains the slash
1284      * character ({@code '/'}), the commercial-at character ({@code '@'}),
1285      * and characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>,
1286      * and <i>other</i> categories. </p>
1287      *
1288      * @return  The path component of this URI,
1289      *          or {@code null} if the path is undefined
1290      */
getRawPath()1291     public String getRawPath() {
1292         return path;
1293     }
1294 
1295     /**
1296      * Returns the decoded path component of this URI.
1297      *
1298      * <p> The string returned by this method is equal to that returned by the
1299      * {@link #getRawPath() getRawPath} method except that all sequences of
1300      * escaped octets are <a href="#decode">decoded</a>.  </p>
1301      *
1302      * @return  The decoded path component of this URI,
1303      *          or {@code null} if the path is undefined
1304      */
getPath()1305     public String getPath() {
1306         if ((decodedPath == null) && (path != null))
1307             decodedPath = decode(path);
1308         return decodedPath;
1309     }
1310 
1311     /**
1312      * Returns the raw query component of this URI.
1313      *
1314      * <p> The query component of a URI, if defined, only contains legal URI
1315      * characters. </p>
1316      *
1317      * @return  The raw query component of this URI,
1318      *          or {@code null} if the query is undefined
1319      */
getRawQuery()1320     public String getRawQuery() {
1321         return query;
1322     }
1323 
1324     /**
1325      * Returns the decoded query component of this URI.
1326      *
1327      * <p> The string returned by this method is equal to that returned by the
1328      * {@link #getRawQuery() getRawQuery} method except that all sequences of
1329      * escaped octets are <a href="#decode">decoded</a>.  </p>
1330      *
1331      * @return  The decoded query component of this URI,
1332      *          or {@code null} if the query is undefined
1333      */
getQuery()1334     public String getQuery() {
1335         if ((decodedQuery == null) && (query != null))
1336             decodedQuery = decode(query);
1337         return decodedQuery;
1338     }
1339 
1340     /**
1341      * Returns the raw fragment component of this URI.
1342      *
1343      * <p> The fragment component of a URI, if defined, only contains legal URI
1344      * characters. </p>
1345      *
1346      * @return  The raw fragment component of this URI,
1347      *          or {@code null} if the fragment is undefined
1348      */
getRawFragment()1349     public String getRawFragment() {
1350         return fragment;
1351     }
1352 
1353     /**
1354      * Returns the decoded fragment component of this URI.
1355      *
1356      * <p> The string returned by this method is equal to that returned by the
1357      * {@link #getRawFragment() getRawFragment} method except that all
1358      * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>
1359      *
1360      * @return  The decoded fragment component of this URI,
1361      *          or {@code null} if the fragment is undefined
1362      */
getFragment()1363     public String getFragment() {
1364         if ((decodedFragment == null) && (fragment != null))
1365             decodedFragment = decode(fragment);
1366         return decodedFragment;
1367     }
1368 
1369 
1370     // -- Equality, comparison, hash code, toString, and serialization --
1371 
1372     /**
1373      * Tests this URI for equality with another object.
1374      *
1375      * <p> If the given object is not a URI then this method immediately
1376      * returns {@code false}.
1377      *
1378      * <p> For two URIs to be considered equal requires that either both are
1379      * opaque or both are hierarchical.  Their schemes must either both be
1380      * undefined or else be equal without regard to case. Their fragments
1381      * must either both be undefined or else be equal.
1382      *
1383      * <p> For two opaque URIs to be considered equal, their scheme-specific
1384      * parts must be equal.
1385      *
1386      * <p> For two hierarchical URIs to be considered equal, their paths must
1387      * be equal and their queries must either both be undefined or else be
1388      * equal.  Their authorities must either both be undefined, or both be
1389      * registry-based, or both be server-based.  If their authorities are
1390      * defined and are registry-based, then they must be equal.  If their
1391      * authorities are defined and are server-based, then their hosts must be
1392      * equal without regard to case, their port numbers must be equal, and
1393      * their user-information components must be equal.
1394      *
1395      * <p> When testing the user-information, path, query, fragment, authority,
1396      * or scheme-specific parts of two URIs for equality, the raw forms rather
1397      * than the encoded forms of these components are compared and the
1398      * hexadecimal digits of escaped octets are compared without regard to
1399      * case.
1400      *
1401      * <p> This method satisfies the general contract of the {@link
1402      * java.lang.Object#equals(Object) Object.equals} method. </p>
1403      *
1404      * @param   ob   The object to which this object is to be compared
1405      *
1406      * @return  {@code true} if, and only if, the given object is a URI that
1407      *          is identical to this URI
1408      */
equals(Object ob)1409     public boolean equals(Object ob) {
1410         if (ob == this)
1411             return true;
1412         if (!(ob instanceof URI))
1413             return false;
1414         URI that = (URI)ob;
1415         if (this.isOpaque() != that.isOpaque()) return false;
1416         if (!equalIgnoringCase(this.scheme, that.scheme)) return false;
1417         if (!equal(this.fragment, that.fragment)) return false;
1418 
1419         // Opaque
1420         if (this.isOpaque())
1421             return equal(this.schemeSpecificPart, that.schemeSpecificPart);
1422 
1423         // Hierarchical
1424         if (!equal(this.path, that.path)) return false;
1425         if (!equal(this.query, that.query)) return false;
1426 
1427         // Authorities
1428         if (this.authority == that.authority) return true;
1429         if (this.host != null) {
1430             // Server-based
1431             if (!equal(this.userInfo, that.userInfo)) return false;
1432             if (!equalIgnoringCase(this.host, that.host)) return false;
1433             if (this.port != that.port) return false;
1434         } else if (this.authority != null) {
1435             // Registry-based
1436             if (!equal(this.authority, that.authority)) return false;
1437         } else if (this.authority != that.authority) {
1438             return false;
1439         }
1440 
1441         return true;
1442     }
1443 
1444     /**
1445      * Returns a hash-code value for this URI.  The hash code is based upon all
1446      * of the URI's components, and satisfies the general contract of the
1447      * {@link java.lang.Object#hashCode() Object.hashCode} method.
1448      *
1449      * @return  A hash-code value for this URI
1450      */
hashCode()1451     public int hashCode() {
1452         if (hash != 0)
1453             return hash;
1454         int h = hashIgnoringCase(0, scheme);
1455         h = hash(h, fragment);
1456         if (isOpaque()) {
1457             h = hash(h, schemeSpecificPart);
1458         } else {
1459             h = hash(h, path);
1460             h = hash(h, query);
1461             if (host != null) {
1462                 h = hash(h, userInfo);
1463                 h = hashIgnoringCase(h, host);
1464                 h += 1949 * port;
1465             } else {
1466                 h = hash(h, authority);
1467             }
1468         }
1469         hash = h;
1470         return h;
1471     }
1472 
1473     /**
1474      * Compares this URI to another object, which must be a URI.
1475      *
1476      * <p> When comparing corresponding components of two URIs, if one
1477      * component is undefined but the other is defined then the first is
1478      * considered to be less than the second.  Unless otherwise noted, string
1479      * components are ordered according to their natural, case-sensitive
1480      * ordering as defined by the {@link java.lang.String#compareTo(Object)
1481      * String.compareTo} method.  String components that are subject to
1482      * encoding are compared by comparing their raw forms rather than their
1483      * encoded forms.
1484      *
1485      * <p> The ordering of URIs is defined as follows: </p>
1486      *
1487      * <ul>
1488      *
1489      *   <li><p> Two URIs with different schemes are ordered according the
1490      *   ordering of their schemes, without regard to case. </p></li>
1491      *
1492      *   <li><p> A hierarchical URI is considered to be less than an opaque URI
1493      *   with an identical scheme. </p></li>
1494      *
1495      *   <li><p> Two opaque URIs with identical schemes are ordered according
1496      *   to the ordering of their scheme-specific parts. </p></li>
1497      *
1498      *   <li><p> Two opaque URIs with identical schemes and scheme-specific
1499      *   parts are ordered according to the ordering of their
1500      *   fragments. </p></li>
1501      *
1502      *   <li><p> Two hierarchical URIs with identical schemes are ordered
1503      *   according to the ordering of their authority components: </p>
1504      *
1505      *   <ul>
1506      *
1507      *     <li><p> If both authority components are server-based then the URIs
1508      *     are ordered according to their user-information components; if these
1509      *     components are identical then the URIs are ordered according to the
1510      *     ordering of their hosts, without regard to case; if the hosts are
1511      *     identical then the URIs are ordered according to the ordering of
1512      *     their ports. </p></li>
1513      *
1514      *     <li><p> If one or both authority components are registry-based then
1515      *     the URIs are ordered according to the ordering of their authority
1516      *     components. </p></li>
1517      *
1518      *   </ul></li>
1519      *
1520      *   <li><p> Finally, two hierarchical URIs with identical schemes and
1521      *   authority components are ordered according to the ordering of their
1522      *   paths; if their paths are identical then they are ordered according to
1523      *   the ordering of their queries; if the queries are identical then they
1524      *   are ordered according to the order of their fragments. </p></li>
1525      *
1526      * </ul>
1527      *
1528      * <p> This method satisfies the general contract of the {@link
1529      * java.lang.Comparable#compareTo(Object) Comparable.compareTo}
1530      * method. </p>
1531      *
1532      * @param   that
1533      *          The object to which this URI is to be compared
1534      *
1535      * @return  A negative integer, zero, or a positive integer as this URI is
1536      *          less than, equal to, or greater than the given URI
1537      *
1538      * @throws  ClassCastException
1539      *          If the given object is not a URI
1540      */
compareTo(URI that)1541     public int compareTo(URI that) {
1542         int c;
1543 
1544         if ((c = compareIgnoringCase(this.scheme, that.scheme)) != 0)
1545             return c;
1546 
1547         if (this.isOpaque()) {
1548             if (that.isOpaque()) {
1549                 // Both opaque
1550                 if ((c = compare(this.schemeSpecificPart,
1551                                  that.schemeSpecificPart)) != 0)
1552                     return c;
1553                 return compare(this.fragment, that.fragment);
1554             }
1555             return +1;                  // Opaque > hierarchical
1556         } else if (that.isOpaque()) {
1557             return -1;                  // Hierarchical < opaque
1558         }
1559 
1560         // Hierarchical
1561         if ((this.host != null) && (that.host != null)) {
1562             // Both server-based
1563             if ((c = compare(this.userInfo, that.userInfo)) != 0)
1564                 return c;
1565             if ((c = compareIgnoringCase(this.host, that.host)) != 0)
1566                 return c;
1567             if ((c = this.port - that.port) != 0)
1568                 return c;
1569         } else {
1570             // If one or both authorities are registry-based then we simply
1571             // compare them in the usual, case-sensitive way.  If one is
1572             // registry-based and one is server-based then the strings are
1573             // guaranteed to be unequal, hence the comparison will never return
1574             // zero and the compareTo and equals methods will remain
1575             // consistent.
1576             if ((c = compare(this.authority, that.authority)) != 0) return c;
1577         }
1578 
1579         if ((c = compare(this.path, that.path)) != 0) return c;
1580         if ((c = compare(this.query, that.query)) != 0) return c;
1581         return compare(this.fragment, that.fragment);
1582     }
1583 
1584     /**
1585      * Returns the content of this URI as a string.
1586      *
1587      * <p> If this URI was created by invoking one of the constructors in this
1588      * class then a string equivalent to the original input string, or to the
1589      * string computed from the originally-given components, as appropriate, is
1590      * returned.  Otherwise this URI was created by normalization, resolution,
1591      * or relativization, and so a string is constructed from this URI's
1592      * components according to the rules specified in <a
1593      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
1594      * section&nbsp;5.2, step&nbsp;7. </p>
1595      *
1596      * @return  The string form of this URI
1597      */
toString()1598     public String toString() {
1599         defineString();
1600         return string;
1601     }
1602 
1603     /**
1604      * Returns the content of this URI as a US-ASCII string.
1605      *
1606      * <p> If this URI does not contain any characters in the <i>other</i>
1607      * category then an invocation of this method will return the same value as
1608      * an invocation of the {@link #toString() toString} method.  Otherwise
1609      * this method works as if by invoking that method and then <a
1610      * href="#encode">encoding</a> the result.  </p>
1611      *
1612      * @return  The string form of this URI, encoded as needed
1613      *          so that it only contains characters in the US-ASCII
1614      *          charset
1615      */
toASCIIString()1616     public String toASCIIString() {
1617         defineString();
1618         return encode(string);
1619     }
1620 
1621 
1622     // -- Serialization support --
1623 
1624     /**
1625      * Saves the content of this URI to the given serial stream.
1626      *
1627      * <p> The only serializable field of a URI instance is its {@code string}
1628      * field.  That field is given a value, if it does not have one already,
1629      * and then the {@link java.io.ObjectOutputStream#defaultWriteObject()}
1630      * method of the given object-output stream is invoked. </p>
1631      *
1632      * @param  os  The object-output stream to which this object
1633      *             is to be written
1634      */
writeObject(ObjectOutputStream os)1635     private void writeObject(ObjectOutputStream os)
1636         throws IOException
1637     {
1638         defineString();
1639         os.defaultWriteObject();        // Writes the string field only
1640     }
1641 
1642     /**
1643      * Reconstitutes a URI from the given serial stream.
1644      *
1645      * <p> The {@link java.io.ObjectInputStream#defaultReadObject()} method is
1646      * invoked to read the value of the {@code string} field.  The result is
1647      * then parsed in the usual way.
1648      *
1649      * @param  is  The object-input stream from which this object
1650      *             is being read
1651      */
readObject(ObjectInputStream is)1652     private void readObject(ObjectInputStream is)
1653         throws ClassNotFoundException, IOException
1654     {
1655         port = -1;                      // Argh
1656         is.defaultReadObject();
1657         try {
1658             new Parser(string).parse(false);
1659         } catch (URISyntaxException x) {
1660             IOException y = new InvalidObjectException("Invalid URI");
1661             y.initCause(x);
1662             throw y;
1663         }
1664     }
1665 
1666 
1667     // -- End of public methods --
1668 
1669 
1670     // -- Utility methods for string-field comparison and hashing --
1671 
1672     // These methods return appropriate values for null string arguments,
1673     // thereby simplifying the equals, hashCode, and compareTo methods.
1674     //
1675     // The case-ignoring methods should only be applied to strings whose
1676     // characters are all known to be US-ASCII.  Because of this restriction,
1677     // these methods are faster than the similar methods in the String class.
1678 
1679     // US-ASCII only
toLower(char c)1680     private static int toLower(char c) {
1681         if ((c >= 'A') && (c <= 'Z'))
1682             return c + ('a' - 'A');
1683         return c;
1684     }
1685 
1686     // US-ASCII only
toUpper(char c)1687     private static int toUpper(char c) {
1688         if ((c >= 'a') && (c <= 'z'))
1689             return c - ('a' - 'A');
1690         return c;
1691     }
1692 
equal(String s, String t)1693     private static boolean equal(String s, String t) {
1694         if (s == t) return true;
1695         if ((s != null) && (t != null)) {
1696             if (s.length() != t.length())
1697                 return false;
1698             if (s.indexOf('%') < 0)
1699                 return s.equals(t);
1700             int n = s.length();
1701             for (int i = 0; i < n;) {
1702                 char c = s.charAt(i);
1703                 char d = t.charAt(i);
1704                 if (c != '%') {
1705                     if (c != d)
1706                         return false;
1707                     i++;
1708                     continue;
1709                 }
1710                 if (d != '%')
1711                     return false;
1712                 i++;
1713                 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1714                     return false;
1715                 i++;
1716                 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1717                     return false;
1718                 i++;
1719             }
1720             return true;
1721         }
1722         return false;
1723     }
1724 
1725     // US-ASCII only
equalIgnoringCase(String s, String t)1726     private static boolean equalIgnoringCase(String s, String t) {
1727         if (s == t) return true;
1728         if ((s != null) && (t != null)) {
1729             int n = s.length();
1730             if (t.length() != n)
1731                 return false;
1732             for (int i = 0; i < n; i++) {
1733                 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1734                     return false;
1735             }
1736             return true;
1737         }
1738         return false;
1739     }
1740 
hash(int hash, String s)1741     private static int hash(int hash, String s) {
1742         if (s == null) return hash;
1743         return s.indexOf('%') < 0 ? hash * 127 + s.hashCode()
1744                                   : normalizedHash(hash, s);
1745     }
1746 
1747 
normalizedHash(int hash, String s)1748     private static int normalizedHash(int hash, String s) {
1749         int h = 0;
1750         for (int index = 0; index < s.length(); index++) {
1751             char ch = s.charAt(index);
1752             h = 31 * h + ch;
1753             if (ch == '%') {
1754                 /*
1755                  * Process the next two encoded characters
1756                  */
1757                 for (int i = index + 1; i < index + 3; i++)
1758                     h = 31 * h + toUpper(s.charAt(i));
1759                 index += 2;
1760             }
1761         }
1762         return hash * 127 + h;
1763     }
1764 
1765     // US-ASCII only
hashIgnoringCase(int hash, String s)1766     private static int hashIgnoringCase(int hash, String s) {
1767         if (s == null) return hash;
1768         int h = hash;
1769         int n = s.length();
1770         for (int i = 0; i < n; i++)
1771             h = 31 * h + toLower(s.charAt(i));
1772         return h;
1773     }
1774 
compare(String s, String t)1775     private static int compare(String s, String t) {
1776         if (s == t) return 0;
1777         if (s != null) {
1778             if (t != null)
1779                 return s.compareTo(t);
1780             else
1781                 return +1;
1782         } else {
1783             return -1;
1784         }
1785     }
1786 
1787     // US-ASCII only
compareIgnoringCase(String s, String t)1788     private static int compareIgnoringCase(String s, String t) {
1789         if (s == t) return 0;
1790         if (s != null) {
1791             if (t != null) {
1792                 int sn = s.length();
1793                 int tn = t.length();
1794                 int n = sn < tn ? sn : tn;
1795                 for (int i = 0; i < n; i++) {
1796                     int c = toLower(s.charAt(i)) - toLower(t.charAt(i));
1797                     if (c != 0)
1798                         return c;
1799                 }
1800                 return sn - tn;
1801             }
1802             return +1;
1803         } else {
1804             return -1;
1805         }
1806     }
1807 
1808 
1809     // -- String construction --
1810 
1811     // If a scheme is given then the path, if given, must be absolute
1812     //
1813     private static void checkPath(String s, String scheme, String path)
1814         throws URISyntaxException
1815     {
1816         if (scheme != null) {
1817             if ((path != null)
1818                 && ((path.length() > 0) && (path.charAt(0) != '/')))
1819                 throw new URISyntaxException(s,
1820                                              "Relative path in absolute URI");
1821         }
1822     }
1823 
1824     private void appendAuthority(StringBuffer sb,
1825                                  String authority,
1826                                  String userInfo,
1827                                  String host,
1828                                  int port)
1829     {
1830         if (host != null) {
1831             sb.append("//");
1832             if (userInfo != null) {
1833                 sb.append(quote(userInfo, L_USERINFO, H_USERINFO));
1834                 sb.append('@');
1835             }
1836             boolean needBrackets = ((host.indexOf(':') >= 0)
1837                                     && !host.startsWith("[")
1838                                     && !host.endsWith("]"));
1839             if (needBrackets) sb.append('[');
1840             sb.append(host);
1841             if (needBrackets) sb.append(']');
1842             if (port != -1) {
1843                 sb.append(':');
1844                 sb.append(port);
1845             }
1846         } else if (authority != null) {
1847             sb.append("//");
1848             if (authority.startsWith("[")) {
1849                 // authority should (but may not) contain an embedded IPv6 address
1850                 int end = authority.indexOf("]");
1851                 String doquote = authority, dontquote = "";
1852                 if (end != -1 && authority.indexOf(":") != -1) {
1853                     // the authority contains an IPv6 address
1854                     if (end == authority.length()) {
1855                         dontquote = authority;
1856                         doquote = "";
1857                     } else {
1858                         dontquote = authority.substring(0 , end + 1);
1859                         doquote = authority.substring(end + 1);
1860                     }
1861                 }
1862                 sb.append(dontquote);
1863                 sb.append(quote(doquote,
1864                             L_REG_NAME | L_SERVER,
1865                             H_REG_NAME | H_SERVER));
1866             } else {
1867                 sb.append(quote(authority,
1868                             L_REG_NAME | L_SERVER,
1869                             H_REG_NAME | H_SERVER));
1870             }
1871         }
1872     }
1873 
appendSchemeSpecificPart(StringBuffer sb, String opaquePart, String authority, String userInfo, String host, int port, String path, String query)1874     private void appendSchemeSpecificPart(StringBuffer sb,
1875                                           String opaquePart,
1876                                           String authority,
1877                                           String userInfo,
1878                                           String host,
1879                                           int port,
1880                                           String path,
1881                                           String query)
1882     {
1883         if (opaquePart != null) {
1884             /* check if SSP begins with an IPv6 address
1885              * because we must not quote a literal IPv6 address
1886              */
1887             if (opaquePart.startsWith("//[")) {
1888                 int end =  opaquePart.indexOf("]");
1889                 if (end != -1 && opaquePart.indexOf(":")!=-1) {
1890                     String doquote, dontquote;
1891                     if (end == opaquePart.length()) {
1892                         dontquote = opaquePart;
1893                         doquote = "";
1894                     } else {
1895                         dontquote = opaquePart.substring(0,end+1);
1896                         doquote = opaquePart.substring(end+1);
1897                     }
1898                     sb.append (dontquote);
1899                     sb.append(quote(doquote, L_URIC, H_URIC));
1900                 }
1901             } else {
1902                 sb.append(quote(opaquePart, L_URIC, H_URIC));
1903             }
1904         } else {
1905             appendAuthority(sb, authority, userInfo, host, port);
1906             if (path != null)
1907                 sb.append(quote(path, L_PATH, H_PATH));
1908             if (query != null) {
1909                 sb.append('?');
1910                 sb.append(quote(query, L_URIC, H_URIC));
1911             }
1912         }
1913     }
1914 
appendFragment(StringBuffer sb, String fragment)1915     private void appendFragment(StringBuffer sb, String fragment) {
1916         if (fragment != null) {
1917             sb.append('#');
1918             sb.append(quote(fragment, L_URIC, H_URIC));
1919         }
1920     }
1921 
toString(String scheme, String opaquePart, String authority, String userInfo, String host, int port, String path, String query, String fragment)1922     private String toString(String scheme,
1923                             String opaquePart,
1924                             String authority,
1925                             String userInfo,
1926                             String host,
1927                             int port,
1928                             String path,
1929                             String query,
1930                             String fragment)
1931     {
1932         StringBuffer sb = new StringBuffer();
1933         if (scheme != null) {
1934             sb.append(scheme);
1935             sb.append(':');
1936         }
1937         appendSchemeSpecificPart(sb, opaquePart,
1938                                  authority, userInfo, host, port,
1939                                  path, query);
1940         appendFragment(sb, fragment);
1941         return sb.toString();
1942     }
1943 
defineSchemeSpecificPart()1944     private void defineSchemeSpecificPart() {
1945         if (schemeSpecificPart != null) return;
1946         StringBuffer sb = new StringBuffer();
1947         appendSchemeSpecificPart(sb, null, getAuthority(), getUserInfo(),
1948                                  host, port, getPath(), getQuery());
1949         if (sb.length() == 0) return;
1950         schemeSpecificPart = sb.toString();
1951     }
1952 
defineString()1953     private void defineString() {
1954         if (string != null) return;
1955 
1956         StringBuffer sb = new StringBuffer();
1957         if (scheme != null) {
1958             sb.append(scheme);
1959             sb.append(':');
1960         }
1961         if (isOpaque()) {
1962             sb.append(schemeSpecificPart);
1963         } else {
1964             if (host != null) {
1965                 sb.append("//");
1966                 if (userInfo != null) {
1967                     sb.append(userInfo);
1968                     sb.append('@');
1969                 }
1970                 boolean needBrackets = ((host.indexOf(':') >= 0)
1971                                     && !host.startsWith("[")
1972                                     && !host.endsWith("]"));
1973                 if (needBrackets) sb.append('[');
1974                 sb.append(host);
1975                 if (needBrackets) sb.append(']');
1976                 if (port != -1) {
1977                     sb.append(':');
1978                     sb.append(port);
1979                 }
1980             } else if (authority != null) {
1981                 sb.append("//");
1982                 sb.append(authority);
1983             }
1984             if (path != null)
1985                 sb.append(path);
1986             if (query != null) {
1987                 sb.append('?');
1988                 sb.append(query);
1989             }
1990         }
1991         if (fragment != null) {
1992             sb.append('#');
1993             sb.append(fragment);
1994         }
1995         string = sb.toString();
1996     }
1997 
1998 
1999     // -- Normalization, resolution, and relativization --
2000 
2001     // RFC2396 5.2 (6)
resolvePath(String base, String child, boolean absolute)2002     private static String resolvePath(String base, String child,
2003                                       boolean absolute)
2004     {
2005         int i = base.lastIndexOf('/');
2006         int cn = child.length();
2007         String path = "";
2008 
2009         if (cn == 0) {
2010             // 5.2 (6a)
2011             if (i >= 0)
2012                 path = base.substring(0, i + 1);
2013         } else {
2014             StringBuffer sb = new StringBuffer(base.length() + cn);
2015             // 5.2 (6a)
2016             if (i >= 0)
2017                 sb.append(base.substring(0, i + 1));
2018             // 5.2 (6b)
2019             sb.append(child);
2020             path = sb.toString();
2021         }
2022 
2023         // 5.2 (6c-f)
2024         // Android-changed: App compat. Remove leading dots when resolving path. http://b/25897693
2025         // String np = normalize(path);
2026         String np = normalize(path, true);
2027 
2028         // 5.2 (6g): If the result is absolute but the path begins with "../",
2029         // then we simply leave the path as-is
2030 
2031         return np;
2032     }
2033 
2034     // RFC2396 5.2
resolve(URI base, URI child)2035     private static URI resolve(URI base, URI child) {
2036         // check if child if opaque first so that NPE is thrown
2037         // if child is null.
2038         if (child.isOpaque() || base.isOpaque())
2039             return child;
2040 
2041         // 5.2 (2): Reference to current document (lone fragment)
2042         if ((child.scheme == null) && (child.authority == null)
2043             && child.path.equals("") && (child.fragment != null)
2044             && (child.query == null)) {
2045             if ((base.fragment != null)
2046                 && child.fragment.equals(base.fragment)) {
2047                 return base;
2048             }
2049             URI ru = new URI();
2050             ru.scheme = base.scheme;
2051             ru.authority = base.authority;
2052             ru.userInfo = base.userInfo;
2053             ru.host = base.host;
2054             ru.port = base.port;
2055             ru.path = base.path;
2056             ru.fragment = child.fragment;
2057             ru.query = base.query;
2058             return ru;
2059         }
2060 
2061         // 5.2 (3): Child is absolute
2062         if (child.scheme != null)
2063             return child;
2064 
2065         URI ru = new URI();             // Resolved URI
2066         ru.scheme = base.scheme;
2067         ru.query = child.query;
2068         ru.fragment = child.fragment;
2069 
2070         // 5.2 (4): Authority
2071         if (child.authority == null) {
2072             ru.authority = base.authority;
2073             ru.host = base.host;
2074             ru.userInfo = base.userInfo;
2075             ru.port = base.port;
2076 
2077             // BEGIN Android-changed: App Compat. Handle null and empty path using RFC 3986 logic
2078             // http://b/25897693
2079             if (child.path == null || child.path.isEmpty()) {
2080                 // This is an additional path from RFC 3986 RI, which fixes following RFC 2396
2081                 // "normal" examples:
2082                 // Base: http://a/b/c/d;p?q
2083                 //   "?y" = "http://a/b/c/d;p?y"
2084                 //   ""   = "http://a/b/c/d;p?q"
2085                 // http://b/25897693
2086                 ru.path = base.path;
2087                 ru.query = child.query != null ? child.query : base.query;
2088             // END Android-changed: App Compat. Handle null and empty path using RFC 3986 logic
2089             } else if ((child.path.length() > 0) && (child.path.charAt(0) == '/')) {
2090                 // 5.2 (5): Child path is absolute
2091                 //
2092                 // Android-changed: App Compat. Remove leading dots in path.
2093                 // There is an additional step from RFC 3986 RI, requiring to remove dots for
2094                 // absolute path as well.
2095                 // http://b/25897693
2096                 // ru.path = child.path;
2097                 ru.path = normalize(child.path, true);
2098             } else {
2099                 // 5.2 (6): Resolve relative path
2100                 ru.path = resolvePath(base.path, child.path, base.isAbsolute());
2101             }
2102         } else {
2103             ru.authority = child.authority;
2104             ru.host = child.host;
2105             ru.userInfo = child.userInfo;
2106             ru.host = child.host;
2107             ru.port = child.port;
2108             ru.path = child.path;
2109         }
2110 
2111         // 5.2 (7): Recombine (nothing to do here)
2112         return ru;
2113     }
2114 
2115     // If the given URI's path is normal then return the URI;
2116     // o.w., return a new URI containing the normalized path.
2117     //
normalize(URI u)2118     private static URI normalize(URI u) {
2119         if (u.isOpaque() || (u.path == null) || (u.path.length() == 0))
2120             return u;
2121 
2122         String np = normalize(u.path);
2123         if (np == u.path)
2124             return u;
2125 
2126         URI v = new URI();
2127         v.scheme = u.scheme;
2128         v.fragment = u.fragment;
2129         v.authority = u.authority;
2130         v.userInfo = u.userInfo;
2131         v.host = u.host;
2132         v.port = u.port;
2133         v.path = np;
2134         v.query = u.query;
2135         return v;
2136     }
2137 
2138     // If both URIs are hierarchical, their scheme and authority components are
2139     // identical, and the base path is a prefix of the child's path, then
2140     // return a relative URI that, when resolved against the base, yields the
2141     // child; otherwise, return the child.
2142     //
relativize(URI base, URI child)2143     private static URI relativize(URI base, URI child) {
2144         // check if child if opaque first so that NPE is thrown
2145         // if child is null.
2146         if (child.isOpaque() || base.isOpaque())
2147             return child;
2148         if (!equalIgnoringCase(base.scheme, child.scheme)
2149             || !equal(base.authority, child.authority))
2150             return child;
2151 
2152         String bp = normalize(base.path);
2153         String cp = normalize(child.path);
2154         if (!bp.equals(cp)) {
2155             // Android-changed: App Compat. Interpret ambiguous base path as a file, not a directory
2156             // Upstream would append '/' to bp if not present, interpreting it as a directory; thus,
2157             // /a/b/c relative to /a/b would become /c, whereas Android would relativize to /b/c.
2158             // The spec is pretty vague about this but the Android behavior is kept because several
2159             // tests enforce it.
2160             // if (!bp.endsWith("/"))
2161             //     bp = bp + "/";
2162             if (bp.indexOf('/') != -1) {
2163                 bp = bp.substring(0, bp.lastIndexOf('/') + 1);
2164             }
2165 
2166             if (!cp.startsWith(bp))
2167                 return child;
2168         }
2169 
2170         URI v = new URI();
2171         v.path = cp.substring(bp.length());
2172         v.query = child.query;
2173         v.fragment = child.fragment;
2174         return v;
2175     }
2176 
2177 
2178 
2179     // -- Path normalization --
2180 
2181     // The following algorithm for path normalization avoids the creation of a
2182     // string object for each segment, as well as the use of a string buffer to
2183     // compute the final result, by using a single char array and editing it in
2184     // place.  The array is first split into segments, replacing each slash
2185     // with '\0' and creating a segment-index array, each element of which is
2186     // the index of the first char in the corresponding segment.  We then walk
2187     // through both arrays, removing ".", "..", and other segments as necessary
2188     // by setting their entries in the index array to -1.  Finally, the two
2189     // arrays are used to rejoin the segments and compute the final result.
2190     //
2191     // This code is based upon src/solaris/native/java/io/canonicalize_md.c
2192 
2193 
2194     // Check the given path to see if it might need normalization.  A path
2195     // might need normalization if it contains duplicate slashes, a "."
2196     // segment, or a ".." segment.  Return -1 if no further normalization is
2197     // possible, otherwise return the number of segments found.
2198     //
2199     // This method takes a string argument rather than a char array so that
2200     // this test can be performed without invoking path.toCharArray().
2201     //
needsNormalization(String path)2202     static private int needsNormalization(String path) {
2203         boolean normal = true;
2204         int ns = 0;                     // Number of segments
2205         int end = path.length() - 1;    // Index of last char in path
2206         int p = 0;                      // Index of next char in path
2207 
2208         // Skip initial slashes
2209         while (p <= end) {
2210             if (path.charAt(p) != '/') break;
2211             p++;
2212         }
2213         if (p > 1) normal = false;
2214 
2215         // Scan segments
2216         while (p <= end) {
2217 
2218             // Looking at "." or ".." ?
2219             if ((path.charAt(p) == '.')
2220                 && ((p == end)
2221                     || ((path.charAt(p + 1) == '/')
2222                         || ((path.charAt(p + 1) == '.')
2223                             && ((p + 1 == end)
2224                                 || (path.charAt(p + 2) == '/')))))) {
2225                 normal = false;
2226             }
2227             ns++;
2228 
2229             // Find beginning of next segment
2230             while (p <= end) {
2231                 if (path.charAt(p++) != '/')
2232                     continue;
2233 
2234                 // Skip redundant slashes
2235                 while (p <= end) {
2236                     if (path.charAt(p) != '/') break;
2237                     normal = false;
2238                     p++;
2239                 }
2240 
2241                 break;
2242             }
2243         }
2244 
2245         return normal ? -1 : ns;
2246     }
2247 
2248 
2249     // Split the given path into segments, replacing slashes with nulls and
2250     // filling in the given segment-index array.
2251     //
2252     // Preconditions:
2253     //   segs.length == Number of segments in path
2254     //
2255     // Postconditions:
2256     //   All slashes in path replaced by '\0'
2257     //   segs[i] == Index of first char in segment i (0 <= i < segs.length)
2258     //
split(char[] path, int[] segs)2259     static private void split(char[] path, int[] segs) {
2260         int end = path.length - 1;      // Index of last char in path
2261         int p = 0;                      // Index of next char in path
2262         int i = 0;                      // Index of current segment
2263 
2264         // Skip initial slashes
2265         while (p <= end) {
2266             if (path[p] != '/') break;
2267             path[p] = '\0';
2268             p++;
2269         }
2270 
2271         while (p <= end) {
2272 
2273             // Note start of segment
2274             segs[i++] = p++;
2275 
2276             // Find beginning of next segment
2277             while (p <= end) {
2278                 if (path[p++] != '/')
2279                     continue;
2280                 path[p - 1] = '\0';
2281 
2282                 // Skip redundant slashes
2283                 while (p <= end) {
2284                     if (path[p] != '/') break;
2285                     path[p++] = '\0';
2286                 }
2287                 break;
2288             }
2289         }
2290 
2291         if (i != segs.length)
2292             throw new InternalError();  // ASSERT
2293     }
2294 
2295 
2296     // Join the segments in the given path according to the given segment-index
2297     // array, ignoring those segments whose index entries have been set to -1,
2298     // and inserting slashes as needed.  Return the length of the resulting
2299     // path.
2300     //
2301     // Preconditions:
2302     //   segs[i] == -1 implies segment i is to be ignored
2303     //   path computed by split, as above, with '\0' having replaced '/'
2304     //
2305     // Postconditions:
    //   path[0] .. path[return value - 1] == Resulting path
2307     //
join(char[] path, int[] segs)2308     static private int join(char[] path, int[] segs) {
2309         int ns = segs.length;           // Number of segments
2310         int end = path.length - 1;      // Index of last char in path
2311         int p = 0;                      // Index of next path char to write
2312 
2313         if (path[p] == '\0') {
2314             // Restore initial slash for absolute paths
2315             path[p++] = '/';
2316         }
2317 
2318         for (int i = 0; i < ns; i++) {
2319             int q = segs[i];            // Current segment
2320             if (q == -1)
2321                 // Ignore this segment
2322                 continue;
2323 
2324             if (p == q) {
2325                 // We're already at this segment, so just skip to its end
2326                 while ((p <= end) && (path[p] != '\0'))
2327                     p++;
2328                 if (p <= end) {
2329                     // Preserve trailing slash
2330                     path[p++] = '/';
2331                 }
2332             } else if (p < q) {
2333                 // Copy q down to p
2334                 while ((q <= end) && (path[q] != '\0'))
2335                     path[p++] = path[q++];
2336                 if (q <= end) {
2337                     // Preserve trailing slash
2338                     path[p++] = '/';
2339                 }
2340             } else
2341                 throw new InternalError(); // ASSERT false
2342         }
2343 
2344         return p;
2345     }
2346 
2347 
2348     // Remove "." segments from the given path, and remove segment pairs
2349     // consisting of a non-".." segment followed by a ".." segment.
2350     //
2351     // Android-changed: App compat. Remove leading dots when resolving path. http://b/25897693
2352     // private static void removeDots(char[] path, int[] segs) {
removeDots(char[] path, int[] segs, boolean removeLeading)2353     private static void removeDots(char[] path, int[] segs, boolean removeLeading) {
2354         int ns = segs.length;
2355         int end = path.length - 1;
2356 
2357         for (int i = 0; i < ns; i++) {
2358             int dots = 0;               // Number of dots found (0, 1, or 2)
2359 
2360             // Find next occurrence of "." or ".."
2361             do {
2362                 int p = segs[i];
2363                 if (path[p] == '.') {
2364                     if (p == end) {
2365                         dots = 1;
2366                         break;
2367                     } else if (path[p + 1] == '\0') {
2368                         dots = 1;
2369                         break;
2370                     } else if ((path[p + 1] == '.')
2371                                && ((p + 1 == end)
2372                                    || (path[p + 2] == '\0'))) {
2373                         dots = 2;
2374                         break;
2375                     }
2376                 }
2377                 i++;
2378             } while (i < ns);
2379             if ((i > ns) || (dots == 0))
2380                 break;
2381 
2382             if (dots == 1) {
2383                 // Remove this occurrence of "."
2384                 segs[i] = -1;
2385             } else {
2386                 // If there is a preceding non-".." segment, remove both that
2387                 // segment and this occurrence of ".."
2388                 int j;
2389                 for (j = i - 1; j >= 0; j--) {
2390                     if (segs[j] != -1) break;
2391                 }
2392                 if (j >= 0) {
2393                     int q = segs[j];
2394                     if (!((path[q] == '.')
2395                           && (path[q + 1] == '.')
2396                           && (path[q + 2] == '\0'))) {
2397                         segs[i] = -1;
2398                         segs[j] = -1;
2399                     }
2400                 // Android-added: App compat. Remove leading dots when resolving path.
2401                 // This is a leading ".." segment. Per RFC 3986 RI, this should be removed as
2402                 // well. This fixes RFC 2396 "abnormal" examples.
2403                 // http://b/25897693
2404                 } else if (removeLeading) {
2405                     segs[i] = -1;
2406                 }
2407             }
2408         }
2409     }
2410 
2411 
2412     // DEVIATION: If the normalized path is relative, and if the first
2413     // segment could be parsed as a scheme name, then prepend a "." segment
2414     //
maybeAddLeadingDot(char[] path, int[] segs)2415     private static void maybeAddLeadingDot(char[] path, int[] segs) {
2416 
2417         if (path[0] == '\0')
2418             // The path is absolute
2419             return;
2420 
2421         int ns = segs.length;
2422         int f = 0;                      // Index of first segment
2423         while (f < ns) {
2424             if (segs[f] >= 0)
2425                 break;
2426             f++;
2427         }
2428         if ((f >= ns) || (f == 0))
2429             // The path is empty, or else the original first segment survived,
2430             // in which case we already know that no leading "." is needed
2431             return;
2432 
2433         int p = segs[f];
2434         while ((p < path.length) && (path[p] != ':') && (path[p] != '\0')) p++;
2435         if (p >= path.length || path[p] == '\0')
2436             // No colon in first segment, so no "." needed
2437             return;
2438 
2439         // At this point we know that the first segment is unused,
2440         // hence we can insert a "." segment at that position
2441         path[0] = '.';
2442         path[1] = '\0';
2443         segs[0] = 0;
2444     }
2445 
2446 
2447     // Normalize the given path string.  A normal path string has no empty
2448     // segments (i.e., occurrences of "//"), no segments equal to ".", and no
2449     // segments equal to ".." that are preceded by a segment not equal to "..".
2450     // In contrast to Unix-style pathname normalization, for URI paths we
2451     // always retain trailing slashes.
2452     //
normalize(String ps)2453     private static String normalize(String ps) {
2454         // BEGIN Android-changed: App compat. Remove leading dots when resolving path.
2455         // Controlled by the "boolean removeLeading" argument added to normalize().
2456         return normalize(ps, false);
2457     }
2458 
normalize(String ps, boolean removeLeading)2459     private static String normalize(String ps, boolean removeLeading) {
2460         // END Android-changed: App compat. Remove leading dots when resolving path.
2461         // Does this path need normalization?
2462         int ns = needsNormalization(ps);        // Number of segments
2463         if (ns < 0)
2464             // Nope -- just return it
2465             return ps;
2466 
2467         char[] path = ps.toCharArray();         // Path in char-array form
2468 
2469         // Split path into segments
2470         int[] segs = new int[ns];               // Segment-index array
2471         split(path, segs);
2472 
2473         // Remove dots
2474         // Android-changed: App compat. Remove leading dots when resolving path.
2475         // removeDots(path, segs);
2476         removeDots(path, segs, removeLeading);
2477 
2478         // Prevent scheme-name confusion
2479         maybeAddLeadingDot(path, segs);
2480 
2481         // Join the remaining segments and return the result
2482         String s = new String(path, 0, join(path, segs));
2483         if (s.equals(ps)) {
2484             // string was already normalized
2485             return ps;
2486         }
2487         return s;
2488     }
2489 
2490 
2491 
2492     // -- Character classes for parsing --
2493 
2494     // RFC2396 precisely specifies which characters in the US-ASCII charset are
2495     // permissible in the various components of a URI reference.  We here
2496     // define a set of mask pairs to aid in enforcing these restrictions.  Each
2497     // mask pair consists of two longs, a low mask and a high mask.  Taken
2498     // together they represent a 128-bit mask, where bit i is set iff the
2499     // character with value i is permitted.
2500     //
2501     // This approach is more efficient than sequentially searching arrays of
2502     // permitted characters.  It could be made still more efficient by
2503     // precompiling the mask information so that a character's presence in a
2504     // given mask could be determined by a single table lookup.
2505 
2506     // Compute the low-order mask for the characters in the given string
lowMask(String chars)2507     private static long lowMask(String chars) {
2508         int n = chars.length();
2509         long m = 0;
2510         for (int i = 0; i < n; i++) {
2511             char c = chars.charAt(i);
2512             if (c < 64)
2513                 m |= (1L << c);
2514         }
2515         return m;
2516     }
2517 
2518     // Compute the high-order mask for the characters in the given string
highMask(String chars)2519     private static long highMask(String chars) {
2520         int n = chars.length();
2521         long m = 0;
2522         for (int i = 0; i < n; i++) {
2523             char c = chars.charAt(i);
2524             if ((c >= 64) && (c < 128))
2525                 m |= (1L << (c - 64));
2526         }
2527         return m;
2528     }
2529 
2530     // Compute a low-order mask for the characters
2531     // between first and last, inclusive
lowMask(char first, char last)2532     private static long lowMask(char first, char last) {
2533         long m = 0;
2534         int f = Math.max(Math.min(first, 63), 0);
2535         int l = Math.max(Math.min(last, 63), 0);
2536         for (int i = f; i <= l; i++)
2537             m |= 1L << i;
2538         return m;
2539     }
2540 
2541     // Compute a high-order mask for the characters
2542     // between first and last, inclusive
highMask(char first, char last)2543     private static long highMask(char first, char last) {
2544         long m = 0;
2545         int f = Math.max(Math.min(first, 127), 64) - 64;
2546         int l = Math.max(Math.min(last, 127), 64) - 64;
2547         for (int i = f; i <= l; i++)
2548             m |= 1L << i;
2549         return m;
2550     }
2551 
2552     // Tell whether the given character is permitted by the given mask pair
match(char c, long lowMask, long highMask)2553     private static boolean match(char c, long lowMask, long highMask) {
2554         if (c == 0) // 0 doesn't have a slot in the mask. So, it never matches.
2555             return false;
2556         if (c < 64)
2557             return ((1L << c) & lowMask) != 0;
2558         if (c < 128)
2559             return ((1L << (c - 64)) & highMask) != 0;
2560         return false;
2561     }
2562 
2563     // Character-class masks, in reverse order from RFC2396 because
2564     // initializers for static fields cannot make forward references.
2565 
2566     // digit    = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
2567     //            "8" | "9"
2568     private static final long L_DIGIT = lowMask('0', '9');
2569     private static final long H_DIGIT = 0L;
2570 
2571     // upalpha  = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
2572     //            "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
2573     //            "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
2574     private static final long L_UPALPHA = 0L;
2575     private static final long H_UPALPHA = highMask('A', 'Z');
2576 
2577     // lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" |
2578     //            "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" |
2579     //            "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
2580     private static final long L_LOWALPHA = 0L;
2581     private static final long H_LOWALPHA = highMask('a', 'z');
2582 
2583     // alpha         = lowalpha | upalpha
2584     private static final long L_ALPHA = L_LOWALPHA | L_UPALPHA;
2585     private static final long H_ALPHA = H_LOWALPHA | H_UPALPHA;
2586 
2587     // alphanum      = alpha | digit
2588     private static final long L_ALPHANUM = L_DIGIT | L_ALPHA;
2589     private static final long H_ALPHANUM = H_DIGIT | H_ALPHA;
2590 
2591     // hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
2592     //                         "a" | "b" | "c" | "d" | "e" | "f"
2593     private static final long L_HEX = L_DIGIT;
2594     private static final long H_HEX = highMask('A', 'F') | highMask('a', 'f');
2595 
2596     // mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
2597     //                 "(" | ")"
2598     private static final long L_MARK = lowMask("-_.!~*'()");
2599     private static final long H_MARK = highMask("-_.!~*'()");
2600 
2601     // unreserved    = alphanum | mark
2602     private static final long L_UNRESERVED = L_ALPHANUM | L_MARK;
2603     private static final long H_UNRESERVED = H_ALPHANUM | H_MARK;
2604 
2605     // reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
2606     //                 "$" | "," | "[" | "]"
2607     // Added per RFC2732: "[", "]"
2608     private static final long L_RESERVED = lowMask(";/?:@&=+$,[]");
2609     private static final long H_RESERVED = highMask(";/?:@&=+$,[]");
2610 
2611     // The zero'th bit is used to indicate that escape pairs and non-US-ASCII
2612     // characters are allowed; this is handled by the scanEscape method below.
2613     private static final long L_ESCAPED = 1L;
2614     private static final long H_ESCAPED = 0L;
2615 
2616     // uric          = reserved | unreserved | escaped
2617     private static final long L_URIC = L_RESERVED | L_UNRESERVED | L_ESCAPED;
2618     private static final long H_URIC = H_RESERVED | H_UNRESERVED | H_ESCAPED;
2619 
2620     // pchar         = unreserved | escaped |
2621     //                 ":" | "@" | "&" | "=" | "+" | "$" | ","
2622     private static final long L_PCHAR
2623         = L_UNRESERVED | L_ESCAPED | lowMask(":@&=+$,");
2624     private static final long H_PCHAR
2625         = H_UNRESERVED | H_ESCAPED | highMask(":@&=+$,");
2626 
2627     // All valid path characters
2628     private static final long L_PATH = L_PCHAR | lowMask(";/");
2629     private static final long H_PATH = H_PCHAR | highMask(";/");
2630 
2631     // Dash, for use in domainlabel and toplabel
2632     private static final long L_DASH = lowMask("-");
2633     private static final long H_DASH = highMask("-");
2634 
2635     // BEGIN Android-added: Allow underscore in hostname.
2636     // UNDERSCORE, for use in domainlabel and toplabel
2637     private static final long L_UNDERSCORE = lowMask("_");
2638     private static final long H_UNDERSCORE = highMask("_");
2639     // END Android-added: Allow underscore in hostname.
2640 
2641     // Dot, for use in hostnames
2642     private static final long L_DOT = lowMask(".");
2643     private static final long H_DOT = highMask(".");
2644 
2645     // userinfo      = *( unreserved | escaped |
2646     //                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
2647     private static final long L_USERINFO
2648         = L_UNRESERVED | L_ESCAPED | lowMask(";:&=+$,");
2649     private static final long H_USERINFO
2650         = H_UNRESERVED | H_ESCAPED | highMask(";:&=+$,");
2651 
2652     // reg_name      = 1*( unreserved | escaped | "$" | "," |
2653     //                     ";" | ":" | "@" | "&" | "=" | "+" )
2654     private static final long L_REG_NAME
2655         = L_UNRESERVED | L_ESCAPED | lowMask("$,;:@&=+");
2656     private static final long H_REG_NAME
2657         = H_UNRESERVED | H_ESCAPED | highMask("$,;:@&=+");
2658 
2659     // All valid characters for server-based authorities
2660     private static final long L_SERVER
2661         = L_USERINFO | L_ALPHANUM | L_DASH | lowMask(".:@[]");
2662     private static final long H_SERVER
2663         = H_USERINFO | H_ALPHANUM | H_DASH | highMask(".:@[]");
2664 
2665     // Special case of server authority that represents an IPv6 address
2666     // In this case, a % does not signify an escape sequence
2667     private static final long L_SERVER_PERCENT
2668         = L_SERVER | lowMask("%");
2669     private static final long H_SERVER_PERCENT
2670         = H_SERVER | highMask("%");
2671     private static final long L_LEFT_BRACKET = lowMask("[");
2672     private static final long H_LEFT_BRACKET = highMask("[");
2673 
2674     // scheme        = alpha *( alpha | digit | "+" | "-" | "." )
2675     private static final long L_SCHEME = L_ALPHA | L_DIGIT | lowMask("+-.");
2676     private static final long H_SCHEME = H_ALPHA | H_DIGIT | highMask("+-.");
2677 
2678     // uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
2679     //                 "&" | "=" | "+" | "$" | ","
2680     private static final long L_URIC_NO_SLASH
2681         = L_UNRESERVED | L_ESCAPED | lowMask(";?:@&=+$,");
2682     private static final long H_URIC_NO_SLASH
2683         = H_UNRESERVED | H_ESCAPED | highMask(";?:@&=+$,");
2684 
2685 
2686     // -- Escaping and encoding --
2687 
2688     private final static char[] hexDigits = {
2689         '0', '1', '2', '3', '4', '5', '6', '7',
2690         '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
2691     };
2692 
appendEscape(StringBuffer sb, byte b)2693     private static void appendEscape(StringBuffer sb, byte b) {
2694         sb.append('%');
2695         sb.append(hexDigits[(b >> 4) & 0x0f]);
2696         sb.append(hexDigits[(b >> 0) & 0x0f]);
2697     }
2698 
appendEncoded(StringBuffer sb, char c)2699     private static void appendEncoded(StringBuffer sb, char c) {
2700         ByteBuffer bb = null;
2701         try {
2702             bb = ThreadLocalCoders.encoderFor("UTF-8")
2703                 .encode(CharBuffer.wrap("" + c));
2704         } catch (CharacterCodingException x) {
2705             assert false;
2706         }
2707         while (bb.hasRemaining()) {
2708             int b = bb.get() & 0xff;
2709             if (b >= 0x80)
2710                 appendEscape(sb, (byte)b);
2711             else
2712                 sb.append((char)b);
2713         }
2714     }
2715 
2716     // Quote any characters in s that are not permitted
2717     // by the given mask pair
2718     //
quote(String s, long lowMask, long highMask)2719     private static String quote(String s, long lowMask, long highMask) {
2720         int n = s.length();
2721         StringBuffer sb = null;
2722         boolean allowNonASCII = ((lowMask & L_ESCAPED) != 0);
2723         for (int i = 0; i < s.length(); i++) {
2724             char c = s.charAt(i);
2725             if (c < '\u0080') {
2726                 if (!match(c, lowMask, highMask)) {
2727                     if (sb == null) {
2728                         sb = new StringBuffer();
2729                         sb.append(s.substring(0, i));
2730                     }
2731                     appendEscape(sb, (byte)c);
2732                 } else {
2733                     if (sb != null)
2734                         sb.append(c);
2735                 }
2736             } else if (allowNonASCII
2737                        && (Character.isSpaceChar(c)
2738                            || Character.isISOControl(c))) {
2739                 if (sb == null) {
2740                     sb = new StringBuffer();
2741                     sb.append(s.substring(0, i));
2742                 }
2743                 appendEncoded(sb, c);
2744             } else {
2745                 if (sb != null)
2746                     sb.append(c);
2747             }
2748         }
2749         return (sb == null) ? s : sb.toString();
2750     }
2751 
2752     // Encodes all characters >= \u0080 into escaped, normalized UTF-8 octets,
2753     // assuming that s is otherwise legal
2754     //
encode(String s)2755     private static String encode(String s) {
2756         int n = s.length();
2757         if (n == 0)
2758             return s;
2759 
2760         // First check whether we actually need to encode
2761         for (int i = 0;;) {
2762             if (s.charAt(i) >= '\u0080')
2763                 break;
2764             if (++i >= n)
2765                 return s;
2766         }
2767 
2768         String ns = Normalizer.normalize(s, Normalizer.Form.NFC);
2769         ByteBuffer bb = null;
2770         try {
2771             bb = ThreadLocalCoders.encoderFor("UTF-8")
2772                 .encode(CharBuffer.wrap(ns));
2773         } catch (CharacterCodingException x) {
2774             assert false;
2775         }
2776 
2777         StringBuffer sb = new StringBuffer();
2778         while (bb.hasRemaining()) {
2779             int b = bb.get() & 0xff;
2780             if (b >= 0x80)
2781                 appendEscape(sb, (byte)b);
2782             else
2783                 sb.append((char)b);
2784         }
2785         return sb.toString();
2786     }
2787 
decode(char c)2788     private static int decode(char c) {
2789         if ((c >= '0') && (c <= '9'))
2790             return c - '0';
2791         if ((c >= 'a') && (c <= 'f'))
2792             return c - 'a' + 10;
2793         if ((c >= 'A') && (c <= 'F'))
2794             return c - 'A' + 10;
2795         assert false;
2796         return -1;
2797     }
2798 
decode(char c1, char c2)2799     private static byte decode(char c1, char c2) {
2800         return (byte)(  ((decode(c1) & 0xf) << 4)
2801                       | ((decode(c2) & 0xf) << 0));
2802     }
2803 
2804     // Evaluates all escapes in s, applying UTF-8 decoding if needed.  Assumes
2805     // that escapes are well-formed syntactically, i.e., of the form %XX.  If a
2806     // sequence of escaped octets is not valid UTF-8 then the erroneous octets
2807     // are replaced with '\uFFFD'.
2808     // Exception: any "%" found between "[]" is left alone. It is an IPv6 literal
2809     //            with a scope_id
2810     //
decode(String s)2811     private static String decode(String s) {
2812         if (s == null)
2813             return s;
2814         int n = s.length();
2815         if (n == 0)
2816             return s;
2817         if (s.indexOf('%') < 0)
2818             return s;
2819 
2820         StringBuffer sb = new StringBuffer(n);
2821         ByteBuffer bb = ByteBuffer.allocate(n);
2822         CharBuffer cb = CharBuffer.allocate(n);
2823         CharsetDecoder dec = ThreadLocalCoders.decoderFor("UTF-8")
2824             .onMalformedInput(CodingErrorAction.REPLACE)
2825             .onUnmappableCharacter(CodingErrorAction.REPLACE);
2826 
2827         // This is not horribly efficient, but it will do for now
2828         char c = s.charAt(0);
2829         boolean betweenBrackets = false;
2830 
2831         for (int i = 0; i < n;) {
2832             assert c == s.charAt(i);    // Loop invariant
2833             if (c == '[') {
2834                 betweenBrackets = true;
2835             } else if (betweenBrackets && c == ']') {
2836                 betweenBrackets = false;
2837             }
2838             if (c != '%' || betweenBrackets) {
2839                 sb.append(c);
2840                 if (++i >= n)
2841                     break;
2842                 c = s.charAt(i);
2843                 continue;
2844             }
2845             bb.clear();
2846             int ui = i;
2847             for (;;) {
2848                 assert (n - i >= 2);
2849                 bb.put(decode(s.charAt(++i), s.charAt(++i)));
2850                 if (++i >= n)
2851                     break;
2852                 c = s.charAt(i);
2853                 if (c != '%')
2854                     break;
2855             }
2856             bb.flip();
2857             cb.clear();
2858             dec.reset();
2859             CoderResult cr = dec.decode(bb, cb, true);
2860             assert cr.isUnderflow();
2861             cr = dec.flush(cb);
2862             assert cr.isUnderflow();
2863             sb.append(cb.flip().toString());
2864         }
2865 
2866         return sb.toString();
2867     }
2868 
2869 
2870     // -- Parsing --
2871 
2872     // For convenience we wrap the input URI string in a new instance of the
2873     // following internal class.  This saves always having to pass the input
2874     // string as an argument to each internal scan/parse method.
2875 
2876     private class Parser {
2877 
2878         private String input;           // URI input string
2879         private boolean requireServerAuthority = false;
2880 
Parser(String s)2881         Parser(String s) {
2882             input = s;
2883             string = s;
2884         }
2885 
2886         // -- Methods for throwing URISyntaxException in various ways --
2887 
fail(String reason)2888         private void fail(String reason) throws URISyntaxException {
2889             throw new URISyntaxException(input, reason);
2890         }
2891 
fail(String reason, int p)2892         private void fail(String reason, int p) throws URISyntaxException {
2893             throw new URISyntaxException(input, reason, p);
2894         }
2895 
failExpecting(String expected, int p)2896         private void failExpecting(String expected, int p)
2897             throws URISyntaxException
2898         {
2899             fail("Expected " + expected, p);
2900         }
2901 
failExpecting(String expected, String prior, int p)2902         private void failExpecting(String expected, String prior, int p)
2903             throws URISyntaxException
2904         {
2905             fail("Expected " + expected + " following " + prior, p);
2906         }
2907 
2908 
2909         // -- Simple access to the input string --
2910 
2911         // Return a substring of the input string
2912         //
substring(int start, int end)2913         private String substring(int start, int end) {
2914             return input.substring(start, end);
2915         }
2916 
2917         // Return the char at position p,
2918         // assuming that p < input.length()
2919         //
charAt(int p)2920         private char charAt(int p) {
2921             return input.charAt(p);
2922         }
2923 
2924         // Tells whether start < end and, if so, whether charAt(start) == c
2925         //
at(int start, int end, char c)2926         private boolean at(int start, int end, char c) {
2927             return (start < end) && (charAt(start) == c);
2928         }
2929 
2930         // Tells whether start + s.length() < end and, if so,
2931         // whether the chars at the start position match s exactly
2932         //
at(int start, int end, String s)2933         private boolean at(int start, int end, String s) {
2934             int p = start;
2935             int sn = s.length();
2936             if (sn > end - p)
2937                 return false;
2938             int i = 0;
2939             while (i < sn) {
2940                 if (charAt(p++) != s.charAt(i)) {
2941                     break;
2942                 }
2943                 i++;
2944             }
2945             return (i == sn);
2946         }
2947 
2948 
2949         // -- Scanning --
2950 
2951         // The various scan and parse methods that follow use a uniform
2952         // convention of taking the current start position and end index as
2953         // their first two arguments.  The start is inclusive while the end is
2954         // exclusive, just as in the String class, i.e., a start/end pair
        // denotes the half-open interval [start, end) of the input string.
2956         //
2957         // These methods never proceed past the end position.  They may return
2958         // -1 to indicate outright failure, but more often they simply return
2959         // the position of the first char after the last char scanned.  Thus
2960         // a typical idiom is
2961         //
2962         //     int p = start;
2963         //     int q = scan(p, end, ...);
2964         //     if (q > p)
2965         //         // We scanned something
2966         //         ...;
2967         //     else if (q == p)
2968         //         // We scanned nothing
2969         //         ...;
2970         //     else if (q == -1)
2971         //         // Something went wrong
2972         //         ...;
2973 
2974 
2975         // Scan a specific char: If the char at the given start position is
2976         // equal to c, return the index of the next char; otherwise, return the
2977         // start position.
2978         //
scan(int start, int end, char c)2979         private int scan(int start, int end, char c) {
2980             if ((start < end) && (charAt(start) == c))
2981                 return start + 1;
2982             return start;
2983         }
2984 
2985         // Scan forward from the given start position.  Stop at the first char
2986         // in the err string (in which case -1 is returned), or the first char
2987         // in the stop string (in which case the index of the preceding char is
2988         // returned), or the end of the input string (in which case the length
2989         // of the input string is returned).  May return the start position if
2990         // nothing matches.
2991         //
scan(int start, int end, String err, String stop)2992         private int scan(int start, int end, String err, String stop) {
2993             int p = start;
2994             while (p < end) {
2995                 char c = charAt(p);
2996                 if (err.indexOf(c) >= 0)
2997                     return -1;
2998                 if (stop.indexOf(c) >= 0)
2999                     break;
3000                 p++;
3001             }
3002             return p;
3003         }
3004 
3005         // Scan a potential escape sequence, starting at the given position,
3006         // with the given first char (i.e., charAt(start) == c).
3007         //
3008         // This method assumes that if escapes are allowed then visible
3009         // non-US-ASCII chars are also allowed.
3010         //
scanEscape(int start, int n, char first)3011         private int scanEscape(int start, int n, char first)
3012             throws URISyntaxException
3013         {
3014             int p = start;
3015             char c = first;
3016             if (c == '%') {
3017                 // Process escape pair
3018                 if ((p + 3 <= n)
3019                     && match(charAt(p + 1), L_HEX, H_HEX)
3020                     && match(charAt(p + 2), L_HEX, H_HEX)) {
3021                     return p + 3;
3022                 }
3023                 fail("Malformed escape pair", p);
3024             } else if ((c > 128)
3025                        && !Character.isSpaceChar(c)
3026                        && !Character.isISOControl(c)) {
3027                 // Allow unescaped but visible non-US-ASCII chars
3028                 return p + 1;
3029             }
3030             return p;
3031         }
3032 
3033         // Scan chars that match the given mask pair
3034         //
scan(int start, int n, long lowMask, long highMask)3035         private int scan(int start, int n, long lowMask, long highMask)
3036             throws URISyntaxException
3037         {
3038             int p = start;
3039             while (p < n) {
3040                 char c = charAt(p);
3041                 if (match(c, lowMask, highMask)) {
3042                     p++;
3043                     continue;
3044                 }
3045                 if ((lowMask & L_ESCAPED) != 0) {
3046                     int q = scanEscape(p, n, c);
3047                     if (q > p) {
3048                         p = q;
3049                         continue;
3050                     }
3051                 }
3052                 break;
3053             }
3054             return p;
3055         }
3056 
3057         // Check that each of the chars in [start, end) matches the given mask
3058         //
checkChars(int start, int end, long lowMask, long highMask, String what)3059         private void checkChars(int start, int end,
3060                                 long lowMask, long highMask,
3061                                 String what)
3062             throws URISyntaxException
3063         {
3064             int p = scan(start, end, lowMask, highMask);
3065             if (p < end)
3066                 fail("Illegal character in " + what, p);
3067         }
3068 
3069         // Check that the char at position p matches the given mask
3070         //
checkChar(int p, long lowMask, long highMask, String what)3071         private void checkChar(int p,
3072                                long lowMask, long highMask,
3073                                String what)
3074             throws URISyntaxException
3075         {
3076             checkChars(p, p + 1, lowMask, highMask, what);
3077         }
3078 
3079 
3080         // -- Parsing --
3081 
3082         // [<scheme>:]<scheme-specific-part>[#<fragment>]
3083         //
parse(boolean rsa)3084         void parse(boolean rsa) throws URISyntaxException {
3085             requireServerAuthority = rsa;
3086             int ssp;                    // Start of scheme-specific part
3087             int n = input.length();
3088             int p = scan(0, n, "/?#", ":");
3089             if ((p >= 0) && at(p, n, ':')) {
3090                 if (p == 0)
3091                     failExpecting("scheme name", 0);
3092                 checkChar(0, L_ALPHA, H_ALPHA, "scheme name");
3093                 checkChars(1, p, L_SCHEME, H_SCHEME, "scheme name");
3094                 scheme = substring(0, p);
3095                 p++;                    // Skip ':'
3096                 ssp = p;
3097                 if (at(p, n, '/')) {
3098                     p = parseHierarchical(p, n);
3099                 } else {
3100                     int q = scan(p, n, "", "#");
3101                     if (q <= p)
3102                         failExpecting("scheme-specific part", p);
3103                     checkChars(p, q, L_URIC, H_URIC, "opaque part");
3104                     p = q;
3105                 }
3106             } else {
3107                 ssp = 0;
3108                 p = parseHierarchical(0, n);
3109             }
3110             schemeSpecificPart = substring(ssp, p);
3111             if (at(p, n, '#')) {
3112                 checkChars(p + 1, n, L_URIC, H_URIC, "fragment");
3113                 fragment = substring(p + 1, n);
3114                 p = n;
3115             }
3116             if (p < n)
3117                 fail("end of URI", p);
3118         }
3119 
3120         // [//authority]<path>[?<query>]
3121         //
3122         // DEVIATION from RFC2396: We allow an empty authority component as
3123         // long as it's followed by a non-empty path, query component, or
3124         // fragment component.  This is so that URIs such as "file:///foo/bar"
3125         // will parse.  This seems to be the intent of RFC2396, though the
3126         // grammar does not permit it.  If the authority is empty then the
3127         // userInfo, host, and port components are undefined.
3128         //
3129         // DEVIATION from RFC2396: We allow empty relative paths.  This seems
3130         // to be the intent of RFC2396, but the grammar does not permit it.
3131         // The primary consequence of this deviation is that "#f" parses as a
3132         // relative URI with an empty path.
3133         //
parseHierarchical(int start, int n)3134         private int parseHierarchical(int start, int n)
3135             throws URISyntaxException
3136         {
3137             int p = start;
3138             if (at(p, n, '/') && at(p + 1, n, '/')) {
3139                 p += 2;
3140                 int q = scan(p, n, "", "/?#");
3141                 if (q > p) {
3142                     p = parseAuthority(p, q);
3143                 } else if (q < n) {
3144                     // DEVIATION: Allow empty authority prior to non-empty
3145                     // path, query component or fragment identifier
3146                 } else
3147                     failExpecting("authority", p);
3148             }
3149             int q = scan(p, n, "", "?#"); // DEVIATION: May be empty
3150             checkChars(p, q, L_PATH, H_PATH, "path");
3151             path = substring(p, q);
3152             p = q;
3153             if (at(p, n, '?')) {
3154                 p++;
3155                 q = scan(p, n, "", "#");
3156                 checkChars(p, q, L_URIC, H_URIC, "query");
3157                 query = substring(p, q);
3158                 p = q;
3159             }
3160             return p;
3161         }
3162 
3163         // authority     = server | reg_name
3164         //
3165         // Ambiguity: An authority that is a registry name rather than a server
3166         // might have a prefix that parses as a server.  We use the fact that
3167         // the authority component is always followed by '/' or the end of the
3168         // input string to resolve this: If the complete authority did not
3169         // parse as a server then we try to parse it as a registry name.
3170         //
parseAuthority(int start, int n)3171         private int parseAuthority(int start, int n)
3172             throws URISyntaxException
3173         {
3174             int p = start;
3175             int q = p;
3176             URISyntaxException ex = null;
3177 
3178             boolean serverChars;
3179             boolean regChars;
3180 
3181             if (scan(p, n, "", "]") > p) {
3182                 // contains a literal IPv6 address, therefore % is allowed
3183                 serverChars = (scan(p, n, L_SERVER_PERCENT, H_SERVER_PERCENT) == n);
3184             } else {
3185                 serverChars = (scan(p, n, L_SERVER, H_SERVER) == n);
3186             }
3187             regChars = (scan(p, n, L_REG_NAME, H_REG_NAME) == n);
3188 
3189             if (regChars && !serverChars) {
3190                 // Must be a registry-based authority
3191                 authority = substring(p, n);
3192                 return n;
3193             }
3194 
3195             if (serverChars) {
3196                 // Might be (probably is) a server-based authority, so attempt
3197                 // to parse it as such.  If the attempt fails, try to treat it
3198                 // as a registry-based authority.
3199                 try {
3200                     q = parseServer(p, n);
3201                     if (q < n)
3202                         failExpecting("end of authority", q);
3203                     authority = substring(p, n);
3204                 } catch (URISyntaxException x) {
3205                     // Undo results of failed parse
3206                     userInfo = null;
3207                     host = null;
3208                     port = -1;
3209                     if (requireServerAuthority) {
3210                         // If we're insisting upon a server-based authority,
3211                         // then just re-throw the exception
3212                         throw x;
3213                     } else {
3214                         // Save the exception in case it doesn't parse as a
3215                         // registry either
3216                         ex = x;
3217                         q = p;
3218                     }
3219                 }
3220             }
3221 
3222             if (q < n) {
3223                 if (regChars) {
3224                     // Registry-based authority
3225                     authority = substring(p, n);
3226                 } else if (ex != null) {
3227                     // Re-throw exception; it was probably due to
3228                     // a malformed IPv6 address
3229                     throw ex;
3230                 } else {
3231                     fail("Illegal character in authority", q);
3232                 }
3233             }
3234 
3235             return n;
3236         }
3237 
3238 
3239         // [<userinfo>@]<host>[:<port>]
3240         //
parseServer(int start, int n)3241         private int parseServer(int start, int n)
3242             throws URISyntaxException
3243         {
3244             int p = start;
3245             int q;
3246 
3247             // userinfo
3248             q = scan(p, n, "/?#", "@");
3249             if ((q >= p) && at(q, n, '@')) {
3250                 checkChars(p, q, L_USERINFO, H_USERINFO, "user info");
3251                 userInfo = substring(p, q);
3252                 p = q + 1;              // Skip '@'
3253             }
3254 
3255             // hostname, IPv4 address, or IPv6 address
3256             if (at(p, n, '[')) {
3257                 // DEVIATION from RFC2396: Support IPv6 addresses, per RFC2732
3258                 p++;
3259                 q = scan(p, n, "/?#", "]");
3260                 if ((q > p) && at(q, n, ']')) {
3261                     // look for a "%" scope id
3262                     int r = scan (p, q, "", "%");
3263                     if (r > p) {
3264                         parseIPv6Reference(p, r);
3265                         if (r+1 == q) {
3266                             fail ("scope id expected");
3267                         }
3268                         checkChars (r+1, q, L_ALPHANUM, H_ALPHANUM,
3269                                                 "scope id");
3270                     } else {
3271                         parseIPv6Reference(p, q);
3272                     }
3273                     host = substring(p-1, q+1);
3274                     p = q + 1;
3275                 } else {
3276                     failExpecting("closing bracket for IPv6 address", q);
3277                 }
3278             } else {
3279                 q = parseIPv4Address(p, n);
3280                 if (q <= p)
3281                     q = parseHostname(p, n);
3282                 p = q;
3283             }
3284 
3285             // port
3286             if (at(p, n, ':')) {
3287                 p++;
3288                 q = scan(p, n, "", "/");
3289                 if (q > p) {
3290                     checkChars(p, q, L_DIGIT, H_DIGIT, "port number");
3291                     try {
3292                         port = Integer.parseInt(substring(p, q));
3293                     } catch (NumberFormatException x) {
3294                         fail("Malformed port number", p);
3295                     }
3296                     p = q;
3297                 }
3298             }
3299             if (p < n)
3300                 failExpecting("port number", p);
3301 
3302             return p;
3303         }
3304 
3305         // Scan a string of decimal digits whose value fits in a byte
3306         //
scanByte(int start, int n)3307         private int scanByte(int start, int n)
3308             throws URISyntaxException
3309         {
3310             int p = start;
3311             int q = scan(p, n, L_DIGIT, H_DIGIT);
3312             if (q <= p) return q;
3313             if (Integer.parseInt(substring(p, q)) > 255) return p;
3314             return q;
3315         }
3316 
3317         // Scan an IPv4 address.
3318         //
3319         // If the strict argument is true then we require that the given
3320         // interval contain nothing besides an IPv4 address; if it is false
3321         // then we only require that it start with an IPv4 address.
3322         //
3323         // If the interval does not contain or start with (depending upon the
3324         // strict argument) a legal IPv4 address characters then we return -1
3325         // immediately; otherwise we insist that these characters parse as a
3326         // legal IPv4 address and throw an exception on failure.
3327         //
3328         // We assume that any string of decimal digits and dots must be an IPv4
3329         // address.  It won't parse as a hostname anyway, so making that
3330         // assumption here allows more meaningful exceptions to be thrown.
3331         //
scanIPv4Address(int start, int n, boolean strict)3332         private int scanIPv4Address(int start, int n, boolean strict)
3333             throws URISyntaxException
3334         {
3335             int p = start;
3336             int q;
3337             int m = scan(p, n, L_DIGIT | L_DOT, H_DIGIT | H_DOT);
3338             if ((m <= p) || (strict && (m != n)))
3339                 return -1;
3340             for (;;) {
3341                 // Per RFC2732: At most three digits per byte
3342                 // Further constraint: Each element fits in a byte
3343                 if ((q = scanByte(p, m)) <= p) break;   p = q;
3344                 if ((q = scan(p, m, '.')) <= p) break;  p = q;
3345                 if ((q = scanByte(p, m)) <= p) break;   p = q;
3346                 if ((q = scan(p, m, '.')) <= p) break;  p = q;
3347                 if ((q = scanByte(p, m)) <= p) break;   p = q;
3348                 if ((q = scan(p, m, '.')) <= p) break;  p = q;
3349                 if ((q = scanByte(p, m)) <= p) break;   p = q;
3350                 if (q < m) break;
3351                 return q;
3352             }
3353             fail("Malformed IPv4 address", q);
3354             return -1;
3355         }
3356 
3357         // Take an IPv4 address: Throw an exception if the given interval
3358         // contains anything except an IPv4 address
3359         //
takeIPv4Address(int start, int n, String expected)3360         private int takeIPv4Address(int start, int n, String expected)
3361             throws URISyntaxException
3362         {
3363             int p = scanIPv4Address(start, n, true);
3364             if (p <= start)
3365                 failExpecting(expected, start);
3366             return p;
3367         }
3368 
3369         // Attempt to parse an IPv4 address, returning -1 on failure but
3370         // allowing the given interval to contain [:<characters>] after
3371         // the IPv4 address.
3372         //
parseIPv4Address(int start, int n)3373         private int parseIPv4Address(int start, int n) {
3374             int p;
3375 
3376             try {
3377                 p = scanIPv4Address(start, n, false);
3378             } catch (URISyntaxException x) {
3379                 return -1;
3380             } catch (NumberFormatException nfe) {
3381                 return -1;
3382             }
3383 
3384             if (p > start && p < n) {
3385                 // IPv4 address is followed by something - check that
3386                 // it's a ":" as this is the only valid character to
3387                 // follow an address.
3388                 if (charAt(p) != ':') {
3389                     p = -1;
3390                 }
3391             }
3392 
3393             if (p > start)
3394                 host = substring(start, p);
3395 
3396             return p;
3397         }
3398 
3399         // Android-changed: Allow underscore in hostname.
3400         // Added "_" to the grammars for domainLabel and topLabel.
3401         // hostname      = domainlabel [ "." ] | 1*( domainlabel "." ) toplabel [ "." ]
3402         // domainlabel   = alphanum | alphanum *( alphanum | "-" | "_" ) alphanum
3403         // toplabel      = alpha | alpha *( alphanum | "-" | "_" ) alphanum
3404         //
parseHostname(int start, int n)3405         private int parseHostname(int start, int n)
3406             throws URISyntaxException
3407         {
3408             int p = start;
3409             int q;
3410             int l = -1;                 // Start of last parsed label
3411 
3412             do {
3413                 // Android-changed: Allow underscore in hostname.
3414                 // RFC 2396 only allows alphanumeric characters and hyphens, but real,
3415                 // large Internet hosts in the wild use underscore, so we have to allow it.
3416                 // http://code.google.com/p/android/issues/detail?id=37577
3417                 // http://b/17579865
3418                 // http://b/18016625
3419                 // http://b/18023709
3420 
3421                 // domainlabel = alphanum [ *( alphanum | "-" | "_" ) alphanum ]
3422                 q = scan(p, n, L_ALPHANUM, H_ALPHANUM);
3423                 if (q <= p)
3424                     break;
3425                 l = p;
3426                 if (q > p) {
3427                     p = q;
3428                     // Android-changed: Allow underscore in hostname.
3429                     // q = scan(p, n, L_ALPHANUM | L_DASH, H_ALPHANUM | H_DASH);
3430                     q = scan(p, n, L_ALPHANUM | L_DASH | L_UNDERSCORE, H_ALPHANUM | H_DASH | H_UNDERSCORE);
3431                     if (q > p) {
3432                         if (charAt(q - 1) == '-')
3433                             fail("Illegal character in hostname", q - 1);
3434                         p = q;
3435                     }
3436                 }
3437                 q = scan(p, n, '.');
3438                 if (q <= p)
3439                     break;
3440                 p = q;
3441             } while (p < n);
3442 
3443             if ((p < n) && !at(p, n, ':'))
3444                 fail("Illegal character in hostname", p);
3445 
3446             if (l < 0)
3447                 failExpecting("hostname", start);
3448 
3449             // for a fully qualified hostname check that the rightmost
3450             // label starts with an alpha character.
3451             if (l > start && !match(charAt(l), L_ALPHA, H_ALPHA)) {
3452                 fail("Illegal character in hostname", l);
3453             }
3454 
3455             host = substring(start, p);
3456             return p;
3457         }
3458 
3459 
3460         // IPv6 address parsing, from RFC2373: IPv6 Addressing Architecture
3461         //
3462         // Bug: The grammar in RFC2373 Appendix B does not allow addresses of
3463         // the form ::12.34.56.78, which are clearly shown in the examples
3464         // earlier in the document.  Here is the original grammar:
3465         //
3466         //   IPv6address = hexpart [ ":" IPv4address ]
3467         //   hexpart     = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
3468         //   hexseq      = hex4 *( ":" hex4)
3469         //   hex4        = 1*4HEXDIG
3470         //
3471         // We therefore use the following revised grammar:
3472         //
3473         //   IPv6address = hexseq [ ":" IPv4address ]
3474         //                 | hexseq [ "::" [ hexpost ] ]
3475         //                 | "::" [ hexpost ]
3476         //   hexpost     = hexseq | hexseq ":" IPv4address | IPv4address
3477         //   hexseq      = hex4 *( ":" hex4)
3478         //   hex4        = 1*4HEXDIG
3479         //
3480         // This covers all and only the following cases:
3481         //
3482         //   hexseq
3483         //   hexseq : IPv4address
3484         //   hexseq ::
3485         //   hexseq :: hexseq
3486         //   hexseq :: hexseq : IPv4address
3487         //   hexseq :: IPv4address
3488         //   :: hexseq
3489         //   :: hexseq : IPv4address
3490         //   :: IPv4address
3491         //   ::
3492         //
3493         // Additionally we constrain the IPv6 address as follows :-
3494         //
3495         //  i.  IPv6 addresses without compressed zeros should contain
3496         //      exactly 16 bytes.
3497         //
3498         //  ii. IPv6 addresses with compressed zeros should contain
3499         //      less than 16 bytes.
3500 
3501         private int ipv6byteCount = 0;
3502 
parseIPv6Reference(int start, int n)3503         private int parseIPv6Reference(int start, int n)
3504             throws URISyntaxException
3505         {
3506             int p = start;
3507             int q;
3508             boolean compressedZeros = false;
3509 
3510             q = scanHexSeq(p, n);
3511 
3512             if (q > p) {
3513                 p = q;
3514                 if (at(p, n, "::")) {
3515                     compressedZeros = true;
3516                     p = scanHexPost(p + 2, n);
3517                 } else if (at(p, n, ':')) {
3518                     p = takeIPv4Address(p + 1,  n, "IPv4 address");
3519                     ipv6byteCount += 4;
3520                 }
3521             } else if (at(p, n, "::")) {
3522                 compressedZeros = true;
3523                 p = scanHexPost(p + 2, n);
3524             }
3525             if (p < n)
3526                 fail("Malformed IPv6 address", start);
3527             if (ipv6byteCount > 16)
3528                 fail("IPv6 address too long", start);
3529             if (!compressedZeros && ipv6byteCount < 16)
3530                 fail("IPv6 address too short", start);
3531             if (compressedZeros && ipv6byteCount == 16)
3532                 fail("Malformed IPv6 address", start);
3533 
3534             return p;
3535         }
3536 
scanHexPost(int start, int n)3537         private int scanHexPost(int start, int n)
3538             throws URISyntaxException
3539         {
3540             int p = start;
3541             int q;
3542 
3543             if (p == n)
3544                 return p;
3545 
3546             q = scanHexSeq(p, n);
3547             if (q > p) {
3548                 p = q;
3549                 if (at(p, n, ':')) {
3550                     p++;
3551                     p = takeIPv4Address(p, n, "hex digits or IPv4 address");
3552                     ipv6byteCount += 4;
3553                 }
3554             } else {
3555                 p = takeIPv4Address(p, n, "hex digits or IPv4 address");
3556                 ipv6byteCount += 4;
3557             }
3558             return p;
3559         }
3560 
3561         // Scan a hex sequence; return -1 if one could not be scanned
3562         //
scanHexSeq(int start, int n)3563         private int scanHexSeq(int start, int n)
3564             throws URISyntaxException
3565         {
3566             int p = start;
3567             int q;
3568 
3569             q = scan(p, n, L_HEX, H_HEX);
3570             if (q <= p)
3571                 return -1;
3572             if (at(q, n, '.'))          // Beginning of IPv4 address
3573                 return -1;
3574             if (q > p + 4)
3575                 fail("IPv6 hexadecimal digit sequence too long", p);
3576             ipv6byteCount += 2;
3577             p = q;
3578             while (p < n) {
3579                 if (!at(p, n, ':'))
3580                     break;
3581                 if (at(p + 1, n, ':'))
3582                     break;              // "::"
3583                 p++;
3584                 q = scan(p, n, L_HEX, H_HEX);
3585                 if (q <= p)
3586                     failExpecting("digits for an IPv6 address", p);
3587                 if (at(q, n, '.')) {    // Beginning of IPv4 address
3588                     p--;
3589                     break;
3590                 }
3591                 if (q > p + 4)
3592                     fail("IPv6 hexadecimal digit sequence too long", p);
3593                 ipv6byteCount += 2;
3594                 p = q;
3595             }
3596 
3597             return p;
3598         }
3599 
3600     }
3601 
3602 }
3603