001    /* URI.java -- An URI class
002       Copyright (C) 2002, 2004, 2005, 2006, 2008  Free Software Foundation, Inc.
003    
004    This file is part of GNU Classpath.
005    
006    GNU Classpath is free software; you can redistribute it and/or modify
007    it under the terms of the GNU General Public License as published by
008    the Free Software Foundation; either version 2, or (at your option)
009    any later version.
010    
011    GNU Classpath is distributed in the hope that it will be useful, but
012    WITHOUT ANY WARRANTY; without even the implied warranty of
013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
014    General Public License for more details.
015    
016    You should have received a copy of the GNU General Public License
017    along with GNU Classpath; see the file COPYING.  If not, write to the
018    Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
019    02110-1301 USA.
020    
021    Linking this library statically or dynamically with other modules is
022    making a combined work based on this library.  Thus, the terms and
023    conditions of the GNU General Public License cover the whole
024    combination.
025    
026    As a special exception, the copyright holders of this library give you
027    permission to link this library with independent modules to produce an
028    executable, regardless of the license terms of these independent
029    modules, and to copy and distribute the resulting executable under
030    terms of your choice, provided that you also meet, for each linked
031    independent module, the terms and conditions of the license of that
032    module.  An independent module is a module which is not derived from
033    or based on this library.  If you modify this library, you may extend
034    this exception to your version of the library, but you are not
035    obligated to do so.  If you do not wish to do so, delete this
036    exception statement from your version. */
037    
038    
039    package java.net;
040    
041    import java.io.IOException;
042    import java.io.ObjectInputStream;
043    import java.io.ObjectOutputStream;
044    import java.io.Serializable;
045    import java.util.regex.Matcher;
046    import java.util.regex.Pattern;
047    
048    /**
049     * <p>
050     * A URI instance represents that defined by 
051     * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC3986</a>,
052     * with some deviations.
053     * </p>
054     * <p>
055     * At its highest level, a URI consists of:
056     * </p>
 * <p>
 * <code>[<em>scheme</em><strong>:</strong>]<em>scheme-specific-part</em>
 * [<strong>#</strong><em>fragment</em>]</code>
 * </p>
060     * <p>
061     * where <strong>#</strong> and <strong>:</strong> are literal characters,
062     * and those parts enclosed in square brackets are optional.
063     * </p>
064     * <p>
065     * There are two main types of URI.  An <em>opaque</em> URI is one
066     * which just consists of the above three parts, and is not further
067     * defined.  An example of such a URI would be <em>mailto:</em> URI.
 * In contrast, <em>hierarchical</em> URIs give further definition
 * to the scheme-specific part, so as to represent some part of a
 * hierarchical structure:
071     * </p>
072     * <p>
073     * <code>[<strong>//</strong><em>authority</em>][<em>path</em>]
074     * [<strong>?</strong><em>query</em>]</code>
075     * </p>
076     * <p>
077     * with <strong>/</strong> and <strong>?</strong> being literal characters.
078     * When server-based, the authority section is further subdivided into:
079     * </p>
080     * <p>
081     * <code>[<em>user-info</em><strong>@</strong>]<em>host</em>
082     * [<strong>:</strong><em>port</em>]</code>
083     * </p>
084     * <p>
085     * with <strong>@</strong> and <strong>:</strong> as literal characters.
086     * Authority sections that are not server-based are said to be registry-based.
087     * </p>
088     * <p>
 * Hierarchical URIs can be either relative or absolute.  Absolute URIs
 * always specify a scheme, while relative URIs don't.  Opaque URIs are
 * always absolute.
092     * </p>
093     * <p>
094     * Each part of the URI may have one of three states: undefined, empty
095     * or containing some content.  The former two of these are represented
096     * by <code>null</code> and the empty string in Java, respectively.
097     * The scheme-specific part may never be undefined.  It also follows from
098     * this that the path sub-part may also not be undefined, so as to ensure
099     * the former.
100     * </p>
101     * <h2>Character Escaping and Quoting</h2>
102     * <p>
103     * The characters that can be used within a valid URI are restricted.
104     * There are two main classes of characters which can't be used as is
105     * within the URI:
106     * </p>
107     * <ol>
108     * <li><strong>Characters outside the US-ASCII character set</strong>.
109     * These have to be <strong>escaped</strong> in order to create
110     * an RFC-compliant URI; this means replacing the character with the
111     * appropriate hexadecimal value, preceded by a `%'.</li>
112     * <li><strong>Illegal characters</strong> (e.g. space characters,
113     * control characters) are quoted, which results in them being encoded
114     * in the same way as non-US-ASCII characters.</li>
115     * </ol>
116     * <p>
117     * The set of valid characters differs depending on the section of the URI:
118     * </p>
119     * <ul>
120     * <li><strong>Scheme</strong>: Must be an alphanumeric, `-', `.' or '+'.</li>
121     * <li><strong>Authority</strong>:Composed of the username, host, port, `@'
122     * and `:'.</li>
123     * <li><strong>Username</strong>: Allows unreserved or percent-encoded
124     * characters, sub-delimiters and `:'.</li>
125     * <li><strong>Host</strong>: Allows unreserved or percent-encoded
126     * characters, sub-delimiters and square brackets (`[' and `]') for IPv6
127     * addresses.</li>
128     * <li><strong>Port</strong>: Digits only.</li>
 * <li><strong>Path</strong>: Allows the path characters and `/'.</li>
 * <li><strong>Query</strong>: Allows the path characters, `?' and '/'.</li>
 * <li><strong>Fragment</strong>: Allows the path characters, `?' and '/'.</li>
132     * </ul>
133     * <p>
134     * These definitions reference the following sets of characters:
135     * </p>
136     * <ul>
137     * <li><strong>Unreserved characters</strong>: The alphanumerics plus
138     * `-', `.', `_', and `~'.</li>
139     * <li><strong>Sub-delimiters</strong>: `!', `$', `&', `(', `)', `*',
140     * `+', `,', `;', `=' and the single-quote itself.</li>
141     * <li><strong>Path characters</strong>: Unreserved and percent-encoded
142     * characters and the sub-delimiters along with `@' and `:'.</li>
143     * </ul>
144     * <p>
145     * The constructors and accessor methods allow the use and retrieval of
146     * URI components which contain non-US-ASCII characters directly.
147     * They are only escaped when the <code>toASCIIString()</code> method
148     * is used.  In contrast, illegal characters are always quoted, with the
149     * exception of the return values of the non-raw accessors.
150     * </p>
151     *
152     * @author Ito Kazumitsu (ito.kazumitsu@hitachi-cable.co.jp)
153     * @author Dalibor Topic (robilad@kaffe.org)
154     * @author Michael Koch (konqueror@gmx.de)
155     * @author Andrew John Hughes (gnu_andrew@member.fsf.org)
156     * @since 1.4
157     */
158    public final class URI 
159      implements Comparable<URI>, Serializable
160    {
161      /**
162       * For serialization compatability.
163       */
164      static final long serialVersionUID = -6052424284110960213L;
165    
166      /**
167       * Regular expression for parsing URIs.
168       *
169       * Taken from RFC 2396, Appendix B.
170       * This expression doesn't parse IPv6 addresses.
171       */
172      private static final String URI_REGEXP =
173        "^(([^:/?#]+):)?((//([^/?#]*))?([^?#]*)(\\?([^#]*))?)?(#(.*))?";
174    
175      /**
176       * Regular expression for parsing the authority segment.
177       */
178      private static final String AUTHORITY_REGEXP =
179        "(([^?#]*)@)?([^?#:]*)(:([0-9]*))?";
180    
181      /**
182       * Valid characters (taken from rfc2396/3986)
183       */
184      private static final String RFC2396_DIGIT = "0123456789";
185      private static final String RFC2396_LOWALPHA = "abcdefghijklmnopqrstuvwxyz";
186      private static final String RFC2396_UPALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
187      private static final String RFC2396_ALPHA =
188        RFC2396_LOWALPHA + RFC2396_UPALPHA;
189      private static final String RFC2396_ALPHANUM = RFC2396_DIGIT + RFC2396_ALPHA;
190      private static final String RFC3986_UNRESERVED = RFC2396_ALPHANUM + "-._~";
191      private static final String RFC3986_SUBDELIMS = "!$&'()*+,;=";
192      private static final String RFC3986_REG_NAME =
193        RFC3986_UNRESERVED + RFC3986_SUBDELIMS + "%";
194      private static final String RFC3986_PCHAR = RFC3986_UNRESERVED + 
195        RFC3986_SUBDELIMS + ":@%";
196      private static final String RFC3986_SEGMENT = RFC3986_PCHAR;
197      private static final String RFC3986_PATH_SEGMENTS = RFC3986_SEGMENT + "/";
198      private static final String RFC3986_SSP = RFC3986_PCHAR + "?/";
199      private static final String RFC3986_HOST = RFC3986_REG_NAME + "[]";
200      private static final String RFC3986_USERINFO = RFC3986_REG_NAME + ":";
201    
202      /**
203       * Index of scheme component in parsed URI.
204       */
205      private static final int SCHEME_GROUP = 2;
206    
207      /**
208       * Index of scheme-specific-part in parsed URI.
209       */
210      private static final int SCHEME_SPEC_PART_GROUP = 3;
211    
212      /**
213       * Index of authority component in parsed URI.
214       */
215      private static final int AUTHORITY_GROUP = 5;
216    
217      /**
218       * Index of path component in parsed URI.
219       */
220      private static final int PATH_GROUP = 6;
221    
222      /**
223       * Index of query component in parsed URI.
224       */
225      private static final int QUERY_GROUP = 8;
226    
227      /**
228       * Index of fragment component in parsed URI.
229       */
230      private static final int FRAGMENT_GROUP = 10;
231      
232      /**
233       * Index of userinfo component in parsed authority section.
234       */
235      private static final int AUTHORITY_USERINFO_GROUP = 2;
236    
237      /**
238       * Index of host component in parsed authority section.
239       */
240      private static final int AUTHORITY_HOST_GROUP = 3;
241    
242      /**
243       * Index of port component in parsed authority section.
244       */
245      private static final int AUTHORITY_PORT_GROUP = 5;
246    
247      /**
248       * The compiled version of the URI regular expression.
249       */
250      private static final Pattern URI_PATTERN;
251    
252      /**
253       * The compiled version of the authority regular expression.
254       */
255      private static final Pattern AUTHORITY_PATTERN;
256    
257      /**
258       * The set of valid hexadecimal characters.
259       */
260      private static final String HEX = "0123456789ABCDEF";
261    
262      private transient String scheme;
263      private transient String rawSchemeSpecificPart;
264      private transient String schemeSpecificPart;
265      private transient String rawAuthority;
266      private transient String authority;
267      private transient String rawUserInfo;
268      private transient String userInfo;
269      private transient String rawHost;
270      private transient String host;
271      private transient int port = -1;
272      private transient String rawPath;
273      private transient String path;
274      private transient String rawQuery;
275      private transient String query;
276      private transient String rawFragment;
277      private transient String fragment;
278      private String string;
279    
280      /**
281       * Static initializer to pre-compile the regular expressions.
282       */
283      static
284      {
285        URI_PATTERN = Pattern.compile(URI_REGEXP);
286        AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEXP);
287      }
288    
289      private void readObject(ObjectInputStream is)
290        throws ClassNotFoundException, IOException
291      {
292        this.string = (String) is.readObject();
293        try
294          {
295            parseURI(this.string);
296          }
297        catch (URISyntaxException x)
298          {
299            // Should not happen.
300            throw new RuntimeException(x);
301          }
302      }
303    
304      private void writeObject(ObjectOutputStream os) throws IOException
305      {
306        if (string == null)
307          string = toString(); 
308        os.writeObject(string);
309      }
310    
311      /**
312       * <p>
313       * Returns the string content of the specified group of the supplied
314       * matcher.  The returned value is modified according to the following:
315       * </p>
316       * <ul>
317       * <li>If the resulting string has a length greater than 0, then
318       * that string is returned.</li>
319       * <li>If a string of zero length, is matched, then the content
320       * of the preceding group is considered.  If this is also an empty
321       * string, then <code>null</code> is returned to indicate an undefined
322       * value.  Otherwise, the value is truly the empty string and this is
323       * the returned value.</li>
324       * </ul>
325       * <p>
326       * This method is used for matching against all parts of the URI
327       * that may be either undefined or empty (i.e. all those but the
328       * scheme-specific part and the path).  In each case, the preceding
329       * group is the content of the original group, along with some
330       * additional distinguishing feature.  For example, the preceding
331       * group for the query includes the preceding question mark,
332       * while that of the fragment includes the hash symbol.  The presence
333       * of these features enables disambiguation between the two cases
334       * of a completely unspecified value and a simple non-existant value.
335       * The scheme differs in that it will never return an empty string;
336       * the delimiter follows the scheme rather than preceding it, so
337       * it becomes part of the following section.  The same is true
338       * of the user information.
339       * </p>
340       *
341       * @param match the matcher, which contains the results of the URI
342       *              matched against the URI regular expression.
343       * @return either the matched content, <code>null</code> for undefined
344       *         values, or an empty string for a URI part with empty content.
345       */
346      private static String getURIGroup(Matcher match, int group)
347      {
348        String matched = match.group(group);
349        if (matched == null || matched.length() == 0)
350          {
351            String prevMatched = match.group(group -1);
352            if (prevMatched == null || prevMatched.length() == 0)
353              return null;
354            else
355              return "";
356          }
357        return matched;
358      }
359    
360      /**
361       * Sets fields of this URI by parsing the given string.
362       *
363       * @param str The string to parse
364       *
365       * @exception URISyntaxException If the given string violates RFC 2396
366       */
367      private void parseURI(String str) throws URISyntaxException
368      {
369        Matcher matcher = URI_PATTERN.matcher(str);
370        
371        if (matcher.matches())
372          {
373            scheme = getURIGroup(matcher, SCHEME_GROUP);
374            rawSchemeSpecificPart = matcher.group(SCHEME_SPEC_PART_GROUP);
375            schemeSpecificPart = unquote(rawSchemeSpecificPart);
376            if (!isOpaque())
377              {
378                rawAuthority = getURIGroup(matcher, AUTHORITY_GROUP);
379                rawPath = matcher.group(PATH_GROUP);
380                rawQuery = getURIGroup(matcher, QUERY_GROUP);
381              }
382            rawFragment = getURIGroup(matcher, FRAGMENT_GROUP);
383          }
384        else
385          throw new URISyntaxException(str,
386                                       "doesn't match URI regular expression");
387        parseServerAuthority();
388    
389        // We must eagerly unquote the parts, because this is the only time
390        // we may throw an exception.
391        authority = unquote(rawAuthority);
392        userInfo = unquote(rawUserInfo);
393        host = unquote(rawHost);
394        path = unquote(rawPath);
395        query = unquote(rawQuery);
396        fragment = unquote(rawFragment);
397      }
398    
399      /**
400       * Unquote "%" + hex quotes characters
401       *
402       * @param str The string to unquote or null.
403       *
404       * @return The unquoted string or null if str was null.
405       *
406       * @exception URISyntaxException If the given string contains invalid
407       * escape sequences.
408       */
409      private static String unquote(String str) throws URISyntaxException
410      {
411        if (str == null)
412          return null;
413        byte[] buf = new byte[str.length()];
414        int pos = 0;
415        for (int i = 0; i < str.length(); i++)
416          {
417            char c = str.charAt(i);
418            if (c == '%')
419              {
420                if (i + 2 >= str.length())
421                  throw new URISyntaxException(str, "Invalid quoted character");
422                int hi = Character.digit(str.charAt(++i), 16);
423                int lo = Character.digit(str.charAt(++i), 16);
424                if (lo < 0 || hi < 0)
425                  throw new URISyntaxException(str, "Invalid quoted character");
426                buf[pos++] = (byte) (hi * 16 + lo);
427              }
428            else
429              buf[pos++] = (byte) c;
430          }
431        try
432          {
433            return new String(buf, 0, pos, "utf-8");
434          }
435        catch (java.io.UnsupportedEncodingException x2)
436          {
437            throw (Error) new InternalError().initCause(x2);
438          }
439      }
440    
441      /**
442       * Quote characters illegal in URIs in given string.
443       *
444       * Replace illegal characters by encoding their UTF-8
445       * representation as "%" + hex code for each resulting
446       * UTF-8 character.
447       *
448       * @param str The string to quote
449       *
450       * @return The quoted string.
451       */
452      private static String quote(String str)
453      {
454        return quote(str, RFC3986_SSP);
455      }
456    
457      /**
458       * Quote characters illegal in URI authorities in given string.
459       *
460       * Replace illegal characters by encoding their UTF-8
461       * representation as "%" + hex code for each resulting
462       * UTF-8 character.
463       *
464       * @param str The string to quote
465       *
466       * @return The quoted string.
467       */
468      private static String quoteAuthority(String str)
469      {
470        // Technically, we should be using RFC2396_AUTHORITY, but
471        // it contains no additional characters.
472        return quote(str, RFC3986_REG_NAME);
473      }
474    
475      /**
476       * Quotes the characters in the supplied string that are not part of
477       * the specified set of legal characters.
478       *
479       * @param str the string to quote
480       * @param legalCharacters the set of legal characters
481       *
482       * @return the quoted string.
483       */
484      private static String quote(String str, String legalCharacters)
485      {
486        StringBuffer sb = new StringBuffer(str.length());
487        for (int i = 0; i < str.length(); i++)
488          {
489            char c = str.charAt(i);
490            if ((legalCharacters.indexOf(c) == -1)
491                && (c <= 127))
492              {
493                sb.append('%');
494                sb.append(HEX.charAt(c / 16));
495                sb.append(HEX.charAt(c % 16));
496              }
497            else
498              sb.append(c);
499          }
500        return sb.toString();
501      }
502    
503      /**
504       * Quote characters illegal in URI hosts in given string.
505       *
506       * Replace illegal characters by encoding their UTF-8
507       * representation as "%" + hex code for each resulting
508       * UTF-8 character.
509       *
510       * @param str The string to quote
511       *
512       * @return The quoted string.
513       */
514      private static String quoteHost(String str)
515      {
516        return quote(str, RFC3986_HOST);
517      }
518    
519      /**
520       * Quote characters illegal in URI paths in given string.
521       *
522       * Replace illegal characters by encoding their UTF-8
523       * representation as "%" + hex code for each resulting
524       * UTF-8 character.
525       *
526       * @param str The string to quote
527       *
528       * @return The quoted string.
529       */
530      private static String quotePath(String str)
531      {
532        // Technically, we should be using RFC2396_PATH, but
533        // it contains no additional characters.
534        return quote(str, RFC3986_PATH_SEGMENTS);
535      }
536    
537      /**
538       * Quote characters illegal in URI user infos in given string.
539       *
540       * Replace illegal characters by encoding their UTF-8
541       * representation as "%" + hex code for each resulting
542       * UTF-8 character.
543       *
544       * @param str The string to quote
545       *
546       * @return The quoted string.
547       */
548      private static String quoteUserInfo(String str)
549      {
550        return quote(str, RFC3986_USERINFO);
551      }
552    
553      /**
554       * Creates an URI from the given string
555       *
556       * @param str The string to create the URI from
557       *
558       * @exception URISyntaxException If the given string violates RFC 2396
559       * @exception NullPointerException If str is null
560       */
561      public URI(String str) throws URISyntaxException
562      {
563        this.string = str;
564        parseURI(str);
565      }
566    
567      /**
568       * Create an URI from the given components
569       *
570       * @param scheme The scheme name
571       * @param userInfo The username and authorization info
572       * @param host The hostname
573       * @param port The port number
574       * @param path The path
575       * @param query The query
576       * @param fragment The fragment
577       *
578       * @exception URISyntaxException If the given string violates RFC 2396
579       */
580      public URI(String scheme, String userInfo, String host, int port,
581                 String path, String query, String fragment)
582        throws URISyntaxException
583      {
584        this((scheme == null ? "" : scheme + ":")
585             + (userInfo == null && host == null && port == -1 ? "" : "//")
586             + (userInfo == null ? "" : quoteUserInfo(userInfo) + "@")
587             + (host == null ? "" : quoteHost(host))
588             + (port == -1 ? "" : ":" + String.valueOf(port))
589             + (path == null ? "" : quotePath(path))
590             + (query == null ? "" : "?" + quote(query))
591             + (fragment == null ? "" : "#" + quote(fragment)));
592      }
593    
594      /**
595       * Create an URI from the given components
596       *
597       * @param scheme The scheme name
598       * @param authority The authority
599       * @param path The apth
600       * @param query The query
601       * @param fragment The fragment
602       *
603       * @exception URISyntaxException If the given string violates RFC 2396
604       */
605      public URI(String scheme, String authority, String path, String query,
606                 String fragment) throws URISyntaxException
607      {
608        this((scheme == null ? "" : scheme + ":")
609             + (authority == null ? "" : "//" + quoteAuthority(authority))
610             + (path == null ? "" : quotePath(path))
611             + (query == null ? "" : "?" + quote(query))
612             + (fragment == null ? "" : "#" + quote(fragment)));
613      }
614    
615      /**
616       * Create an URI from the given components
617       *
618       * @param scheme The scheme name
619       * @param host The hostname
620       * @param path The path
621       * @param fragment The fragment
622       *
623       * @exception URISyntaxException If the given string violates RFC 2396
624       */
625      public URI(String scheme, String host, String path, String fragment)
626        throws URISyntaxException
627      {
628        this(scheme, null, host, -1, path, null, fragment);
629      }
630    
631      /**
632       * Create an URI from the given components
633       *
634       * @param scheme The scheme name
635       * @param ssp The scheme specific part
636       * @param fragment The fragment
637       *
638       * @exception URISyntaxException If the given string violates RFC 2396
639       */
640      public URI(String scheme, String ssp, String fragment)
641        throws URISyntaxException
642      {
643        this((scheme == null ? "" : scheme + ":")
644             + (ssp == null ? "" : quote(ssp))
645             + (fragment == null ? "" : "#" + quote(fragment)));
646      }
647    
648      /**
649       * Create an URI from the given string
650       *
651       * @param str The string to create the URI from
652       *
653       * @exception IllegalArgumentException If the given string violates RFC 2396
654       * @exception NullPointerException If str is null
655       */
656      public static URI create(String str)
657      {
658        try
659          {
660            return new URI(str);
661          }
662        catch (URISyntaxException e)
663          {
664            throw (IllegalArgumentException) new IllegalArgumentException()
665                  .initCause(e);
666          }
667      }
668    
669      /**
670       * Attempts to parse this URI's authority component, if defined,
671       * into user-information, host, and port components.  The purpose
672       * of this method was to disambiguate between some authority sections,
673       * which form invalid server-based authories, but valid registry
674       * based authorities.  In the updated RFC 3986, the authority section
675       * is defined differently, with registry-based authorities part of
676       * the host section.  Thus, this method is now simply an explicit
677       * way of parsing any authority section.
678       *
679       * @return the URI, with the authority section parsed into user
680       *         information, host and port components.
681       * @throws URISyntaxException if the given string violates RFC 2396
682       */
683      public URI parseServerAuthority() throws URISyntaxException
684      {
685        if (rawAuthority != null)
686          {
687            Matcher matcher = AUTHORITY_PATTERN.matcher(rawAuthority);
688    
689            if (matcher.matches())
690              {
691                rawUserInfo = getURIGroup(matcher, AUTHORITY_USERINFO_GROUP);
692                rawHost = getURIGroup(matcher, AUTHORITY_HOST_GROUP);
693                
694                String portStr = getURIGroup(matcher, AUTHORITY_PORT_GROUP);
695                
696                if (portStr != null && ! portStr.isEmpty())
697                  try
698                    {
699                      port = Integer.parseInt(portStr);
700                    }
701                  catch (NumberFormatException e)
702                    {
703                      URISyntaxException use =
704                        new URISyntaxException
705                          (string, "doesn't match URI regular expression");
706                      use.initCause(e);
707                      throw use;
708                    }
709              }
710            else
711              throw new URISyntaxException(string,
712                                           "doesn't match URI regular expression");
713          }
714        return this;
715      }
716    
717      /**
718       * <p>
719       * Returns a normalized version of the URI.  If the URI is opaque,
720       * or its path is already in normal form, then this URI is simply
721       * returned.  Otherwise, the following transformation of the path
722       * element takes place:
723       * </p>
724       * <ol>
725       * <li>All `.' segments are removed.</li>
726       * <li>Each `..' segment which can be paired with a prior non-`..' segment
727       * is removed along with the preceding segment.</li>
728       * <li>A `.' segment is added to the front if the first segment contains
729       * a colon (`:').  This is a deviation from the RFC, which prevents
730       * confusion between the path and the scheme.</li>
731       * </ol>
732       * <p>
733       * The resulting URI will be free of `.' and `..' segments, barring those
734       * that were prepended or which couldn't be paired, respectively.
735       * </p>
736       *
737       * @return the normalized URI.
738       */
739      public URI normalize()
740      {
741        if (isOpaque() || path.indexOf("/./") == -1 && path.indexOf("/../") == -1)
742          return this;
743        try
744          {
745            return new URI(scheme, authority, normalizePath(path), query,
746                           fragment);
747          }
748        catch (URISyntaxException e)
749          {
750            throw (Error) new InternalError("Normalized URI variant could not "+
751                                            "be constructed").initCause(e);
752          }
753      }
754    
755      /**
756       * <p>
757       * Normalize the given path.  The following transformation takes place:
758       * </p>
759       * <ol>
760       * <li>All `.' segments are removed.</li>
761       * <li>Each `..' segment which can be paired with a prior non-`..' segment
762       * is removed along with the preceding segment.</li>
763       * <li>A `.' segment is added to the front if the first segment contains
764       * a colon (`:').  This is a deviation from the RFC, which prevents
765       * confusion between the path and the scheme.</li>
766       * </ol>
767       * <p>
768       * The resulting URI will be free of `.' and `..' segments, barring those
769       * that were prepended or which couldn't be paired, respectively.
770       * </p>
771       * 
772       * @param relativePath the relative path to be normalized.
773       * @return the normalized path.
774       */
775      private String normalizePath(String relativePath)
776      {
777        /* 
778           This follows the algorithm in section 5.2.4. of RFC3986,
779           but doesn't modify the input buffer.
780        */
781        StringBuffer input = new StringBuffer(relativePath);
782        StringBuffer output = new StringBuffer();
783        int start = 0;
784        while (start < input.length())
785          {
786            /* A */
787            if (input.indexOf("../",start) == start)
788              {
789                start += 3;
790                continue;
791              }
792            if (input.indexOf("./",start) == start)
793              {
794                start += 2;
795                continue;
796              }
797            /* B */
798            if (input.indexOf("/./",start) == start)
799              {
800                start += 2;
801                continue;
802              }
803            if (input.indexOf("/.",start) == start
804                && input.charAt(start + 2) != '.')
805              {
806                start += 1;
807                input.setCharAt(start,'/');
808                continue;
809              }
810            /* C */
811            if (input.indexOf("/../",start) == start)
812              {
813                start += 3;
814                removeLastSegment(output);
815                continue;
816              }
817            if (input.indexOf("/..",start) == start)
818              {
819                start += 2;
820                input.setCharAt(start,'/');
821                removeLastSegment(output);
822                continue;
823              }
824            /* D */
825            if (start == input.length() - 1 && input.indexOf(".",start) == start)
826              {
827                input.delete(0,1);
828                continue;
829              }
830            if (start == input.length() - 2 && input.indexOf("..",start) == start)
831              {
832                input.delete(0,2);
833                continue;
834              }
835            /* E */
836            int indexOfSlash = input.indexOf("/",start);
837            while (indexOfSlash == start)
838              {
839                output.append("/");
840                ++start;
841                indexOfSlash = input.indexOf("/",start);
842              }
843            if (indexOfSlash == -1)
844              indexOfSlash = input.length();
845            output.append(input.substring(start, indexOfSlash));
846            start = indexOfSlash;
847          }
848        return output.toString();
849      }
850    
851      /**
852       * Removes the last segment of the path from the specified buffer.
853       *
854       * @param buffer the buffer containing the path.
855       */
856      private void removeLastSegment(StringBuffer buffer)
857      {
858        int lastSlash = buffer.lastIndexOf("/");
859        if (lastSlash == -1)
860          buffer.setLength(0);
861        else
862          buffer.setLength(lastSlash);
863      }
864    
865      /**
866       * Resolves the given URI against this URI
867       *
868       * @param uri The URI to resolve against this URI
869       *
   * @return The resulting URI; the given URI itself is returned when it
   * is already absolute.
872       *
873       * @throws NullPointerException if uri is null
874       */
875      public URI resolve(URI uri)
876      {
877        if (uri.isAbsolute())
878          return uri;
879        if (uri.isOpaque())
880          return uri;
881    
882        String scheme = uri.getScheme();
883        String schemeSpecificPart = uri.getSchemeSpecificPart();
884        String authority = uri.getAuthority();
885        String path = uri.getPath();
886        String query = uri.getQuery();
887        String fragment = uri.getFragment();
888    
889        try
890          {
891            if (fragment != null && path != null && path.equals("")
892                && scheme == null && authority == null && query == null)
893              return new URI(this.scheme, this.schemeSpecificPart, fragment);
894    
895            if (authority == null)
896              {
897                authority = this.authority;
898                if (path == null)
899                  path = "";
900                if (! (path.startsWith("/")))
901                  {
902                    StringBuffer basepath = new StringBuffer(this.path);
903                    int i = this.path.lastIndexOf('/');
904    
905                    if (i >= 0)
906                      basepath.delete(i + 1, basepath.length());
907    
908                    basepath.append(path);
909                    path = normalizePath(basepath.toString());
910                  }
911              }
912            return new URI(this.scheme, authority, path, query, fragment);
913          }
914        catch (URISyntaxException e)
915          {
916            throw (Error) new InternalError("Resolved URI variant could not "+
917                                            "be constructed").initCause(e);
918          }
919      }
920    
921      /**
922       * Resolves the given URI string against this URI
923       *
924       * @param str The URI as string to resolve against this URI
925       *
926       * @return The resulting URI
927       *
928       * @throws IllegalArgumentException If the given URI string
929       * violates RFC 2396
   * @throws NullPointerException If str is null
931       */
932      public URI resolve(String str) throws IllegalArgumentException
933      {
934        return resolve(create(str));
935      }
936    
937      /**
938       * <p>
939       * Relativizes the given URI against this URI.  The following
940       * algorithm is used:
941       * </p>
942       * <ul>
943       * <li>If either URI is opaque, the given URI is returned.</li>
944       * <li>If the schemes of the URIs differ, the given URI is returned.</li>
945       * <li>If the authority components of the URIs differ, then the given
946       * URI is returned.</li>
947       * <li>If the path of this URI is not a prefix of the supplied URI,
948       * then the given URI is returned.</li>
949       * <li>If all the above conditions hold, a new URI is created using the
950       * query and fragment components of the given URI, along with a path
951       * computed by removing the path of this URI from the start of the path
952       * of the supplied URI.</li>
953       * </ul>
954       *
   * @param uri the URI to relativize against this URI
956       * @return the resulting URI
957       * @throws NullPointerException if the uri is null
958       */
959      public URI relativize(URI uri)
960      {
961        if (isOpaque() || uri.isOpaque())
962          return uri;
963        if (scheme == null && uri.getScheme() != null)
964          return uri;
965        if (scheme != null && !(scheme.equals(uri.getScheme())))
966          return uri;
967        if (rawAuthority == null && uri.getRawAuthority() != null)
968          return uri;
969        if (rawAuthority != null && !(rawAuthority.equals(uri.getRawAuthority())))
970          return uri;
971        String basePath = rawPath;
972        if (!(uri.getRawPath().equals(rawPath)))
973          {
974            if (!(basePath.endsWith("/")))
975              basePath = basePath.concat("/");
976            if (!(uri.getRawPath().startsWith(basePath)))
977              return uri;
978          }
979        try
980          {
981            return new URI(null, null, 
982                           uri.getRawPath().substring(basePath.length()),
983                           uri.getRawQuery(), uri.getRawFragment());
984          }
985        catch (URISyntaxException e)
986          {
987            throw (Error) new InternalError("Relativized URI variant could not "+
988                                            "be constructed").initCause(e);       
989          }
990      }
991    
992      /**
993       * Creates an URL from an URI
994       *
995       * @throws MalformedURLException If a protocol handler for the URL could
996       * not be found, or if some other error occurred while constructing the URL
997       * @throws IllegalArgumentException If the URI is not absolute
998       */
999      public URL toURL() throws IllegalArgumentException, MalformedURLException
1000      {
1001        if (isAbsolute())
1002          return new URL(this.toString());
1003    
1004        throw new IllegalArgumentException("not absolute");
1005      }
1006    
1007      /**
1008       * Returns the scheme of the URI
1009       */
1010      public String getScheme()
1011      {
1012        return scheme;
1013      }
1014    
1015      /**
1016       * Tells whether this URI is absolute or not
1017       */
1018      public boolean isAbsolute()
1019      {
1020        return scheme != null;
1021      }
1022    
1023      /**
1024       * Tell whether this URI is opaque or not
1025       */
1026      public boolean isOpaque()
1027      {
1028        return ((scheme != null) && ! (schemeSpecificPart.startsWith("/")));
1029      }
1030    
1031      /**
1032       * Returns the raw scheme specific part of this URI.
1033       * The scheme-specific part is never undefined, though it may be empty
1034       */
1035      public String getRawSchemeSpecificPart()
1036      {
1037        return rawSchemeSpecificPart;
1038      }
1039    
1040      /**
1041       * Returns the decoded scheme specific part of this URI.
1042       */
1043      public String getSchemeSpecificPart()
1044      {
1045        return schemeSpecificPart;
1046      }
1047    
1048      /**
1049       * Returns the raw authority part of this URI
1050       */
1051      public String getRawAuthority()
1052      {
1053        return rawAuthority;
1054      }
1055    
1056      /**
1057       * Returns the decoded authority part of this URI
1058       */
1059      public String getAuthority()
1060      {
1061        return authority;
1062      }
1063    
1064      /**
1065       * Returns the raw user info part of this URI
1066       */
1067      public String getRawUserInfo()
1068      {
1069        return rawUserInfo;
1070      }
1071    
1072      /**
1073       * Returns the decoded user info part of this URI
1074       */
1075      public String getUserInfo()
1076      {
1077        return userInfo;
1078      }
1079    
1080      /**
1081       * Returns the hostname of the URI
1082       */
1083      public String getHost()
1084      {
1085        return host;
1086      }
1087    
1088      /**
1089       * Returns the port number of the URI
1090       */
1091      public int getPort()
1092      {
1093        return port;
1094      }
1095    
1096      /**
1097       * Returns the raw path part of this URI
1098       */
1099      public String getRawPath()
1100      {
1101        return rawPath;
1102      }
1103    
1104      /**
1105       * Returns the path of the URI
1106       */
1107      public String getPath()
1108      {
1109        return path;
1110      }
1111    
1112      /**
1113       * Returns the raw query part of this URI
1114       */
1115      public String getRawQuery()
1116      {
1117        return rawQuery;
1118      }
1119    
1120      /**
1121       * Returns the query of the URI
1122       */
1123      public String getQuery()
1124      {
1125        return query;
1126      }
1127    
1128      /**
1129       * Return the raw fragment part of this URI
1130       */
1131      public String getRawFragment()
1132      {
1133        return rawFragment;
1134      }
1135    
1136      /**
1137       * Returns the fragment of the URI
1138       */
1139      public String getFragment()
1140      {
1141        return fragment;
1142      }
1143    
1144      /**
1145       * <p> 
1146       * Compares the URI with the given object for equality.  If the
1147       * object is not a <code>URI</code>, then the method returns false.
1148       * Otherwise, the following criteria are observed:
1149       * </p>
1150       * <ul>
1151       * <li>The scheme of the URIs must either be null (undefined) in both cases,
1152       * or equal, ignorant of case.</li>
1153       * <li>The raw fragment of the URIs must either be null (undefined) in both
1154       * cases, or equal, ignorant of case.</li>
   * <li>Both URIs must be of the same type (opaque or hierarchical)</li>
1156       * <li><strong>For opaque URIs:</strong></li>
1157       * <ul>
1158       * <li>The raw scheme-specific parts must be equal.</li>
1159       * </ul>
1160       * <li>For hierarchical URIs:</li>
1161       * <ul>
1162       * <li>The raw paths must be equal, ignorant of case.</li>
1163       * <li>The raw queries are either both undefined or both equal, ignorant
1164       * of case.</li>
1165       * <li>The raw authority sections are either both undefined or:</li>
1166       * <li><strong>For registry-based authorities:</strong></li>
1167       * <ul><li>they are equal.</li></ul>
1168       * <li><strong>For server-based authorities:</strong></li>
1169       * <ul>
1170       * <li>the hosts are equal, ignoring case</li>
1171       * <li>the ports are equal</li>
1172       * <li>the user information components are equal</li>
1173       * </ul>
1174       * </ul>
1175       * </ul>
1176       *
1177       * @param obj the obj to compare the URI with.
1178       * @return <code>true</code> if the objects are equal, according to
1179       *         the specification above.
1180       */
1181      public boolean equals(Object obj)
1182      {
1183        if (!(obj instanceof URI))
1184          return false;
1185        URI uriObj = (URI) obj;
1186        if (scheme == null)
1187          {
1188            if (uriObj.getScheme() != null)
1189              return false;
1190          }
1191        else
1192          if (!(scheme.equalsIgnoreCase(uriObj.getScheme())))
1193            return false;
1194        if (rawFragment == null)
1195          {
1196            if (uriObj.getRawFragment() != null)
1197              return false;
1198          }
1199        else
1200          if (!(rawFragment.equalsIgnoreCase(uriObj.getRawFragment())))
1201            return false;
1202        boolean opaqueThis = isOpaque();
1203        boolean opaqueObj = uriObj.isOpaque();
1204        if (opaqueThis && opaqueObj)
1205          return rawSchemeSpecificPart.equals(uriObj.getRawSchemeSpecificPart());
1206        else if (!opaqueThis && !opaqueObj)
1207          {
1208            boolean common = rawPath.equalsIgnoreCase(uriObj.getRawPath())
1209              && ((rawQuery == null && uriObj.getRawQuery() == null)
1210                  || rawQuery.equalsIgnoreCase(uriObj.getRawQuery()));
1211            if (rawAuthority == null && uriObj.getRawAuthority() == null)
1212              return common;
1213            if (host == null)
1214              return common 
1215                && rawAuthority.equalsIgnoreCase(uriObj.getRawAuthority());
1216            return common 
1217              && host.equalsIgnoreCase(uriObj.getHost())
1218              && port == uriObj.getPort()
1219              && (rawUserInfo == null ?
1220                  uriObj.getRawUserInfo() == null :
1221                  rawUserInfo.equalsIgnoreCase(uriObj.getRawUserInfo()));
1222          }
1223        else
1224          return false;
1225      }
1226    
1227      /**
1228       * Computes the hashcode of the URI
1229       */
1230      public int hashCode()
1231      {
1232        return (getScheme() == null ? 0 : 13 * getScheme().hashCode())
1233          + 17 * getRawSchemeSpecificPart().hashCode()
1234          + (getRawFragment() == null ? 0 : 21 + getRawFragment().hashCode());
1235      }
1236    
1237      /**
   * <p>
   * Compares this URI with another URI.
   * Undefined components are taken to be less than any other component.
   * The following criteria are observed:
   * </p>
1242       * <ul>
1243       * <li>Two URIs with different schemes are compared according to their
1244       * scheme, regardless of case.</li>
1245       * <li>A hierarchical URI is less than an opaque URI with the same
1246       * scheme.</li>
1247       * <li><strong>For opaque URIs:</strong></li>
1248       * <ul>
1249       * <li>URIs with differing scheme-specific parts are ordered according
1250       * to the ordering of the scheme-specific part.</li>
1251       * <li>URIs with the same scheme-specific part are ordered by the
1252       * raw fragment.</li>
1253       * </ul>
1254       * <li>For hierarchical URIs:</li>
1255       * <ul>
1256       * <li>URIs are ordered according to their raw authority sections,
1257       * if they are unequal.</li>
1258       * <li><strong>For registry-based authorities:</strong></li>
1259       * <ul><li>they are ordered according to the ordering of the authority
1260       * component.</li></ul>
1261       * <li><strong>For server-based authorities:</strong></li>
1262       * <ul>
1263       * <li>URIs are ordered according to the raw user information.</li>
1264       * <li>URIs with the same user information are ordered by the host,
1265       * ignoring case.</li>
   * <li>URIs with the same host are ordered by the port.</li>
1267       * </ul>
1268       * <li>URIs with the same authority section are ordered by the raw path.</li>
1269       * <li>URIs with the same path are ordered by their raw query.</li>
1270       * <li>URIs with the same query are ordered by their raw fragments.</li>
1271       * </ul>
1272       * </ul>
1273       *
1274       * @param uri The other URI to compare this URI with
1275       * @return a negative integer, zero or a positive integer depending
1276       *         on whether this URI is less than, equal to or greater
1277       *         than that supplied, respectively.
1278       */
1279      public int compareTo(URI uri) 
1280        throws ClassCastException
1281      {
1282        if (scheme == null && uri.getScheme() != null)
1283          return -1;
1284        if (scheme != null)
1285          {
1286            int sCompare = scheme.compareToIgnoreCase(uri.getScheme()); 
1287            if (sCompare != 0)
1288              return sCompare;
1289          }
1290        boolean opaqueThis = isOpaque();
1291        boolean opaqueObj = uri.isOpaque();
1292        if (opaqueThis && !opaqueObj)
1293          return 1;
1294        if (!opaqueThis && opaqueObj)
1295          return -1;
1296        if (opaqueThis)
1297          {
1298            int ssCompare = 
1299              rawSchemeSpecificPart.compareTo(uri.getRawSchemeSpecificPart());
1300            if (ssCompare == 0)
1301              return compareFragments(uri);
1302            else
1303              return ssCompare;
1304          }
1305        if (rawAuthority == null && uri.getRawAuthority() != null)
1306          return -1;
1307        if (rawAuthority != null)
1308          {
1309            int aCompare = rawAuthority.compareTo(uri.getRawAuthority());
1310            if (aCompare != 0)
1311              {
1312                if (host == null)
1313                  return aCompare;
1314                if (rawUserInfo == null && uri.getRawUserInfo() != null)
1315                  return -1;
1316                int uCompare = rawUserInfo.compareTo(uri.getRawUserInfo());
1317                if (uCompare != 0)
1318                  return uCompare;
1319                if (host == null && uri.getHost() != null)
1320                  return -1;
1321                int hCompare = host.compareTo(uri.getHost());
1322                if (hCompare != 0)
1323                  return hCompare;
1324                return new Integer(port).compareTo(new Integer(uri.getPort()));
1325              }
1326          }
1327        if (rawPath == null && uri.getRawPath() != null)
1328          return -1;
1329        if (rawPath != null)
1330          {
1331            int pCompare = rawPath.compareTo(uri.getRawPath()); 
1332            if (pCompare != 0)
1333              return pCompare;
1334          }
1335        if (rawQuery == null && uri.getRawQuery() != null)
1336          return -1;
1337        if (rawQuery != null)
1338          {
1339            int qCompare = rawQuery.compareTo(uri.getRawQuery());
1340            if (qCompare != 0)
1341              return qCompare;
1342          }
1343        return compareFragments(uri);
1344      }
1345    
1346      /**
1347       * Compares the fragment of this URI with that of the supplied URI.
1348       *
1349       * @param uri the URI to compare with this one.
1350       * @return a negative integer, zero or a positive integer depending
1351       *         on whether this uri's fragment is less than, equal to
1352       *         or greater than the fragment of the uri supplied, respectively.
1353       */
1354      private int compareFragments(URI uri)
1355      {
1356        if (rawFragment == null && uri.getRawFragment() != null)
1357          return -1;
1358        else if (rawFragment == null)
1359          return 0;
1360        else
1361          return rawFragment.compareTo(uri.getRawFragment());
1362      }
1363    
1364      /**
1365       * Returns the URI as a String.  If the URI was created using a constructor,
1366       * then this will be the same as the original input string.
1367       *
1368       * @return a string representation of the URI.
1369       */
1370      public String toString()
1371      {
1372        return (scheme == null ? "" : scheme + ":")
1373          + rawSchemeSpecificPart
1374          + (rawFragment == null ? "" : "#" + rawFragment);
1375      }
1376    
1377      /**
1378       * Returns the URI as US-ASCII string.  This is the same as the result
1379       * from <code>toString()</code> for URIs that don't contain any non-US-ASCII
1380       * characters.  Otherwise, the non-US-ASCII characters are replaced
1381       * by their percent-encoded representations.
1382       *
1383       * @return a string representation of the URI, containing only US-ASCII
1384       *         characters.
1385       */
1386      public String toASCIIString()
1387      {
1388        String strRep = toString();
1389        boolean inNonAsciiBlock = false;
1390        StringBuffer buffer = new StringBuffer();
1391        StringBuffer encBuffer = null;
1392        for (int i = 0; i < strRep.length(); i++)
1393          {
1394            char c = strRep.charAt(i);
1395            if (c <= 127)
1396              {
1397                if (inNonAsciiBlock)
1398                  {
1399                    buffer.append(escapeCharacters(encBuffer.toString()));
1400                    inNonAsciiBlock = false;
1401                  }
1402                buffer.append(c);
1403              }
1404            else
1405              {
1406                if (!inNonAsciiBlock)
1407                  {
1408                    encBuffer = new StringBuffer();
1409                    inNonAsciiBlock = true;
1410                  }
1411                encBuffer.append(c);
1412              }
1413          }
1414        return buffer.toString();
1415      }
1416    
1417      /**
1418       * Converts the non-ASCII characters in the supplied string
1419       * to their equivalent percent-encoded representations.
1420       * That is, they are replaced by "%" followed by their hexadecimal value.
1421       *
1422       * @param str a string including non-ASCII characters.
1423       * @return the string with the non-ASCII characters converted to their
1424       *         percent-encoded representations.
1425       */
1426      private static String escapeCharacters(String str)
1427      {
1428        try
1429          {
1430            StringBuffer sb = new StringBuffer(); 
1431            // this is far from optimal, but it works
1432            byte[] utf8 = str.getBytes("utf-8");
1433            for (int j = 0; j < utf8.length; j++)
1434              {
1435                sb.append('%');
1436                sb.append(HEX.charAt((utf8[j] & 0xff) / 16));
1437                sb.append(HEX.charAt((utf8[j] & 0xff) % 16));
1438              }
1439            return sb.toString();
1440          }
1441        catch (java.io.UnsupportedEncodingException x)
1442          {
1443            throw (Error) new InternalError("Escaping error").initCause(x);
1444          }
1445      }
1446    
1447    }