KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > net > UURI


1 /* UURI
2  *
3  * $Id: UURI.java,v 1.8.2.1 2007/01/13 01:31:38 stack-sf Exp $
4  *
5  * Created on Apr 18, 2003
6  *
7  * Copyright (C) 2003 Internet Archive.
8  *
9  * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24  */

25 package org.archive.net;
26
27 import java.io.File JavaDoc;
28 import java.io.Serializable JavaDoc;
29 import java.net.URI JavaDoc;
30 import java.net.URISyntaxException JavaDoc;
31 import java.util.logging.Level JavaDoc;
32 import java.util.logging.Logger JavaDoc;
33
34 import org.apache.commons.httpclient.URIException;
35 import org.archive.crawler.datamodel.CandidateURI;
36 import org.archive.util.SURT;
37 import org.archive.util.TextUtils;
38
39
40 /**
41  * Usable URI.
42  *
43  * This class wraps {@link org.apache.commons.httpclient.URI} adding caching
44  * and methods. It cannot be instantiated directly. Go via UURIFactory.
45  *
46  * <p>We used to use {@link java.net.URI} for parsing URIs but ran across
47  * quirky behaviors and bugs. {@link java.net.URI} is not subclassable --
48  * its final -- and its unlikely that java.net.URI will change any time soon
49  * (See Gordon's considered petition here:
50  * <a HREF="http://developer.java.sun.com/developer/bugParade/bugs/4939847.html">java.net.URI
51  * should have loose/tolerant/compatibility option (or allow reuse)</a>).
52  *
53  * <p>This class tries to cache calculated strings such as the extracted host
54  * and this class as a string rather than have the parent class rerun its
55  * calculation everytime.
56  *
57  * @author gojomo
58  * @author stack
59  *
60  * @see org.apache.commons.httpclient.URI
61  */

62 public class UURI extends LaxURI
63 implements CharSequence JavaDoc, Serializable JavaDoc {
64
65     private static final long serialVersionUID = -1277570889914647093L;
66
67     private static Logger JavaDoc LOGGER =
68         Logger.getLogger(UURI.class.getName());
69     
70     /**
71      * Consider URIs too long for IE as illegal.
72      */

73     public final static int MAX_URL_LENGTH = 2083;
74     
75     public static final String JavaDoc MASSAGEHOST_PATTERN = "^www\\d*\\.";
76
77     /**
78      * Cache of the host name.
79      *
80      * Super class calculates on every call. Profiling shows us spend 30% of
81      * total elapsed time in URI class.
82      */

83     private transient String JavaDoc cachedHost = null;
84
85     /**
86      * Cache of this uuri escaped as a string.
87      *
88      * Super class calculates on every call. Profiling shows us spend 30% of
89      * total elapsed time in URI class.
90      */

91     private transient String JavaDoc cachedEscapedURI = null;
92
93     /**
94      * Cache of this uuri escaped as a string.
95      *
96      * Super class calculates on every call. Profiling shows us spend 30% of
97      * total elapsed time in URI class.
98      */

99     private transient String JavaDoc cachedString = null;
100     
101     /**
102      * Cached authority minus userinfo.
103      */

104     private transient String JavaDoc cachedAuthorityMinusUserinfo = null;
105
106     /**
107      * Cache of this uuri in SURT format
108      */

109     private transient String JavaDoc surtForm = null;
110     
111     // Technically, underscores are disallowed in the domainlabel
112
// portion of hostname according to rfc2396 but we'll be more
113
// loose and allow them. See: [ 1072035 ] [uuri] Underscore in
114
// host messes up port parsing.
115
static {
116         hostname.set('_');
117     }
118
119
120     /**
121      * Shutdown access to default constructor.
122      */

123     protected UURI() {
124         super();
125     }
126     
127     /**
128      * @param uri String representation of an absolute URI.
129      * @param escaped If escaped.
130      * @param charset Charset to use.
131      * @throws org.apache.commons.httpclient.URIException
132      */

133     protected UURI(String JavaDoc uri, boolean escaped, String JavaDoc charset)
134     throws URIException {
135         super(uri, escaped, charset);
136         normalize();
137     }
138     
139     /**
140      * @param relative String representation of URI.
141      * @param base Parent UURI to use derelativizing.
142      * @throws org.apache.commons.httpclient.URIException
143      */

144     protected UURI(UURI base, UURI relative) throws URIException {
145         super(base, relative);
146         normalize();
147     }
148
149     /**
150      * @param uri String representation of a URI.
151      * @param escaped If escaped.
152      * @throws NullPointerException
153      * @throws URIException
154      */

155     public UURI(String JavaDoc uri, boolean escaped) throws URIException, NullPointerException JavaDoc {
156         super(uri,escaped);
157         normalize();
158     }
159
160     /**
161      * @param uri URI as string that is resolved relative to this UURI.
162      * @return UURI that uses this UURI as base.
163      * @throws URIException
164      */

165     public UURI resolve(String JavaDoc uri)
166     throws URIException {
167         return resolve(uri, false, // assume not escaped
168
this.getProtocolCharset());
169     }
170
171     /**
172      * @param uri URI as string that is resolved relative to this UURI.
173      * @param e True if escaped.
174      * @return UURI that uses this UURI as base.
175      * @throws URIException
176      */

177     public UURI resolve(String JavaDoc uri, boolean e)
178     throws URIException {
179         return resolve(uri, e, this.getProtocolCharset());
180     }
181     
182     /**
183      * @param uri URI as string that is resolved relative to this UURI.
184      * @param e True if uri is escaped.
185      * @param charset Charset to use.
186      * @return UURI that uses this UURI as base.
187      * @throws URIException
188      */

189     public UURI resolve(String JavaDoc uri, boolean e, String JavaDoc charset)
190     throws URIException {
191         return new UURI(this, new UURI(uri, e, charset));
192     }
193
194     /**
195      * Test an object if this UURI is equal to another.
196      *
197      * @param obj an object to compare
198      * @return true if two URI objects are equal
199      */

200     public boolean equals(Object JavaDoc obj) {
201
202         // normalize and test each components
203
if (obj == this) {
204             return true;
205         }
206         if (!(obj instanceof UURI)) {
207             return false;
208         }
209         UURI another = (UURI) obj;
210         // scheme
211
if (!equals(this._scheme, another._scheme)) {
212             return false;
213         }
214         // is_opaque_part or is_hier_part? and opaque
215
if (!equals(this._opaque, another._opaque)) {
216             return false;
217         }
218         // is_hier_part
219
// has_authority
220
if (!equals(this._authority, another._authority)) {
221             return false;
222         }
223         // path
224
if (!equals(this._path, another._path)) {
225             return false;
226         }
227         // has_query
228
if (!equals(this._query, another._query)) {
229             return false;
230         }
231         // UURIs do not have fragments
232
return true;
233     }
234
235     /**
236      * Strips www variants from the host.
237      *
238      * Strips www[0-9]*\. from the host. If calling getHostBaseName becomes a
239      * performance issue we should consider adding the hostBasename member that
240      * is set on initialization.
241      *
242      * @return Host's basename.
243      * @throws URIException
244      */

245     public String JavaDoc getHostBasename() throws URIException {
246         // caching eliminated because this is rarely used
247
// (only benefits legacy DomainScope, which should
248
// be retired). Saves 4-byte object pointer in UURI
249
// instances.
250
return (this.getReferencedHost() == null)
251             ? null
252             : TextUtils.replaceFirst(MASSAGEHOST_PATTERN,
253                     this.getReferencedHost(), UURIFactory.EMPTY_STRING);
254     }
255
256     /**
257      * Override to cache result
258      *
259      * @return String representation of this URI
260      */

261     public synchronized String JavaDoc toString() {
262         if (this.cachedString == null) {
263             this.cachedString = super.toString();
264             coalesceUriStrings();
265         }
266         return this.cachedString;
267     }
268
269     public synchronized String JavaDoc getEscapedURI() {
270         if (this.cachedEscapedURI == null) {
271             this.cachedEscapedURI = super.getEscapedURI();
272             coalesceUriStrings();
273         }
274         return this.cachedEscapedURI;
275     }
276
277     /**
278      * The two String fields cachedString and cachedEscapedURI are
279      * usually identical; if so, coalesce into a single instance.
280      */

281     protected void coalesceUriStrings() {
282         if (this.cachedString != null && this.cachedEscapedURI != null
283                 && this.cachedString.length() == this.cachedEscapedURI.length()) {
284             // lengths will only be identical if contents are identical
285
// (deescaping will always shrink length), so coalesce to
286
// use only single cached instance
287
this.cachedString = this.cachedEscapedURI;
288         }
289     }
290     
291     public synchronized String JavaDoc getHost() throws URIException {
292         if (this.cachedHost == null) {
293             // If this._host is null, 3.0 httpclient throws
294
// illegalargumentexception. Don't go there.
295
if (this._host != null) {
296                 this.cachedHost = super.getHost();
297                 coalesceHostAuthorityStrings();
298             }
299         }
300         return this.cachedHost;
301     }
302     
303     /**
304      * The two String fields cachedHost and cachedAuthorityMinusUserInfo are
305      * usually identical; if so, coalesce into a single instance.
306      */

307     protected void coalesceHostAuthorityStrings() {
308         if (this.cachedAuthorityMinusUserinfo != null
309                 && this.cachedHost != null
310                 && this.cachedHost.length() ==
311                     this.cachedAuthorityMinusUserinfo.length()) {
312             // lengths can only be identical if contents
313
// are identical; use only one instance
314
this.cachedAuthorityMinusUserinfo = this.cachedHost;
315         }
316     }
317
318     /**
319      * Return the referenced host in the UURI, if any, also extracting the
320      * host of a DNS-lookup URI where necessary.
321      *
322      * @return the target or topic host of the URI
323      * @throws URIException
324      */

325     public String JavaDoc getReferencedHost() throws URIException {
326         String JavaDoc referencedHost = this.getHost();
327         if(referencedHost==null && this.getScheme().equals("dns")) {
328             // extract target domain of DNS lookup
329
String JavaDoc possibleHost = this.getCurrentHierPath();
330             if(possibleHost != null && possibleHost.matches("[-_\\w\\.:]+")) {
331                 referencedHost = possibleHost;
332             }
333         }
334         return referencedHost;
335     }
336
337     /**
338      * @return Return the 'SURT' format of this UURI
339      */

340     public String JavaDoc getSurtForm() {
341         if (surtForm == null) {
342             surtForm = SURT.fromURI(this.toString());
343         }
344         return surtForm;
345     }
346     
347     /**
348      * Return the authority minus userinfo (if any).
349      *
350      * If no userinfo present, just returns the authority.
351      *
352      * @return The authority stripped of any userinfo if present.
353      * @throws URIException
354      */

355     public String JavaDoc getAuthorityMinusUserinfo()
356     throws URIException {
357         if (this.cachedAuthorityMinusUserinfo == null) {
358             String JavaDoc tmp = getAuthority();
359             if (tmp != null && tmp.length() > 0) {
360                 int index = tmp.indexOf('@');
361                 if (index >= 0 && index < tmp.length()) {
362                     tmp = tmp.substring(index + 1);
363                 }
364             }
365             this.cachedAuthorityMinusUserinfo = tmp;
366             coalesceHostAuthorityStrings();
367         }
368         return this.cachedAuthorityMinusUserinfo;
369     }
370
371     /* (non-Javadoc)
372      * @see java.lang.CharSequence#length()
373      */

374     public int length() {
375         return getEscapedURI().length();
376     }
377
378     /* (non-Javadoc)
379      * @see java.lang.CharSequence#charAt(int)
380      */

381     public char charAt(int index) {
382         return getEscapedURI().charAt(index);
383     }
384
385     /* (non-Javadoc)
386      * @see java.lang.CharSequence#subSequence(int, int)
387      */

388     public CharSequence JavaDoc subSequence(int start, int end) {
389         return getEscapedURI().subSequence(start,end);
390     }
391
392     /* (non-Javadoc)
393      * @see java.lang.Comparable#compareTo(java.lang.Object)
394      */

395     public int compareTo(Object JavaDoc arg0) {
396         return getEscapedURI().compareTo(arg0.toString());
397     }
398     
399     /**
400      * Convenience method for finding the UURI inside an
401      * Object likely to have (or be/imply) one.
402      *
403      * @param o Object that is, has, or implies a UURI
404      * @return the UURI found, or null if none
405      */

406     public static UURI from(Object JavaDoc o) {
407         UURI u = null;
408         if (o instanceof UURI) {
409             u = (UURI)o;
410         } else if (o instanceof CandidateURI) {
411             u = ((CandidateURI) o).getUURI();
412         } else if (o instanceof CharSequence JavaDoc) {
413             String JavaDoc s = o.toString();
414             try {
415                 u = UURIFactory.getInstance(s);
416             } catch (URIException e) {
417                 LOGGER.log(Level.FINE,"bad URI",e);
418             }
419         }
420         return u;
421     }
422     
423     /**
424      * Test if passed String has likely URI scheme prefix.
425      * @param possibleUrl URL string to examine.
426      * @return True if passed string looks like it could be an URL.
427      */

428     public static boolean hasScheme(String JavaDoc possibleUrl) {
429         boolean result = false;
430         for (int i = 0; i < possibleUrl.length(); i++) {
431             char c = possibleUrl.charAt(i);
432             if (c == ':') {
433                 if (i != 0) {
434                     result = true;
435                 }
436                 break;
437             }
438             if (!scheme.get(c)) {
439                 break;
440             }
441         }
442         return result;
443     }
444     
445     /**
446      * @param pathOrUri A file path or a URI.
447      * @return Path parsed from passed <code>pathOrUri</code>.
448      * @throws URISyntaxException
449      */

450     public static String JavaDoc parseFilename(final String JavaDoc pathOrUri)
451     throws URISyntaxException JavaDoc {
452         String JavaDoc path = pathOrUri;
453         if (UURI.hasScheme(pathOrUri)) {
454             URI JavaDoc url = new URI JavaDoc(pathOrUri);
455             path = url.getPath();
456         }
457         return (new File JavaDoc(path)).getName();
458     }
459 }
460
Popular Tags