KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > net > LaxURI


1 /* LaxURI
2 *
3 * $Id: LaxURI.java,v 1.6.4.1 2007/01/13 01:31:38 stack-sf Exp $
4 *
5 * Created on Aug 3, 2005
6 *
7 * Copyright (C) 2005 Internet Archive.
8 *
9 * This file is part of the Heritrix web crawler (crawler.archive.org).
10 *
11 * Heritrix is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser Public License as published by
13 * the Free Software Foundation; either version 2.1 of the License, or
14 * any later version.
15 *
16 * Heritrix is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU Lesser Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser Public License
22 * along with Heritrix; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */

25 package org.archive.net;
26
import java.util.Arrays;
import java.util.BitSet;

import org.apache.commons.httpclient.URI;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.util.EncodingUtil;
33
34 /**
35  * URI subclass which allows partial/inconsistent encoding, matching
36  * the URIs which will be relayed in requests from popular web
37  * browsers (esp. Mozilla Firefox and MS IE).
38  *
39  * @author gojomo
40  */

41 public class LaxURI extends URI {
42
43     private static final long serialVersionUID = 5273922211722239537L;
44     
45     final protected static char[] HTTP_SCHEME = {'h','t','t','p'};
46     final protected static char[] HTTPS_SCHEME = {'h','t','t','p','s'};
47     
48     protected static final BitSet JavaDoc lax_rel_segment = new BitSet JavaDoc(256);
49     // Static initializer for lax_rel_segment
50
static {
51         lax_rel_segment.or(rel_segment);
52         lax_rel_segment.set(':'); // allow ':'
53
// TODO: add additional allowances as need is demonstrated
54
}
55
56     protected static final BitSet JavaDoc lax_abs_path = new BitSet JavaDoc(256);
57     static {
58         lax_abs_path.or(abs_path);
59         lax_abs_path.set('|'); // tests indicate Firefox (1.0.6) doesn't escape.
60
}
61     
62     protected static final BitSet JavaDoc lax_query = new BitSet JavaDoc(256);
63     static {
64         lax_query.or(query);
65         lax_query.set('{'); // tests indicate FF doesn't escape { in query
66
lax_query.set('}'); // tests indicate FF doesn't escape } in query
67
lax_query.set('|'); // tests indicate FF doesn't escape | in query
68
lax_query.set('['); // tests indicate FF doesn't escape [ in query
69
lax_query.set(']'); // tests indicate FF doesn't escape ] in query
70
lax_query.set('^'); // tests indicate FF doesn't escape ^ in query
71
}
72     
73     // passthrough initializers
74
public LaxURI(String JavaDoc uri, boolean escaped, String JavaDoc charset)
75     throws URIException {
76         super(uri,escaped,charset);
77     }
78     public LaxURI(URI base, URI relative) throws URIException {
79         super(base,relative);
80     }
81     public LaxURI(String JavaDoc uri, boolean escaped) throws URIException {
82         super(uri,escaped);
83     }
84     public LaxURI() {
85         super();
86     }
87
88     // overridden to use this class's static decode()
89
public String JavaDoc getURI() throws URIException {
90         return (_uri == null) ? null : decode(_uri, getProtocolCharset());
91     }
92     
93     // overridden to use this class's static decode()
94
public String JavaDoc getPath() throws URIException {
95         char[] p = getRawPath();
96         return (p == null) ? null : decode(p, getProtocolCharset());
97     }
98
99     // overridden to use this class's static decode()
100
public String JavaDoc getPathQuery() throws URIException {
101         char[] rawPathQuery = getRawPathQuery();
102         return (rawPathQuery == null) ? null : decode(rawPathQuery,
103                 getProtocolCharset());
104     }
105     // overridden to use this class's static decode()
106
protected static String JavaDoc decode(char[] component, String JavaDoc charset)
107             throws URIException {
108         if (component == null) {
109             throw new IllegalArgumentException JavaDoc(
110                     "Component array of chars may not be null");
111         }
112         return decode(new String JavaDoc(component), charset);
113     }
114
115     // overridden to use IA's LaxURLCodec, which never throws DecoderException
116
protected static String JavaDoc decode(String JavaDoc component, String JavaDoc charset)
117             throws URIException {
118         if (component == null) {
119             throw new IllegalArgumentException JavaDoc(
120                     "Component array of chars may not be null");
121         }
122         byte[] rawdata = null;
123         // try {
124
rawdata = LaxURLCodec.decodeUrlLoose(EncodingUtil
125                 .getAsciiBytes(component));
126         // } catch (DecoderException e) {
127
// throw new URIException(e.getMessage());
128
// }
129
return EncodingUtil.getString(rawdata, charset);
130     }
131     
132     // overidden to lax() the acceptable-char BitSet passed in
133
protected boolean validate(char[] component, BitSet JavaDoc generous) {
134         return super.validate(component, lax(generous));
135     }
136
137     // overidden to lax() the acceptable-char BitSet passed in
138
protected boolean validate(char[] component, int soffset, int eoffset,
139             BitSet JavaDoc generous) {
140         return super.validate(component, soffset, eoffset, lax(generous));
141     }
142     
143     /**
144      * Given a BitSet -- typically one of the URI superclass's
145      * predefined static variables -- possibly replace it with
146      * a more-lax version to better match the character sets
147      * actually left unencoded in web browser requests
148      *
149      * @param generous original BitSet
150      * @return (possibly more lax) BitSet to use
151      */

152     protected BitSet JavaDoc lax(BitSet JavaDoc generous) {
153         if (generous == rel_segment) {
154             // Swap in more lax allowable set
155
return lax_rel_segment;
156         }
157         if (generous == abs_path) {
158             return lax_abs_path;
159         }
160         if (generous == query) {
161             return lax_query;
162         }
163         // otherwise, leave as is
164
return generous;
165     }
166     
167     /**
168      * Coalesce the _host and _authority fields where
169      * possible.
170      *
171      * In the web crawl/http domain, most URIs have an
172      * identical _host and _authority. (There is no port
173      * or user info.) However, the superclass always
174      * creates two separate char[] instances.
175      *
176      * Notably, the lengths of these char[] fields are
177      * equal if and only if their values are identical.
178      * This method makes use of this fact to reduce the
179      * two instances to one where possible, slimming
180      * instances.
181      *
182      * @see org.apache.commons.httpclient.URI#parseAuthority(java.lang.String, boolean)
183      */

184     protected void parseAuthority(String JavaDoc original, boolean escaped)
185             throws URIException {
186         super.parseAuthority(original, escaped);
187         if (_host != null && _authority != null
188                 && _host.length == _authority.length) {
189             _host = _authority;
190         }
191     }
192     
193     
194     /**
195      * Coalesce _scheme to existing instances, where appropriate.
196      *
197      * In the web-crawl domain, most _schemes are 'http' or 'https',
198      * but the superclass always creates a new char[] instance. For
199      * these two cases, we replace the created instance with a
200      * long-lived instance from a static field, saving 12-14 bytes
201      * per instance.
202      *
203      * @see org.apache.commons.httpclient.URI#setURI()
204      */

205     protected void setURI() {
206         if (_scheme != null) {
207             if (_scheme.length == 4 && Arrays.equals(_scheme, HTTP_SCHEME)) {
208                 _scheme = HTTP_SCHEME;
209             } else if (_scheme.length == 5
210                     && Arrays.equals(_scheme, HTTP_SCHEME)) {
211                 _scheme = HTTPS_SCHEME;
212             }
213         }
214         super.setURI();
215     }
216     
217     /**
218      * IA OVERRIDDEN IN LaxURI TO INCLUDE FIX FOR
219      * http://issues.apache.org/jira/browse/HTTPCLIENT-588
220      *
221      * In order to avoid any possilbity of conflict with non-ASCII characters,
222      * Parse a URI reference as a <code>String</code> with the character
223      * encoding of the local system or the document.
224      * <p>
225      * The following line is the regular expression for breaking-down a URI
226      * reference into its components.
227      * <p><blockquote><pre>
228      * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
229      * 12 3 4 5 6 7 8 9
230      * </pre></blockquote><p>
231      * For example, matching the above expression to
232      * http://jakarta.apache.org/ietf/uri/#Related
233      * results in the following subexpression matches:
234      * <p><blockquote><pre>
235      * $1 = http:
236      * scheme = $2 = http
237      * $3 = //jakarta.apache.org
238      * authority = $4 = jakarta.apache.org
239      * path = $5 = /ietf/uri/
240      * $6 = <undefined>
241      * query = $7 = <undefined>
242      * $8 = #Related
243      * fragment = $9 = Related
244      * </pre></blockquote><p>
245      *
246      * @param original the original character sequence
247      * @param escaped <code>true</code> if <code>original</code> is escaped
248      * @throws URIException If an error occurs.
249      */

250     protected void parseUriReference(String JavaDoc original, boolean escaped)
251         throws URIException {
252
253         // validate and contruct the URI character sequence
254
if (original == null) {
255             throw new URIException("URI-Reference required");
256         }
257
258         /* @
259          * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
260          */

261         String JavaDoc tmp = original.trim();
262         
263         /*
264          * The length of the string sequence of characters.
265          * It may not be equal to the length of the byte array.
266          */

267         int length = tmp.length();
268
269         /*
270          * Remove the delimiters like angle brackets around an URI.
271          */

272         if (length > 0) {
273             char[] firstDelimiter = { tmp.charAt(0) };
274             if (validate(firstDelimiter, delims)) {
275                 if (length >= 2) {
276                     char[] lastDelimiter = { tmp.charAt(length - 1) };
277                     if (validate(lastDelimiter, delims)) {
278                         tmp = tmp.substring(1, length - 1);
279                         length = length - 2;
280                     }
281                 }
282             }
283         }
284
285         /*
286          * The starting index
287          */

288         int from = 0;
289
290         /*
291          * The test flag whether the URI is started from the path component.
292          */

293         boolean isStartedFromPath = false;
294         int atColon = tmp.indexOf(':');
295         int atSlash = tmp.indexOf('/');
296         if ((atColon <= 0 && !tmp.startsWith("//"))
297             || (atSlash >= 0 && atSlash < atColon)) {
298             isStartedFromPath = true;
299         }
300
301         /*
302          * <p><blockquote><pre>
303          * @@@@@@@@
304          * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
305          * </pre></blockquote><p>
306          */

307         int at = indexFirstOf(tmp, isStartedFromPath ? "/?#" : ":/?#", from);
308         if (at == -1) {
309             at = 0;
310         }
311
312         /*
313          * Parse the scheme.
314          * <p><blockquote><pre>
315          * scheme = $2 = http
316          * @
317          * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
318          * </pre></blockquote><p>
319          */

320         if (at > 0 && at < length && tmp.charAt(at) == ':') {
321             char[] target = tmp.substring(0, at).toLowerCase().toCharArray();
322             if (validate(target, scheme)) {
323                 _scheme = target;
324             } else {
325                 throw new URIException("incorrect scheme");
326             }
327             from = ++at;
328         }
329
330         /*
331          * Parse the authority component.
332          * <p><blockquote><pre>
333          * authority = $4 = jakarta.apache.org
334          * @@
335          * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
336          * </pre></blockquote><p>
337          */

338         // Reset flags
339
_is_net_path = _is_abs_path = _is_rel_path = _is_hier_part = false;
340         if (0 <= at && at < length && tmp.charAt(at) == '/') {
341             // Set flag
342
_is_hier_part = true;
343             if (at + 2 < length && tmp.charAt(at + 1) == '/'
344                 && !isStartedFromPath) {
345                 // the temporary index to start the search from
346
int next = indexFirstOf(tmp, "/?#", at + 2);
347                 if (next == -1) {
348                     next = (tmp.substring(at + 2).length() == 0) ? at + 2
349                         : tmp.length();
350                 }
351                 parseAuthority(tmp.substring(at + 2, next), escaped);
352                 from = at = next;
353                 // Set flag
354
_is_net_path = true;
355             }
356             if (from == at) {
357                 // Set flag
358
_is_abs_path = true;
359             }
360         }
361
362         /*
363          * Parse the path component.
364          * <p><blockquote><pre>
365          * path = $5 = /ietf/uri/
366          * @@@@@@
367          * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
368          * </pre></blockquote><p>
369          */

370         if (from < length) {
371             // rel_path = rel_segment [ abs_path ]
372
int next = indexFirstOf(tmp, "?#", from);
373             if (next == -1) {
374                 next = tmp.length();
375             }
376             if (!_is_abs_path) {
377                 if (!escaped
378                     && prevalidate(tmp.substring(from, next), disallowed_rel_path)
379                     || escaped
380                     && validate(tmp.substring(from, next).toCharArray(), rel_path)) {
381                     // Set flag
382
_is_rel_path = true;
383                 } else if (!escaped
384                     && prevalidate(tmp.substring(from, next), disallowed_opaque_part)
385                     || escaped
386                     && validate(tmp.substring(from, next).toCharArray(), opaque_part)) {
387                     // Set flag
388
_is_opaque_part = true;
389                 } else {
390                     // the path component may be empty
391
_path = null;
392                 }
393             }
394             String JavaDoc s = tmp.substring(from, next);
395             if (escaped) {
396                 setRawPath(s.toCharArray());
397             } else {
398                 setPath(s);
399             }
400             at = next;
401         }
402
403         // set the charset to do escape encoding
404
String JavaDoc charset = getProtocolCharset();
405
406         /*
407          * Parse the query component.
408          * <p><blockquote><pre>
409          * query = $7 = <undefined>
410          * @@@@@@@@@
411          * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
412          * </pre></blockquote><p>
413          */

414         if (0 <= at && at + 1 < length && tmp.charAt(at) == '?') {
415             int next = tmp.indexOf('#', at + 1);
416             if (next == -1) {
417                 next = tmp.length();
418             }
419             if (escaped) {
420                 _query = tmp.substring(at + 1, next).toCharArray();
421                 if (!validate(_query, query)) {
422                     throw new URIException("Invalid query");
423                 }
424             } else {
425                 _query = encode(tmp.substring(at + 1, next), allowed_query, charset);
426             }
427             at = next;
428         }
429
430         /*
431          * Parse the fragment component.
432          * <p><blockquote><pre>
433          * fragment = $9 = Related
434          * @@@@@@@@
435          * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
436          * </pre></blockquote><p>
437          */

438         if (0 <= at && at + 1 <= length && tmp.charAt(at) == '#') {
439             if (at + 1 == length) { // empty fragment
440
_fragment = "".toCharArray();
441             } else {
442                 _fragment = (escaped) ? tmp.substring(at + 1).toCharArray()
443                     : encode(tmp.substring(at + 1), allowed_fragment, charset);
444             }
445         }
446
447         // set this URI.
448
setURI();
449     }
450     
451 }
452
Popular Tags