KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > net > LaxURLCodec


/* IAURLCodec
*
* $Id: LaxURLCodec.java,v 1.5 2006/07/18 00:40:16 gojomo Exp $
*
* Created on Jul 21, 2005
*
* Copyright (C) 2005 Internet Archive.
*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Heritrix is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* any later version.
*
* Heritrix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License
* along with Heritrix; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/

package org.archive.net;

import java.io.ByteArrayOutputStream;
import java.io.UnsupportedEncodingException;
import java.util.BitSet;

import org.apache.commons.codec.net.URLCodec;
32
33 /**
34  * @author gojomo
35  */

36 public class LaxURLCodec extends URLCodec {
37     public static LaxURLCodec DEFAULT = new LaxURLCodec("UTF-8");
38
39     // passthrough constructor
40
public LaxURLCodec(String JavaDoc encoding) {
41         super(encoding);
42     }
43
44     /**
45      * Decodes an array of URL safe 7-bit characters into an array of
46      * original bytes. Escaped characters are converted back to their
47      * original representation.
48      *
49      * Differs from URLCodec.decodeUrl() in that it throws no
50      * exceptions; bad or incomplete escape sequences are ignored
51      * and passed into result undecoded. This matches the behavior
52      * of browsers, which will use inconsistently-encoded URIs
53      * in HTTP request-lines.
54      *
55      * @param bytes array of URL safe characters
56      * @return array of original bytes
57      */

58     public static final byte[] decodeUrlLoose(byte[] bytes)
59     {
60         if (bytes == null) {
61             return null;
62         }
63         ByteArrayOutputStream JavaDoc buffer = new ByteArrayOutputStream JavaDoc();
64         for (int i = 0; i < bytes.length; i++) {
65             int b = bytes[i];
66             if (b == '+') {
67                 buffer.write(' ');
68                 continue;
69             }
70             if (b == '%') {
71                 if(i+2<bytes.length) {
72                     int u = Character.digit((char)bytes[i+1], 16);
73                     int l = Character.digit((char)bytes[i+2], 16);
74                     if (u > -1 && l > -1) {
75                         // good encoding
76
int c = ((u << 4) + l);
77                         buffer.write((char)c);
78                         i += 2;
79                         continue;
80                     } // else: bad encoding digits, leave '%' in place
81
} // else: insufficient encoding digits, leave '%' in place
82
}
83             buffer.write(b);
84         }
85         return buffer.toByteArray();
86     }
87
88     /**
89      * A more expansive set of ASCII URI characters to consider as 'safe' to
90      * leave unencoded, based on actual browser behavior.
91      */

92     public static BitSet JavaDoc EXPANDED_URI_SAFE = new BitSet JavaDoc(256);
93     static {
94         // alpha characters
95
for (int i = 'a'; i <= 'z'; i++) {
96             EXPANDED_URI_SAFE.set(i);
97         }
98         for (int i = 'A'; i <= 'Z'; i++) {
99             EXPANDED_URI_SAFE.set(i);
100         }
101         // numeric characters
102
for (int i = '0'; i <= '9'; i++) {
103             EXPANDED_URI_SAFE.set(i);
104         }
105         // special chars
106
EXPANDED_URI_SAFE.set('-');
107         EXPANDED_URI_SAFE.set('~');
108         EXPANDED_URI_SAFE.set('_');
109         EXPANDED_URI_SAFE.set('.');
110         EXPANDED_URI_SAFE.set('*');
111         EXPANDED_URI_SAFE.set('/');
112         EXPANDED_URI_SAFE.set('=');
113         EXPANDED_URI_SAFE.set('&');
114         EXPANDED_URI_SAFE.set('+');
115         EXPANDED_URI_SAFE.set(',');
116         EXPANDED_URI_SAFE.set(':');
117         EXPANDED_URI_SAFE.set(';');
118         EXPANDED_URI_SAFE.set('@');
119         EXPANDED_URI_SAFE.set('$');
120         EXPANDED_URI_SAFE.set('!');
121         EXPANDED_URI_SAFE.set(')');
122         EXPANDED_URI_SAFE.set('(');
123         // experiments indicate: Firefox (1.0.6) never escapes '%'
124
EXPANDED_URI_SAFE.set('%');
125         // experiments indicate: Firefox (1.0.6) does not escape '|' or '''
126
EXPANDED_URI_SAFE.set('|');
127         EXPANDED_URI_SAFE.set('\'');
128     }
129     
130     public static BitSet JavaDoc QUERY_SAFE = new BitSet JavaDoc(256);
131     static {
132         QUERY_SAFE.or(EXPANDED_URI_SAFE);
133         // Tests indicate Firefox (1.0.7-1) doesn't escape curlies in query str.
134
QUERY_SAFE.set('{');
135         QUERY_SAFE.set('}');
136         // nor any of these: [ ] ^ ?
137
QUERY_SAFE.set('[');
138         QUERY_SAFE.set(']');
139         QUERY_SAFE.set('^');
140         QUERY_SAFE.set('?');
141     }
142     
143     /**
144      * Encodes a string into its URL safe form using the specified
145      * string charset. Unsafe characters are escaped.
146      *
147      * This method is analogous to superclass encode() methods,
148      * additionally offering the ability to specify a different
149      * 'safe' character set (such as EXPANDED_URI_SAFE).
150      *
151      * @param safe BitSet of characters that don't need to be encoded
152      * @param pString String to encode
153      * @param cs Name of character set to use
154      * @return Encoded version of <code>pString</code>.
155      * @throws UnsupportedEncodingException
156      */

157     public String JavaDoc encode(BitSet JavaDoc safe, String JavaDoc pString, String JavaDoc cs)
158     throws UnsupportedEncodingException JavaDoc {
159         if (pString == null) {
160             return null;
161         }
162         return new String JavaDoc(encodeUrl(safe,pString.getBytes(cs)), "US-ASCII");
163     }
164 }
165
Popular Tags