KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > util > SURT


/* SURT
 *
 * $Id: SURT.java,v 1.12.12.1 2007/01/13 01:31:40 stack-sf Exp $
 *
 * Created on Jul 16, 2004
 *
 * Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

25 package org.archive.util;
26
27 import java.io.BufferedInputStream JavaDoc;
28 import java.io.BufferedOutputStream JavaDoc;
29 import java.io.BufferedReader JavaDoc;
30 import java.io.FileInputStream JavaDoc;
31 import java.io.FileOutputStream JavaDoc;
32 import java.io.IOException JavaDoc;
33 import java.io.InputStream JavaDoc;
34 import java.io.InputStreamReader JavaDoc;
35 import java.io.PrintStream JavaDoc;
36 import java.util.regex.Matcher JavaDoc;
37
38 /**
39  * Sort-friendly URI Reordering Transform.
40  *
41  * Converts URIs of the form:
42  *
43  * scheme://userinfo@domain.tld:port/path?query#fragment
44  *
45  * ...into...
46  *
47  * scheme://(tld,domain,:port@userinfo)/path?query#fragment
48  *
49  * The '(' ')' characters serve as an unambiguous notice that the so-called
50  * 'authority' portion of the URI ([userinfo@]host[:port] in http URIs) has
51  * been transformed; the commas prevent confusion with regular hostnames.
52  *
53  * This remedies the 'problem' with standard URIs that the host portion of a
54  * regular URI, with its dotted-domains, is actually in reverse order from
55  * the natural hierarchy that's usually helpful for grouping and sorting.
56  *
57  * The value of respecting URI case variance is considered negligible: it
58  * is vanishingly rare for case-variance to be meaningful, while URI case-
59  * variance often arises from people's confusion or sloppiness, and they
60  * only correct it insofar as necessary to avoid blatant problems. Thus
61  * SURT form is considered to be flattened to all lowercase, and thus not
62  * completely reversible.
63  *
64  * @author gojomo
65  */

66 public class SURT {
67     static char DOT = '.';
68     static String JavaDoc BEGIN_TRANSFORMED_AUTHORITY = "(";
69     static String JavaDoc TRANSFORMED_HOST_DELIM = ",";
70     static String JavaDoc END_TRANSFORMED_AUTHORITY = ")";
71     
72     // 1: scheme://
73
// 2: userinfo (if present)
74
// 3: @ (if present)
75
// 4: dotted-quad host
76
// 5: other host
77
// 6: :port
78
// 7: path
79
static String JavaDoc URI_SPLITTER =
80             "^(\\w+://)(?:([-\\w\\.!~\\*'\\(\\)%;:&=+$,]+?)(@))?"+
81     // 1 2 3
82
"(?:((?:\\d{1,3}\\.){3}\\d{1,3})|(\\S+?))(:\\d+)?(/\\S*)?$";
83     // 4 5 6 7
84

85     // RFC2396
86
// reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
87
// "$" | ","
88
// unreserved = alphanum | mark
89
// mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
90
// userinfo = *( unreserved | escaped |
91
// ";" | ":" | "&" | "=" | "+" | "$" | "," )
92
// escaped = "%" hex hex
93

94
95     
96     
97     /**
98      * Utility method for creating the SURT form of the URI in the
99      * given String.
100      *
101      * If it appears a bit convoluted in its approach, note that it was
102      * optimized to minimize object-creation after allocation-sites profiling
103      * indicated this method was a top source of garbage in long-running crawls.
104      *
105      * Assumes that the String URI has already been cleaned/fixed (eg
106      * by UURI fixup) in ways that put it in its crawlable form for
107      * evaluation.
108      *
109      * @param s String URI to be converted to SURT form
110      * @return SURT form
111      */

112     public static String JavaDoc fromURI(String JavaDoc s) {
113         Matcher JavaDoc m = TextUtils.getMatcher(URI_SPLITTER,s);
114         if(!m.matches()) {
115             // not an authority-based URI scheme; return unchanged
116
TextUtils.recycleMatcher(m);
117             return s;
118         }
119         // preallocate enough space for SURT form, which includes
120
// 3 extra characters ('(', ')', and one more ',' than '.'s
121
// in original)
122
StringBuffer JavaDoc builder = new StringBuffer JavaDoc(s.length()+3);
123         append(builder,s,m.start(1),m.end(1)); // scheme://
124
builder.append(BEGIN_TRANSFORMED_AUTHORITY); // '('
125

126         if(m.start(4)>-1) {
127             // dotted-quad ip match: don't reverse
128
append(builder,s,m.start(4),m.end(4));
129         } else {
130             // other hostname match: do reverse
131
int hostSegEnd = m.end(5);
132             int hostStart = m.start(5);
133             for(int i = m.end(5)-1; i>=hostStart; i--) {
134                 if(s.charAt(i-1)!=DOT && i > hostStart) {
135                     continue;
136                 }
137                 append(builder,s,i,hostSegEnd); // rev host segment
138
builder.append(TRANSFORMED_HOST_DELIM); // ','
139
hostSegEnd = i-1;
140             }
141         }
142
143         append(builder,s,m.start(6),m.end(6)); // :port
144
append(builder,s,m.start(3),m.end(3)); // at
145
append(builder,s,m.start(2),m.end(2)); // userinfo
146
builder.append(END_TRANSFORMED_AUTHORITY); // ')'
147
append(builder,s,m.start(7),m.end(7)); // path
148
for(int i = 0; i < builder.length(); i++) {
149             builder.setCharAt(i,Character.toLowerCase(builder.charAt((i))));
150         }
151         TextUtils.recycleMatcher(m);
152         return builder.toString();
153     }
154     
155     private static void append(StringBuffer JavaDoc b, CharSequence JavaDoc cs, int start,
156             int end) {
157         if (start < 0) {
158             return;
159         }
160         b.append(cs, start, end);
161     }
162         
163     /**
164      * Allow class to be used as a command-line tool for converting
165      * URL lists (or naked host or host/path fragments implied
166      * to be HTTP URLs) to SURT form. Lines that cannot be converted
167      * are returned unchanged.
168      *
169      *
170      * Read from stdin or first file argument. Writes to stdout or
171      * second argument filename
172      *
173      * @param args cmd-line arguments
174      * @throws IOException
175      */

176     public static void main(String JavaDoc[] args) throws IOException JavaDoc {
177         InputStream JavaDoc in = args.length > 0 ? new BufferedInputStream JavaDoc(
178                 new FileInputStream JavaDoc(args[0])) : System.in;
179         PrintStream JavaDoc out = args.length > 1 ? new PrintStream JavaDoc(
180                 new BufferedOutputStream JavaDoc(new FileOutputStream JavaDoc(args[1])))
181                 : System.out;
182         BufferedReader JavaDoc br =
183             new BufferedReader JavaDoc(new InputStreamReader JavaDoc(in));
184         String JavaDoc line;
185         while((line = br.readLine())!=null) {
186             if(line.indexOf("#")>0) line=line.substring(0,line.indexOf("#"));
187             line = line.trim();
188             if(line.length()==0) continue;
189             line = ArchiveUtils.addImpliedHttpIfNecessary(line);
190             out.println(SURT.fromURI(line));
191         }
192         br.close();
193         out.close();
194     }
195 }
196
Popular Tags