KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > util > SurtPrefixSet


/* SURTPrefixSet
 *
 * $Id: SurtPrefixSet.java,v 1.20.4.1 2007/01/13 01:31:40 stack-sf Exp $
 *
 * Created on Jul 23, 2004
 *
 * Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

25 package org.archive.util;
26
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.io.Reader;
import java.util.Iterator;
import java.util.Locale;
import java.util.SortedSet;
import java.util.TreeSet;

import org.apache.commons.httpclient.URIException;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.iterator.LineReadingIterator;
import org.archive.util.iterator.RegexpLineIterator;
47
/**
 * Specialized TreeSet for keeping a set of String prefixes.
 *
 * Redundant prefixes (those that are themselves prefixed
 * by other set entries) are eliminated.
 *
 * @author gojomo
 */

56 public class SurtPrefixSet extends TreeSet JavaDoc<String JavaDoc> {
57
58     private static final long serialVersionUID = 2598365040524933110L;
59
60     private static final String JavaDoc SURT_PREFIX_DIRECTIVE = "+";
61
62     /**
63      * Test whether the given String is prefixed by one
64      * of this set's entries.
65      *
66      * @param s
67      * @return True if contains prefix.
68      */

69     public boolean containsPrefixOf(String JavaDoc s) {
70         SortedSet JavaDoc sub = headSet(s);
71         // because redundant prefixes have been eliminated,
72
// only a test against last item in headSet is necessary
73
if (!sub.isEmpty() && s.startsWith((String JavaDoc)sub.last())) {
74             return true; // prefix substring exists
75
} // else: might still exist exactly (headSet does not contain boundary)
76
return contains(s); // exact string exists, or no prefix is there
77
}
78     
79     /**
80      * Maintains additional invariant: if one entry is a
81      * prefix of another, keep only the prefix.
82      *
83      * @see java.util.Collection#add(java.lang.Object)
84      */

85     public boolean add(String JavaDoc s) {
86         SortedSet JavaDoc sub = headSet(s);
87         if (!sub.isEmpty() && s.startsWith((String JavaDoc)sub.last())) {
88             // no need to add; prefix is already present
89
return false;
90         }
91         boolean retVal = super.add(s);
92         sub = tailSet(s+"\0");
93         while(!sub.isEmpty() && ((String JavaDoc)sub.first()).startsWith(s)) {
94             // remove redundant entries
95
sub.remove(sub.first());
96         }
97         return retVal;
98     }
99     
100     
101     /**
102      * Read a set of SURT prefixes from a reader source; keep sorted and
103      * with redundant entries removed.
104      *
105      * @param r reader over file of SURT_format strings
106      * @throws IOException
107      */

108     public void importFrom(Reader JavaDoc r) {
109         BufferedReader JavaDoc reader = new BufferedReader JavaDoc(r);
110         String JavaDoc s;
111         
112         Iterator JavaDoc iter =
113             new RegexpLineIterator(
114                     new LineReadingIterator(reader),
115                     RegexpLineIterator.COMMENT_LINE,
116                     RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
117                     RegexpLineIterator.ENTRY);
118
119         while (iter.hasNext()) {
120             s = (String JavaDoc) iter.next();
121             add(s.toLowerCase());
122         }
123     }
124
125     /**
126      * @param r Where to read from.
127      */

128     public void importFromUris(Reader JavaDoc r) {
129         BufferedReader JavaDoc reader = new BufferedReader JavaDoc(r);
130         String JavaDoc s;
131         
132         Iterator JavaDoc iter =
133             new RegexpLineIterator(
134                     new LineReadingIterator(reader),
135                     RegexpLineIterator.COMMENT_LINE,
136                     RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
137                     RegexpLineIterator.ENTRY);
138
139         while (iter.hasNext()) {
140             s = (String JavaDoc) iter.next();
141             // s is a URI (or even fragmentary hostname), not a SURT
142
addFromPlain(s);
143         }
144     }
145
146     /**
147      * Import SURT prefixes from a reader with mixed URI and SURT prefix
148      * format.
149      *
150      * @param r the reader to import the prefixes from
151      * @param deduceFromSeeds true to also import SURT prefixes implied
152      * from normal URIs/hostname seeds
153      */

154     public void importFromMixed(Reader JavaDoc r, boolean deduceFromSeeds) {
155         BufferedReader JavaDoc reader = new BufferedReader JavaDoc(r);
156         String JavaDoc s;
157         
158         Iterator JavaDoc iter =
159             new RegexpLineIterator(
160                     new LineReadingIterator(reader),
161                     RegexpLineIterator.COMMENT_LINE,
162                     RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
163                     RegexpLineIterator.ENTRY);
164
165         while (iter.hasNext()) {
166             s = (String JavaDoc) iter.next();
167             if(s.startsWith(SURT_PREFIX_DIRECTIVE)) {
168                 // it's specifically a SURT prefix line
169
String JavaDoc u = s.substring(SURT_PREFIX_DIRECTIVE.length()).trim();
170                 if(u.indexOf("(")>0) {
171                     // formal SURT prefix; toLowerCase just in case
172
add(u.toLowerCase());
173                 } else {
174                     // hostname/normal form URI from which
175
// to deduce SURT prefix
176
addFromPlain(u);
177                 }
178                 
179                 continue;
180             } else {
181                 if(deduceFromSeeds) {
182                     // also deducing 'implied' SURT prefixes
183
// from normal URIs/hostname seeds
184
addFromPlain(s);
185                 }
186             }
187         }
188     }
189     
190     /**
191      * Given a plain URI or hostname, deduce an implied SURT prefix from
192      * it and add to active prefixes.
193      *
194      * @param u String of URI or hostname
195      */

196     private void addFromPlain(String JavaDoc u) {
197         u = prefixFromPlain(u);
198         add(u);
199     }
200
201     /**
202      * Given a plain URI or hostname/hostname+path, deduce an implied SURT
203      * prefix from it. Results may be unpredictable on strings that cannot
204      * be interpreted as URIs.
205      *
206      * UURI 'fixup' is applied to the URI that is built.
207      *
208      * @param u URI or almost-URI to consider
209      * @return implied SURT prefix form
210      */

211     public static String JavaDoc prefixFromPlain(String JavaDoc u) {
212         u = ArchiveUtils.addImpliedHttpIfNecessary(u);
213         u = coerceFromHttpsForComparison(u);
214         boolean trailingSlash = u.endsWith("/");
215         // ensure all typical UURI cleanup (incl. IDN-punycoding) is done
216
try {
217             u = UURIFactory.getInstance(u).toString();
218         } catch (URIException e) {
219             e.printStackTrace();
220             // allow to continue with original string uri
221
}
222         // except: don't let UURI-fixup add a trailing slash
223
// if it wasn't already there (presence or absence of
224
// such slash has special meaning specifying implied
225
// SURT prefixes)
226
if(!trailingSlash && u.endsWith("/")) {
227             u = u.substring(0,u.length()-1);
228         }
229         // convert to full SURT
230
u = SURT.fromURI(u);
231         // truncate to implied prefix
232
u = SurtPrefixSet.asPrefix(u);
233         return u;
234     }
235
236     /**
237      * For SURT comparisons -- prefixes or candidates being checked against
238      * those prefixes -- we treat https URIs as if they were http.
239      *
240      * @param u string to coerce if it has https scheme
241      * @return string converted to http scheme, or original if not necessary
242      */

243     private static String JavaDoc coerceFromHttpsForComparison(String JavaDoc u) {
244         if (u.startsWith("https://")) {
245             u = "http" + u.substring("https".length());
246         }
247         return u;
248     }
249
250     /**
251      * Utility method for truncating a SURT that came from a
252      * full URI (as a seed, for example) into a prefix
253      * for determining inclusion.
254      *
255      * This involves:
256      * <pre>
257      * (1) removing the last path component, if any
258      * (anything after the last '/', if there are
259      * at least 3 '/'s)
260      * (2) removing a trailing ')', if present, opening
261      * the possibility of proper subdomains. (This
262      * means that the presence or absence of a
263      * trailing '/' after a hostname in a seed list
264      * is significant for the how the SURT prefix is
265      * created, even though it is not signficant for
266      * the URI's treatment as a seed.)
267      * </pre>
268      *
269      * @param s String to work on.
270      * @return As prefix.
271      */

272     private static String JavaDoc asPrefix(String JavaDoc s) {
273         // Strip last path-segment, if more than 3 slashes
274
s = s.replaceAll("^(.*//.*/)[^/]*","$1");
275         // Strip trailing ")", if present and NO path (no 3rd slash).
276
if (!s.endsWith("/")) {
277             s = s.replaceAll("^(.*)\\)","$1");
278         }
279         return s;
280     }
281
282     /**
283      * Calculate the SURT form URI to use as a candidate against prefixes
284      * from the given Object (CandidateURI or UURI)
285      *
286      * @param object CandidateURI or UURI
287      * @return SURT form of URI for evaluation, or null if unavailable
288      */

289     public static String JavaDoc getCandidateSurt(Object JavaDoc object) {
290         UURI u = UURI.from(object);
291         if (u == null) {
292             return null;
293         }
294         String JavaDoc candidateSurt = u.getSurtForm();
295         // also want to treat https as http
296
candidateSurt = coerceFromHttpsForComparison(candidateSurt);
297         return candidateSurt;
298     }
299     /**
300      * @param fw
301      * @throws IOException
302      */

303     public void exportTo(FileWriter JavaDoc fw) throws IOException JavaDoc {
304         Iterator JavaDoc iter = this.iterator();
305         while(iter.hasNext()) {
306             fw.write((String JavaDoc)iter.next() + "\n");
307         }
308     }
309
310     /**
311      * Changes all prefixes so that they enforce an exact host. For
312      * prefixes that already include a ')', this means discarding
313      * anything after ')' (path info). For prefixes that don't include
314      * a ')' -- domain prefixes open to subdomains -- add the closing
315      * ')' (or ",)").
316      */

317     public void convertAllPrefixesToHosts() {
318         SurtPrefixSet iterCopy = (SurtPrefixSet) this.clone();
319         Iterator JavaDoc iter = iterCopy.iterator();
320         while (iter.hasNext()) {
321             String JavaDoc prefix = (String JavaDoc) iter.next();
322             String JavaDoc convPrefix = convertPrefixToHost(prefix);
323             if(prefix!=convPrefix) {
324                 // if returned value not unchanged, update set
325
this.remove(prefix);
326                 this.add(convPrefix);
327             }
328         }
329     }
330     
331     public static String JavaDoc convertPrefixToHost(String JavaDoc prefix) {
332         if(prefix.endsWith(")")) {
333             return prefix; // no change necessary
334
}
335         if(prefix.indexOf(')')<0) {
336             // open-ended domain prefix
337
if(!prefix.endsWith(",")) {
338                 prefix += ",";
339             }
340             prefix += ")";
341         } else {
342             // prefix with excess path-info
343
prefix = prefix.substring(0,prefix.indexOf(')')+1);
344         }
345         return prefix;
346     }
347
348     /**
349      * Changes all prefixes so that they only enforce a general
350      * domain (allowing subdomains).For prefixes that don't include
351      * a ')', no change is necessary. For others, truncate everything
352      * from the ')' onward. Additionally, truncate off "www," if it
353      * appears.
354      */

355     public void convertAllPrefixesToDomains() {
356         SurtPrefixSet iterCopy = (SurtPrefixSet) this.clone();
357         Iterator JavaDoc iter = iterCopy.iterator();
358         while (iter.hasNext()) {
359             String JavaDoc prefix = (String JavaDoc) iter.next();
360             String JavaDoc convPrefix = convertPrefixToDomain(prefix);
361             if(prefix!=convPrefix) {
362                 // if returned value not unchanged, update set
363
this.remove(prefix);
364                 this.add(convPrefix);
365             }
366         }
367     }
368     
369     public static String JavaDoc convertPrefixToDomain(String JavaDoc prefix) {
370         if(prefix.indexOf(')')>=0) {
371             prefix = prefix.substring(0,prefix.indexOf(')'));
372         }
373         // strip 'www,' when present
374
if(prefix.endsWith("www,")) {
375             prefix = prefix.substring(0,prefix.length()-4);
376         }
377         return prefix;
378     }
379     
380     /**
381      * Allow class to be used as a command-line tool for converting
382      * URL lists (or naked host or host/path fragments implied
383      * to be HTTP URLs) to implied SURT prefix form.
384      *
385      * Read from stdin or first file argument. Writes to stdout.
386      *
387      * @param args cmd-line arguments: may include input file
388      * @throws IOException
389      */

390     public static void main(String JavaDoc[] args) throws IOException JavaDoc {
391         InputStream JavaDoc in = args.length > 0 ? new BufferedInputStream JavaDoc(
392                 new FileInputStream JavaDoc(args[0])) : System.in;
393         PrintStream JavaDoc out = args.length > 1 ? new PrintStream JavaDoc(
394                 new BufferedOutputStream JavaDoc(new FileOutputStream JavaDoc(args[1])))
395                 : System.out;
396         BufferedReader JavaDoc br =
397             new BufferedReader JavaDoc(new InputStreamReader JavaDoc(in));
398         String JavaDoc line;
399         while((line = br.readLine())!=null) {
400             if(line.indexOf("#")>0) line=line.substring(0,line.indexOf("#"));
401             line = line.trim();
402             if(line.length()==0) continue;
403             out.println(prefixFromPlain(line));
404         }
405         br.close();
406         out.close();
407     }
408 }
409
Popular Tags