KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > url > canonicalize > StripUserinfoRule


1 /* StripUserinfoRule
2  *
3  * Created on Oct 5, 2004
4  *
5  * Copyright (C) 2004 Internet Archive.
6  *
7  * This file is part of the Heritrix web crawler (crawler.archive.org).
8  *
9  * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22  */

23 package org.archive.crawler.url.canonicalize;
24
25 import java.util.regex.Pattern JavaDoc;
26
27
28
29 /**
30  * Strip any 'userinfo' found on http/https URLs.
31  * @author stack
32  * @version $Date: 2007/01/13 01:31:29 $, $Revision: 1.3.18.1 $
33  */

34 public class StripUserinfoRule extends BaseRule {
35
36     private static final long serialVersionUID = -4271062607638914996L;
37
38     private static final String JavaDoc DESCRIPTION = "Strip any 'userinfo' found. " +
39         "Use this rule to equate 'http://stack:psswrd@archive.org/index.htm'" +
40         " and 'http://archive.org/index.htm'. The resulting canonicalization" +
41         " returns 'http://archive.org/index.htm'. Removes any userinfo" +
42         " found. Operates on http/https/ftp/ftps schemes only.";
43     
44     /**
45      * Strip userinfo.
46      */

47     private static final Pattern JavaDoc REGEX =
48         Pattern.compile("^((?:(?:https?)|(?:ftps?))://)(?:[^/]+@)(.*)$",
49             Pattern.CASE_INSENSITIVE);
50
51     public StripUserinfoRule(String JavaDoc name) {
52         super(name, DESCRIPTION);
53     }
54
55     public String JavaDoc canonicalize(String JavaDoc url, Object JavaDoc context) {
56         return doStripRegexMatch(url, REGEX.matcher(url));
57     }
58 }
59
Popular Tags