KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > url > canonicalize > StripSessionIDs


/* StripSessionIDs
 *
 * Created on Oct 6, 2004
 *
 * Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.archive.crawler.url.canonicalize;

import java.util.regex.Pattern;

28 /**
29  * Strip known session ids.
30  * @author stack
31  * @version $Date: 2007/01/13 01:31:29 $, $Revision: 1.5.2.1 $
32  */

33 public class StripSessionIDs
34 extends BaseRule {
35
36     private static final long serialVersionUID = -3737115200690525641L;
37
38     private static final String JavaDoc DESCRIPTION = "Strip known session IDs. " +
39         "Use this rule to remove all of a set of known session IDs." +
40         " For example, this rule will strip JSESSIONID and its value from" +
41         " 'http://archive.org/index.html?" +
42         "JSESSIONID=DDDSSE233232333355FFSXXXXDSDSDS'. The resulting" +
43         " canonicalization returns 'http://archive.org/index.html'." +
44         " This rule strips JSESSIONID, ASPSESSIONID, PHPSESSID, and 'sid'" +
45         " session ids.";
46     
47     /**
48      * Example: jsessionid=999A9EF028317A82AC83F0FDFE59385A.
49      * Example: PHPSESSID=9682993c8daa2c5497996114facdc805.
50      */

51     private static final Pattern JavaDoc BASE_PATTERN = Pattern.compile("^(.+)" +
52             "(?:(?:(?:jsessionid)|(?:phpsessid))=" +
53                  "[0-9a-zA-Z]{32})(?:&(.*))?$", Pattern.CASE_INSENSITIVE);
54     
55     /**
56      * Example: sid=9682993c8daa2c5497996114facdc805.
57      * 'sid=' can be tricky but all sid= followed by 32 byte string
58      * so far seen have been session ids. Sid is a 32 byte string
59      * like the BASE_PATTERN only 'sid' is the tail of 'phpsessid'
60      * so have to have it run after the phpsessid elimination.
61      */

62     private static final Pattern JavaDoc SID_PATTERN =
63         Pattern.compile("^(.+)" +
64             "(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", Pattern.CASE_INSENSITIVE);
65     
66     /**
67      * Example:ASPSESSIONIDAQBSDSRT=EOHBLBDDPFCLHKPGGKLILNAM.
68      */

69     private static final Pattern JavaDoc ASPSESSION_PATTERN =
70         Pattern.compile("^(.+)" +
71             "(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$",
72                 Pattern.CASE_INSENSITIVE);
73     
74
75     public StripSessionIDs(String JavaDoc name) {
76         super(name, DESCRIPTION);
77     }
78
79     public String JavaDoc canonicalize(String JavaDoc url, Object JavaDoc context) {
80         url = doStripRegexMatch(url, BASE_PATTERN.matcher(url));
81         url = doStripRegexMatch(url, SID_PATTERN.matcher(url));
82         url = doStripRegexMatch(url, ASPSESSION_PATTERN.matcher(url));
83         return url;
84     }
85 }
Popular Tags