KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > url > canonicalize > FixupQueryStr


/* FixupQueryStr
 *
 * Created on Oct 5, 2004
 *
 * Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

23 package org.archive.crawler.url.canonicalize;
24
25
26
27
28 /**
29  * Strip any trailing question mark.
30  * @author stack
31  * @version $Date: 2007/01/13 01:31:28 $, $Revision: 1.3.18.1 $
32  */

33 public class FixupQueryStr
34 extends BaseRule {
35
36     private static final long serialVersionUID = 3169526832544474794L;
37
38     private static final String JavaDoc DESCRIPTION =
39         "Fixup the question mark that leads off the query string. " +
40         "This rule returns 'http://www.archive.org/index.html' if passed" +
41         " 'http://www.archive.org/index.html?'. It will also strip '?&'" +
42         " if '?&' is all that comprises the query string. Also strips" +
43         " extraneous leading '&': Returns 'http://archive.org/index.html?x=y" +
44         " if passed 'http://archive.org/index.html?&x=y." +
45         " Will also strip '&' if last thing in query string." +
46         " Operates on all schemes. This is a good rule to run toward the" +
47         " end of canonicalization processing.";
48
49     public FixupQueryStr(String JavaDoc name) {
50         super(name, DESCRIPTION);
51     }
52
53     public String JavaDoc canonicalize(String JavaDoc url, Object JavaDoc context) {
54         if (url == null || url.length() <= 0) {
55             return url;
56         }
57         
58         int index = url.lastIndexOf('?');
59         if (index > 0) {
60             if (index == (url.length() - 1)) {
61                 // '?' is last char in url. Strip it.
62
url = url.substring(0, url.length() - 1);
63             } else if (url.charAt(index + 1) == '&') {
64                 // Next char is '&'. Strip it.
65
if (url.length() == (index + 2)) {
66                     // Then url ends with '?&'. Strip them.
67
url = url.substring(0, url.length() - 2);
68                 } else {
69                     // The '&' is redundant. Strip it.
70
url = url.substring(0, index + 1) +
71                     url.substring(index + 2);
72                 }
73             } else if (url.charAt(url.length() - 1) == '&') {
74                 // If we have a lone '&' on end of query str,
75
// strip it.
76
url = url.substring(0, url.length() - 1);
77             }
78         }
79         return url;
80     }
81 }
82
Popular Tags