KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > url > Canonicalizer


1 /* Canonicalizer
2  *
3  * Created on Oct 7, 2004
4  *
5  * Copyright (C) 2004 Internet Archive.
6  *
7  * This file is part of the Heritrix web crawler (crawler.archive.org).
8  *
9  * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22  */

23 package org.archive.crawler.url;
24
25 import java.util.Iterator JavaDoc;
26 import java.util.logging.Logger JavaDoc;
27 import java.util.logging.Level JavaDoc;
28
29 import javax.management.AttributeNotFoundException JavaDoc;
30
31 import org.archive.crawler.datamodel.CrawlOrder;
32 import org.archive.crawler.settings.MapType;
33 import org.archive.net.UURI;
34
35 /**
36  * URL canonicalizer.
37  * @author stack
38  * @version $Date: 2007/01/13 01:31:28 $, $Revision: 1.4.12.1 $
39  */

40 public class Canonicalizer {
41     private static Logger JavaDoc logger =
42         Logger.getLogger(Canonicalizer.class.getName());
43     
44     /**
45      * Constructor.
46      * This class can't be constructed.
47      * Shutdown.
48      */

49     private Canonicalizer() {
50         super();
51     }
52     
53     /**
54      * Convenience method that is passed a settings object instance pulling
55      * from it what it needs to canonicalize.
56      * @param uuri UURI to canonicalize.
57      * @param order A crawlorder instance.
58      * @return Canonicalized string of uuri else uuri if an error.
59      */

60     public static String JavaDoc canonicalize(UURI uuri, CrawlOrder order) {
61         MapType rules = null;
62         String JavaDoc canonical = uuri.toString();
63         try {
64             rules = (MapType)order.getAttribute(uuri, CrawlOrder.ATTR_RULES);
65             canonical = Canonicalizer.canonicalize(uuri, rules.iterator(uuri));
66         } catch (AttributeNotFoundException JavaDoc e) {
67             logger.warning("Failed canonicalization of " + canonical +
68                 ": " + e);
69         }
70         return canonical;
71     }
72
73     /**
74      * Run the passed uuri through the list of rules.
75      * @param uuri Url to canonicalize.
76      * @param rules Iterator of canonicalization rules to apply (Get one
77      * of these on the url-canonicalizer-rules element in order files or
78      * create a list externally). Rules must implement the Rule interface.
79      * @return Canonicalized URL.
80      */

81     public static String JavaDoc canonicalize(UURI uuri, Iterator JavaDoc rules) {
82         String JavaDoc before = uuri.toString();
83         //String beforeRule = null;
84
String JavaDoc canonical = before;
85         for (; rules.hasNext();) {
86             CanonicalizationRule r = (CanonicalizationRule)rules.next();
87             //if (logger.isLoggable(Level.FINER)) {
88
// beforeRule = canonical;
89
//}
90
if (!r.isEnabled(uuri)) {
91                 if (logger.isLoggable(Level.FINER)) {
92                     logger.finer("Rule " + r.getName() + " is disabled.");
93                 }
94                 continue;
95             }
96             canonical = r.canonicalize(canonical, uuri);
97             if (logger.isLoggable(Level.FINER)) {
98                 logger.finer("Rule " + r.getName() + " " + before + " => " +
99                         canonical);
100             }
101         }
102         if (logger.isLoggable(Level.INFO)) {
103             logger.fine(before + " => " + canonical);
104         }
105         return canonical;
106     }
107 }
108
Popular Tags