KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > parse > html > RobotsMetaProcessor


1 /* Copyright (c) 2004 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.parse.html;
5
6 import java.net.URL JavaDoc;
7
8 import org.w3c.dom.*;
9 import org.w3c.dom.html.*;
10 import org.apache.html.dom.*;
11
/**
 * Class for parsing META directives from DOM trees. This class
 * currently handles Robots META directives (all, none, nofollow,
 * noindex), finding BASE HREF tags, and HTTP-EQUIV no-cache
 * instructions.
 *
 * <p>Element names and attribute names are matched without regard to
 * case, and attribute values are lower-cased independently of the
 * JVM's default locale.
 */
public class RobotsMetaProcessor {

  /**
   * Utility class with indicators for the robots directives "noindex"
   * and "nofollow", the HTTP-EQUIV/no-cache instruction, and the
   * document's BASE HREF.
   */
  public static class RobotsMetaIndicator {
    private boolean noIndex = false;
    private boolean noFollow = false;
    private boolean noCache = false;
    private URL baseHref = null;

    /**
     * Sets <code>noIndex</code>, <code>noFollow</code> and
     * <code>noCache</code> to <code>false</code>, and clears
     * <code>baseHref</code> (sets it to <code>null</code>).
     */
    public void reset() {
      noIndex = false;
      noFollow = false;
      noCache = false;
      baseHref = null;
    }

    /**
     * Sets <code>noFollow</code> to <code>true</code>.
     */
    public void setNoFollow() {
      noFollow = true;
    }

    /**
     * Sets <code>noIndex</code> to <code>true</code>.
     */
    public void setNoIndex() {
      noIndex = true;
    }

    /**
     * Sets <code>noCache</code> to <code>true</code>.
     */
    public void setNoCache() {
      noCache = true;
    }

    /**
     * Sets the <code>baseHref</code>.
     *
     * @param baseHref the base URL to record; may be <code>null</code>
     */
    public void setBaseHref(URL baseHref) {
      this.baseHref = baseHref;
    }

    /**
     * Returns the current value of <code>noIndex</code>.
     */
    public boolean getNoIndex() {
      return noIndex;
    }

    /**
     * Returns the current value of <code>noFollow</code>.
     */
    public boolean getNoFollow() {
      return noFollow;
    }

    /**
     * Returns the current value of <code>noCache</code>.
     */
    public boolean getNoCache() {
      return noCache;
    }

    /**
     * Returns the <code>baseHref</code>, if set, or <code>null</code>
     * otherwise.
     */
    public URL getBaseHref() {
      return baseHref;
    }

  }

  /**
   * Sets the indicators in <code>robotsMeta</code> to appropriate
   * values, based on any META tags found under the given
   * <code>node</code>. The indicators are reset before the tree is
   * walked, so the same indicator object may be reused across calls.
   *
   * @param robotsMeta receives the directives that were found; must
   *                   not be <code>null</code>
   * @param node       root of the DOM (sub)tree to scan; a
   *                   <code>null</code> node leaves the indicators in
   *                   their reset state
   * @param currURL    URL against which a BASE HREF is resolved, or
   *                   <code>null</code> to require an absolute HREF
   */
  public static final void getRobotsMetaDirectives(
    RobotsMetaIndicator robotsMeta, Node node, URL currURL) {

    robotsMeta.reset();
    getRobotsMetaDirectivesHelper(robotsMeta, node, currURL);
  }

  /**
   * Recursively walks <code>node</code> and its descendants, updating
   * <code>robotsMeta</code> from every META and BASE element that is
   * not located under a BODY element.
   */
  private static final void getRobotsMetaDirectivesHelper(
    RobotsMetaIndicator robotsMeta, Node node, URL currURL) {

    if (node == null) {
      return;
    }

    if (node.getNodeType() == Node.ELEMENT_NODE) {
      String tagName = node.getNodeName();

      if ("BODY".equalsIgnoreCase(tagName)) {
        // META tags should not be under body, so the whole subtree
        // is skipped.
        return;
      }

      if ("META".equalsIgnoreCase(tagName)) {
        processMeta(robotsMeta, node);
      } else if ("BASE".equalsIgnoreCase(tagName)) {
        processBase(robotsMeta, node, currURL);
      }
    }

    NodeList children = node.getChildNodes();
    if (children != null) {
      int len = children.getLength();
      for (int i = 0; i < len; i++) {
        getRobotsMetaDirectivesHelper(robotsMeta, children.item(i), currURL);
      }
    }
  }

  /**
   * Applies a single META element: robots directives when its name is
   * "robots", and the no-cache flag when it is an HTTP-EQUIV Pragma.
   */
  private static void processMeta(RobotsMetaIndicator robotsMeta, Node meta) {
    NamedNodeMap attrs = meta.getAttributes();
    if (attrs == null) {
      return;
    }

    String content = attributeValue(attrs, "content");
    if (content == null) {
      // Both kinds of META handled here carry their payload in "content".
      return;
    }

    // A fixed locale keeps e.g. "NOINDEX" from becoming "noındex" when
    // the JVM default locale is Turkish.
    String directives = content.toLowerCase(java.util.Locale.ENGLISH);

    if ("robots".equalsIgnoreCase(attributeValue(attrs, "name"))) {
      if (directives.indexOf("none") >= 0) {
        robotsMeta.setNoIndex();
        robotsMeta.setNoFollow();
      }
      // "all" is the default and needs no action.
      if (directives.indexOf("noindex") >= 0) {
        robotsMeta.setNoIndex();
      }
      if (directives.indexOf("nofollow") >= 0) {
        robotsMeta.setNoFollow();
      }
    }

    if ("Pragma".equalsIgnoreCase(attributeValue(attrs, "http-equiv"))
        && directives.indexOf("no-cache") >= 0) {
      robotsMeta.setNoCache();
    }
  }

  /**
   * Applies a single BASE element: records its HREF, resolved against
   * <code>currURL</code> when one is given, as the base URL.
   */
  private static void processBase(
    RobotsMetaIndicator robotsMeta, Node base, URL currURL) {

    NamedNodeMap attrs = base.getAttributes();
    if (attrs == null) {
      return;
    }

    String urlString = attributeValue(attrs, "href");
    if (urlString == null) {
      return;
    }

    try {
      URL url = (currURL == null)
        ? new URL(urlString)
        : new URL(currURL, urlString);
      robotsMeta.setBaseHref(url);
    } catch (java.net.MalformedURLException ignored) {
      // Best effort: an unparsable BASE HREF is treated as absent.
    }
  }

  /**
   * Returns the value of the attribute called <code>name</code>, or
   * <code>null</code> if there is none. An exact-case match is
   * preferred; otherwise the first attribute whose name matches
   * ignoring case is used.
   */
  private static String attributeValue(NamedNodeMap attrs, String name) {
    Node attr = attrs.getNamedItem(name);

    if (attr == null) {
      int count = attrs.getLength();
      for (int i = 0; i < count; i++) {
        Node candidate = attrs.item(i);
        if (candidate != null
            && name.equalsIgnoreCase(candidate.getNodeName())) {
          attr = candidate;
          break;
        }
      }
    }

    return (attr == null) ? null : attr.getNodeValue();
  }

}
209
Popular Tags