DOMContentUtils


1   /* Copyright (c) 2004 The Nutch Organization.  All rights reserved.   */
2   /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3   
4   package net.nutch.parse.html;
5   
6   import java.net.URL  ;
7   import java.net.MalformedURLException  ;
8   import java.util.ArrayList  ;
9   import java.util.HashMap  ;
10  
11  import net.nutch.parse.Outlink;
12  
13  import org.w3c.dom.*;
14  
15  /**
16   * A collection of methods for extracting content from DOM trees.
17   * 
18   * This class holds a few utility methods for pulling content out of 
19   * DOM nodes, such as getOutlinks, getText, etc.
20   *
21   */
22  public class DOMContentUtils {
23  
24    public static class LinkParams {
25      public String   elName;
26      public String   attrName;
27        public int childLen;
28        
29        public LinkParams(String   elName, String   attrName, int childLen) {
30            this.elName = elName;
31            this.attrName = attrName;
32            this.childLen = childLen;
33        }
34        
35        public String   toString() {
36            return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]";
37        }
38    }
39    
40    public static HashMap   linkParams = new HashMap  ();
41    
42    static {
43        linkParams.put("a", new LinkParams("a", "href", 1));
44        linkParams.put("area", new LinkParams("area", "href", 0));
45        linkParams.put("frame", new LinkParams("frame", "src", 0));
46        linkParams.put("iframe", new LinkParams("iframe", "src", 0));
47    }
48    
49    /**
50     * This method takes a {@link StringBuffer} and a DOM {@link Node},
51     * and will append all the content text found beneath the DOM node to 
52     * the <code>StringBuffer</code>.
53     *
54     * <p>
55     *
56     * If <code>abortOnNestedAnchors</code> is true, DOM traversal will
57     * be aborted and the <code>StringBuffer</code> will not contain
58     * any text encountered after a nested anchor is found.
59     * 
60     * <p>
61     *
62     * Currently, only SCRIPT, STYLE and comment text are ignored.
63     *
64     * @return true if nested anchors were found
65     */
66    public static final boolean getText(StringBuffer   sb, Node node, 
67                                        boolean abortOnNestedAnchors) {
68      if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) {
69        return true;
70      } 
71      return false;
72    }
73  
74  
75    /**
76     * This is a convinience method, equivalent to {@link
77     * #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
78     * 
79     */
80    public static final void getText(StringBuffer   sb, Node node) {
81      getText(sb, node, false);
82    }
83  
84    // returns true if abortOnNestedAnchors is true and we find nested 
85    // anchors
86    private static final boolean getTextHelper(StringBuffer   sb, Node node, 
87                                               boolean abortOnNestedAnchors,
88                                               int anchorDepth) {
89      if ("script".equalsIgnoreCase(node.getNodeName())) {
90        return false;
91      }
92      if ("style".equalsIgnoreCase(node.getNodeName())) {
93        return false;
94      }
95      if (abortOnNestedAnchors && "a".equalsIgnoreCase(node.getNodeName())) {
96        anchorDepth++;
97        if (anchorDepth > 1)
98          return true;
99      }
100     if (node.getNodeType() == Node.COMMENT_NODE) {
101       return false;
102     }
103     if (node.getNodeType() == Node.TEXT_NODE) {
104       // cleanup and trim the value
105       String   text = node.getNodeValue();
106       text = text.replaceAll("\\s+", " ");
107       text = text.trim();
108       if (text.length() > 0) {
109         if (sb.length() > 0) sb.append(' ');
110         sb.append(text);
111       }
112     }
113     boolean abort = false;
114     NodeList children = node.getChildNodes();
115     if (children != null) {
116       int len = children.getLength();
117       for (int i = 0; i < len; i++) {
118         if (getTextHelper(sb, children.item(i), 
119                           abortOnNestedAnchors, anchorDepth)) {
120           abort = true;
121           break;
122         }
123       }
124     }
125     return abort;
126   }
127 
128   /**
129    * This method takes a {@link StringBuffer} and a DOM {@link Node},
130    * and will append the content text found beneath the first
131    * <code>title</code> node to the <code>StringBuffer</code>.
132    *
133    * @return true if a title node was found, false otherwise
134    */
135   public static final boolean getTitle(StringBuffer   sb, Node node) {
136     if (node.getNodeType() == Node.ELEMENT_NODE) {
137       if ("title".equalsIgnoreCase(node.getNodeName())) {
138         getText(sb, node);
139         return true;
140       }
141     }
142     NodeList children = node.getChildNodes();
143     if (children != null) {
144       int len = children.getLength();
145       for (int i = 0; i < len; i++) {
146         if (getTitle(sb, children.item(i))) {
147           return true;
148         }
149       }
150     }
151     return false;
152   }
153 
154   /** If Node contains a BASE tag then it's HREF is returned. */
155   public static final URL   getBase(Node node) {
156 
157     // is this node a BASE tag?
158     if (node.getNodeType() == Node.ELEMENT_NODE) {
159       if ("base".equalsIgnoreCase(node.getNodeName())) {
160         NamedNodeMap attrs = node.getAttributes();
161         for (int i= 0; i < attrs.getLength(); i++ ) {
162           Node attr = attrs.item(i);
163           if ("href".equalsIgnoreCase(attr.getNodeName())) {
164             try {
165               return new URL  (attr.getNodeValue());
166             } catch (MalformedURLException   e) {}
167           }
168         }
169       }
170     }
171     
172     // does it contain a base tag?
173     NodeList children = node.getChildNodes();
174     if (children != null) {
175       int len = children.getLength();
176       for (int i = 0; i < len; i++) {
177         URL   base = getBase(children.item(i));
178         if (base != null)
179           return base;
180       }
181     }
182 
183     // no.
184     return null;
185   }
186 
187 
188   private static boolean hasOnlyWhiteSpace(Node node) {
189     String   val= node.getNodeValue();
190     for (int i= 0; i < val.length(); i++) {
191       if (!Character.isWhitespace(val.charAt(i)))
192         return false;
193     }
194     return true;
195   }
196 
197   // this only covers a few cases of empty links that are symptomatic
198   // of nekohtml's DOM-fixup process...
199   private static boolean shouldThrowAwayLink(Node node, NodeList children, 
200                                               int childLen, LinkParams params) {
201     if (childLen == 0) {
202       // this has no inner structure 
203       if (params.childLen == 0) return false;
204       else return true;
205     } else if ((childLen == 1) 
206                && (children.item(0).getNodeType() == Node.ELEMENT_NODE)
207                && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) { 
208       // single nested link
209       return true;
210 
211     } else if (childLen == 2) {
212 
213       Node c0= children.item(0);
214       Node c1= children.item(1);
215 
216       if ((c0.getNodeType() == Node.ELEMENT_NODE)
217           && (params.elName.equalsIgnoreCase(c0.getNodeName()))
218           && (c1.getNodeType() == Node.TEXT_NODE) 
219           && hasOnlyWhiteSpace(c1) ) {
220         // single link followed by whitespace node
221         return true;
222       }
223 
224       if ((c1.getNodeType() == Node.ELEMENT_NODE)
225           && (params.elName.equalsIgnoreCase(c1.getNodeName()))
226           && (c0.getNodeType() == Node.TEXT_NODE) 
227           && hasOnlyWhiteSpace(c0) ) {
228         // whitespace node followed by single link
229         return true;
230       }
231 
232     } else if (childLen == 3) {
233       Node c0= children.item(0);
234       Node c1= children.item(1);
235       Node c2= children.item(2);
236       
237       if ((c1.getNodeType() == Node.ELEMENT_NODE)
238           && (params.elName.equalsIgnoreCase(c1.getNodeName()))
239           && (c0.getNodeType() == Node.TEXT_NODE) 
240           && (c2.getNodeType() == Node.TEXT_NODE) 
241           && hasOnlyWhiteSpace(c0)
242           && hasOnlyWhiteSpace(c2) ) {
243         // single link surrounded by whitespace nodes
244         return true;
245       }
246     }
247 
248     return false;
249   }
250 
251   /**
252    * This method finds all anchors below the supplied DOM
253    * <code>node</code>, and creates appropriate {@link Outlink}
254    * records for each (relative to the supplied <code>base</code>
255    * URL), and adds them to the <code>outlinks</code> {@link
256    * ArrayList}.
257    *
258    * <p>
259    *
260    * Links without inner structure (tags, text, etc) are discarded, as
261    * are links which contain only single nested links and empty text
262    * nodes (this is a common DOM-fixup artifact, at least with
263    * nekohtml).
264    */
265   public static final void getOutlinks(URL   base, ArrayList   outlinks, 
266                                        Node node) {
267 
268     NodeList children = node.getChildNodes();
269     int childLen= 0;
270     if (children != null)
271       childLen= children.getLength();
272   
273     if (node.getNodeType() == Node.ELEMENT_NODE) {
274       LinkParams params = (LinkParams)linkParams.get(node.getNodeName().toLowerCase());
275       if (params != null) {
276         if (shouldThrowAwayLink(node, children, childLen, params)) {
277           // this has no inner structure or just a single nested
278           // anchor-- toss it!
279         } else {
280 
281           StringBuffer   linkText = new StringBuffer  ();
282           getText(linkText, node, true);
283 
284           NamedNodeMap attrs = node.getAttributes();
285           String   target = null;
286           for (int i= 0; i < attrs.getLength(); i++ ) {
287             if (params.attrName.equalsIgnoreCase(attrs.item(i).getNodeName())) {
288               target = attrs.item(i).getNodeValue();
289               break;
290             }
291           }
292           if (target != null)
293             try {
294               URL   url = new URL  (base, target);
295               outlinks.add(new Outlink(url.toString(),
296                                        linkText.toString().trim()));
297             } catch (MalformedURLException   e) {
298               // don't care
299             }
300         }
301         // this should not have any children, skip them
302         if (params.childLen == 0) return;
303       }
304     }
305     for ( int i = 0; i < childLen; i++ ) {
306       getOutlinks(base, outlinks, children.item(i));
307     }
308   }
309 
310 }
311 
312
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags