KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > parse > html > DOMContentUtils


1 /* Copyright (c) 2004 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.parse.html;
5
6 import java.net.URL JavaDoc;
7 import java.net.MalformedURLException JavaDoc;
8 import java.util.ArrayList JavaDoc;
9 import java.util.HashMap JavaDoc;
10
11 import net.nutch.parse.Outlink;
12
13 import org.w3c.dom.*;
14
15 /**
16  * A collection of methods for extracting content from DOM trees.
17  *
18  * This class holds a few utility methods for pulling content out of
19  * DOM nodes, such as getOutlinks, getText, etc.
20  *
21  */

22 public class DOMContentUtils {
23
24   public static class LinkParams {
25     public String JavaDoc elName;
26     public String JavaDoc attrName;
27       public int childLen;
28       
29       public LinkParams(String JavaDoc elName, String JavaDoc attrName, int childLen) {
30           this.elName = elName;
31           this.attrName = attrName;
32           this.childLen = childLen;
33       }
34       
35       public String JavaDoc toString() {
36           return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]";
37       }
38   }
39   
40   public static HashMap JavaDoc linkParams = new HashMap JavaDoc();
41   
42   static {
43       linkParams.put("a", new LinkParams("a", "href", 1));
44       linkParams.put("area", new LinkParams("area", "href", 0));
45       linkParams.put("frame", new LinkParams("frame", "src", 0));
46       linkParams.put("iframe", new LinkParams("iframe", "src", 0));
47   }
48   
49   /**
50    * This method takes a {@link StringBuffer} and a DOM {@link Node},
51    * and will append all the content text found beneath the DOM node to
52    * the <code>StringBuffer</code>.
53    *
54    * <p>
55    *
56    * If <code>abortOnNestedAnchors</code> is true, DOM traversal will
57    * be aborted and the <code>StringBuffer</code> will not contain
58    * any text encountered after a nested anchor is found.
59    *
60    * <p>
61    *
62    * Currently, only SCRIPT, STYLE and comment text are ignored.
63    *
64    * @return true if nested anchors were found
65    */

66   public static final boolean getText(StringBuffer JavaDoc sb, Node node,
67                                       boolean abortOnNestedAnchors) {
68     if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) {
69       return true;
70     }
71     return false;
72   }
73
74
75   /**
76    * This is a convinience method, equivalent to {@link
77    * #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
78    *
79    */

80   public static final void getText(StringBuffer JavaDoc sb, Node node) {
81     getText(sb, node, false);
82   }
83
84   // returns true if abortOnNestedAnchors is true and we find nested
85
// anchors
86
private static final boolean getTextHelper(StringBuffer JavaDoc sb, Node node,
87                                              boolean abortOnNestedAnchors,
88                                              int anchorDepth) {
89     if ("script".equalsIgnoreCase(node.getNodeName())) {
90       return false;
91     }
92     if ("style".equalsIgnoreCase(node.getNodeName())) {
93       return false;
94     }
95     if (abortOnNestedAnchors && "a".equalsIgnoreCase(node.getNodeName())) {
96       anchorDepth++;
97       if (anchorDepth > 1)
98         return true;
99     }
100     if (node.getNodeType() == Node.COMMENT_NODE) {
101       return false;
102     }
103     if (node.getNodeType() == Node.TEXT_NODE) {
104       // cleanup and trim the value
105
String JavaDoc text = node.getNodeValue();
106       text = text.replaceAll("\\s+", " ");
107       text = text.trim();
108       if (text.length() > 0) {
109         if (sb.length() > 0) sb.append(' ');
110         sb.append(text);
111       }
112     }
113     boolean abort = false;
114     NodeList children = node.getChildNodes();
115     if (children != null) {
116       int len = children.getLength();
117       for (int i = 0; i < len; i++) {
118         if (getTextHelper(sb, children.item(i),
119                           abortOnNestedAnchors, anchorDepth)) {
120           abort = true;
121           break;
122         }
123       }
124     }
125     return abort;
126   }
127
128   /**
129    * This method takes a {@link StringBuffer} and a DOM {@link Node},
130    * and will append the content text found beneath the first
131    * <code>title</code> node to the <code>StringBuffer</code>.
132    *
133    * @return true if a title node was found, false otherwise
134    */

135   public static final boolean getTitle(StringBuffer JavaDoc sb, Node node) {
136     if (node.getNodeType() == Node.ELEMENT_NODE) {
137       if ("title".equalsIgnoreCase(node.getNodeName())) {
138         getText(sb, node);
139         return true;
140       }
141     }
142     NodeList children = node.getChildNodes();
143     if (children != null) {
144       int len = children.getLength();
145       for (int i = 0; i < len; i++) {
146         if (getTitle(sb, children.item(i))) {
147           return true;
148         }
149       }
150     }
151     return false;
152   }
153
154   /** If Node contains a BASE tag then it's HREF is returned. */
155   public static final URL JavaDoc getBase(Node node) {
156
157     // is this node a BASE tag?
158
if (node.getNodeType() == Node.ELEMENT_NODE) {
159       if ("base".equalsIgnoreCase(node.getNodeName())) {
160         NamedNodeMap attrs = node.getAttributes();
161         for (int i= 0; i < attrs.getLength(); i++ ) {
162           Node attr = attrs.item(i);
163           if ("href".equalsIgnoreCase(attr.getNodeName())) {
164             try {
165               return new URL JavaDoc(attr.getNodeValue());
166             } catch (MalformedURLException JavaDoc e) {}
167           }
168         }
169       }
170     }
171     
172     // does it contain a base tag?
173
NodeList children = node.getChildNodes();
174     if (children != null) {
175       int len = children.getLength();
176       for (int i = 0; i < len; i++) {
177         URL JavaDoc base = getBase(children.item(i));
178         if (base != null)
179           return base;
180       }
181     }
182
183     // no.
184
return null;
185   }
186
187
188   private static boolean hasOnlyWhiteSpace(Node node) {
189     String JavaDoc val= node.getNodeValue();
190     for (int i= 0; i < val.length(); i++) {
191       if (!Character.isWhitespace(val.charAt(i)))
192         return false;
193     }
194     return true;
195   }
196
197   // this only covers a few cases of empty links that are symptomatic
198
// of nekohtml's DOM-fixup process...
199
private static boolean shouldThrowAwayLink(Node node, NodeList children,
200                                               int childLen, LinkParams params) {
201     if (childLen == 0) {
202       // this has no inner structure
203
if (params.childLen == 0) return false;
204       else return true;
205     } else if ((childLen == 1)
206                && (children.item(0).getNodeType() == Node.ELEMENT_NODE)
207                && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) {
208       // single nested link
209
return true;
210
211     } else if (childLen == 2) {
212
213       Node c0= children.item(0);
214       Node c1= children.item(1);
215
216       if ((c0.getNodeType() == Node.ELEMENT_NODE)
217           && (params.elName.equalsIgnoreCase(c0.getNodeName()))
218           && (c1.getNodeType() == Node.TEXT_NODE)
219           && hasOnlyWhiteSpace(c1) ) {
220         // single link followed by whitespace node
221
return true;
222       }
223
224       if ((c1.getNodeType() == Node.ELEMENT_NODE)
225           && (params.elName.equalsIgnoreCase(c1.getNodeName()))
226           && (c0.getNodeType() == Node.TEXT_NODE)
227           && hasOnlyWhiteSpace(c0) ) {
228         // whitespace node followed by single link
229
return true;
230       }
231
232     } else if (childLen == 3) {
233       Node c0= children.item(0);
234       Node c1= children.item(1);
235       Node c2= children.item(2);
236       
237       if ((c1.getNodeType() == Node.ELEMENT_NODE)
238           && (params.elName.equalsIgnoreCase(c1.getNodeName()))
239           && (c0.getNodeType() == Node.TEXT_NODE)
240           && (c2.getNodeType() == Node.TEXT_NODE)
241           && hasOnlyWhiteSpace(c0)
242           && hasOnlyWhiteSpace(c2) ) {
243         // single link surrounded by whitespace nodes
244
return true;
245       }
246     }
247
248     return false;
249   }
250
251   /**
252    * This method finds all anchors below the supplied DOM
253    * <code>node</code>, and creates appropriate {@link Outlink}
254    * records for each (relative to the supplied <code>base</code>
255    * URL), and adds them to the <code>outlinks</code> {@link
256    * ArrayList}.
257    *
258    * <p>
259    *
260    * Links without inner structure (tags, text, etc) are discarded, as
261    * are links which contain only single nested links and empty text
262    * nodes (this is a common DOM-fixup artifact, at least with
263    * nekohtml).
264    */

265   public static final void getOutlinks(URL JavaDoc base, ArrayList JavaDoc outlinks,
266                                        Node node) {
267
268     NodeList children = node.getChildNodes();
269     int childLen= 0;
270     if (children != null)
271       childLen= children.getLength();
272   
273     if (node.getNodeType() == Node.ELEMENT_NODE) {
274       LinkParams params = (LinkParams)linkParams.get(node.getNodeName().toLowerCase());
275       if (params != null) {
276         if (shouldThrowAwayLink(node, children, childLen, params)) {
277           // this has no inner structure or just a single nested
278
// anchor-- toss it!
279
} else {
280
281           StringBuffer JavaDoc linkText = new StringBuffer JavaDoc();
282           getText(linkText, node, true);
283
284           NamedNodeMap attrs = node.getAttributes();
285           String JavaDoc target = null;
286           for (int i= 0; i < attrs.getLength(); i++ ) {
287             if (params.attrName.equalsIgnoreCase(attrs.item(i).getNodeName())) {
288               target = attrs.item(i).getNodeValue();
289               break;
290             }
291           }
292           if (target != null)
293             try {
294               URL JavaDoc url = new URL JavaDoc(base, target);
295               outlinks.add(new Outlink(url.toString(),
296                                        linkText.toString().trim()));
297             } catch (MalformedURLException JavaDoc e) {
298               // don't care
299
}
300         }
301         // this should not have any children, skip them
302
if (params.childLen == 0) return;
303       }
304     }
305     for ( int i = 0; i < childLen; i++ ) {
306       getOutlinks(base, outlinks, children.item(i));
307     }
308   }
309
310 }
311
312
Popular Tags