KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > jahia > services > htmlparser > ExtractLinksDOMVisitor


1 package org.jahia.services.htmlparser;
2
3 import org.w3c.dom.Document JavaDoc;
4 import org.w3c.dom.Node JavaDoc;
5 import org.w3c.dom.NodeList JavaDoc;
6 import java.util.ArrayList JavaDoc;
7 import org.w3c.dom.Element JavaDoc;
8 import java.util.Map JavaDoc;
9 import java.util.HashMap JavaDoc;
10 import java.util.Set JavaDoc;
11 import java.util.HashSet JavaDoc;
12 import java.util.Iterator JavaDoc;
13
14 /**
15  * <p>Title: HTML DOM Visitor that extracts all the links in the document
16  * based on a set of tags specified in the W3C HTML specification</p>
17  * <p>Description: </p>
18  * <p>Copyright: Copyright (c) 2002</p>
19  * <p>Company: Jahia Ltd</p>
20  * @author Serge Huber
21  * @version 1.0
22  */

23
24 public class ExtractLinksDOMVisitor implements HtmlDOMVisitor {
25
26     private static org.apache.log4j.Logger logger =
27             org.apache.log4j.Logger.getLogger(ExtractLinksDOMVisitor.class);
28
29     private ArrayList JavaDoc documentLinks = new ArrayList JavaDoc();
30
31     private String JavaDoc[][] tagAndAttributesWithLinks = {
32         /* This list is based on the HTML 4.01 DTD, available here :
33            http://www.w3.org/TR/html401/sgml/dtd.html
34          */

35         { "a", "href" },
36         { "img", "src" },
37         { "img", "longdesc" },
38         { "img", "usemap" },
39         { "area", "href" },
40         { "link", "href" },
41         { "object", "classid" },
42         { "object", "codebase" },
43         { "object", "data" },
44         { "object", "usemap" },
45         { "q", "cite" },
46         { "blockquote", "cite" },
47         { "ins", "cite" },
48         { "del", "cite" },
49         { "form", "action" },
50         { "input", "src" },
51         { "input", "usemap" },
52         { "head", "profile" },
53         { "base", "href" },
54         { "script", "src" },
55         { "script", "for" }
56     };
57
58     private Map JavaDoc linkAttributesByTagName = new HashMap JavaDoc();
59
60     public ExtractLinksDOMVisitor() {
61         for (int i = 0; i < tagAndAttributesWithLinks.length; i++) {
62             String JavaDoc tagName = tagAndAttributesWithLinks[i][0];
63             String JavaDoc attributeName = tagAndAttributesWithLinks[i][1];
64             Set JavaDoc tagAttributes = null;
65             if (!linkAttributesByTagName.containsKey(tagName)) {
66                 tagAttributes = new HashSet JavaDoc();
67                 linkAttributesByTagName.put(tagName, tagAttributes);
68             } else {
69                 tagAttributes = (Set JavaDoc) linkAttributesByTagName.get(tagName);
70             }
71             tagAttributes.add(attributeName);
72         }
73     }
74
75     public void init(int siteId) {
76     }
77
78     public Document JavaDoc parseDOM(Document JavaDoc doc) {
79         if ( doc != null ){
80             extractNodeLinks(doc.getDocumentElement());
81         }
82         return doc;
83     }
84
85     private void extractNodeLinks(Node JavaDoc node){
86
87         if ( node == null ){
88             return;
89         }
90
91         if (node.getNodeType() == Node.ELEMENT_NODE) {
92             Element JavaDoc curElement = (Element JavaDoc) node;
93             Set JavaDoc linkAttributes = (Set JavaDoc) linkAttributesByTagName.get(curElement.getTagName().toLowerCase());
94             if (linkAttributes != null) {
95                 Iterator JavaDoc attributeIter = linkAttributes.iterator();
96                 while (attributeIter.hasNext()) {
97                     String JavaDoc curLinkAttribute = (String JavaDoc) attributeIter.next();
98                     String JavaDoc curLink = curElement.getAttribute(curLinkAttribute);
99                     if (curLink != null) {
100                         logger.debug("Found link [" + curLink + "] on tag [" + curElement.getTagName() + "] with attribute [" + curLinkAttribute + "]");
101                         documentLinks.add(curLink);
102                     }
103                 }
104             }
105         }
106
107         NodeList JavaDoc childNodes = node.getChildNodes();
108         for (int i=0; i < childNodes.getLength(); i++) {
109             extractNodeLinks(childNodes.item(i));
110         }
111     }
112
113     /**
114      * @return an ArrayList of String objects that contain all the links
115      * in the HTML DOM that we parsed.
116      */

117     public ArrayList JavaDoc getDocumentLinks() {
118         return documentLinks;
119     }
120
121 }
Popular Tags