ExtractLinksDOMVisitor


1   package org.jahia.services.htmlparser;
2   
3   import org.w3c.dom.Document  ;
4   import org.w3c.dom.Node  ;
5   import org.w3c.dom.NodeList  ;
6   import java.util.ArrayList  ;
7   import org.w3c.dom.Element  ;
8   import java.util.Map  ;
9   import java.util.HashMap  ;
10  import java.util.Set  ;
11  import java.util.HashSet  ;
12  import java.util.Iterator  ;
13  
14  /**
15   * <p>Title: HTML DOM Visitor that extracts all the links in the document
16   * based on a set of tags specified in the W3C HTML specification</p>
17   * <p>Description: </p>
18   * <p>Copyright: Copyright (c) 2002</p>
19   * <p>Company: Jahia Ltd</p>
20   * @author Serge Huber
21   * @version 1.0
22   */
23  
24  public class ExtractLinksDOMVisitor implements HtmlDOMVisitor {
25  
26      private static org.apache.log4j.Logger logger =
27              org.apache.log4j.Logger.getLogger(ExtractLinksDOMVisitor.class);
28  
29      private ArrayList   documentLinks = new ArrayList  ();
30  
31      private String  [][] tagAndAttributesWithLinks = {
32          /* This list is based on the HTML 4.01 DTD, available here :
33             http://www.w3.org/TR/html401/sgml/dtd.html
34           */
35          { "a", "href" },
36          { "img", "src" },
37          { "img", "longdesc" },
38          { "img", "usemap" },
39          { "area", "href" },
40          { "link", "href" },
41          { "object", "classid" },
42          { "object", "codebase" },
43          { "object", "data" },
44          { "object", "usemap" },
45          { "q", "cite" },
46          { "blockquote", "cite" },
47          { "ins", "cite" },
48          { "del", "cite" },
49          { "form", "action" },
50          { "input", "src" },
51          { "input", "usemap" },
52          { "head", "profile" },
53          { "base", "href" },
54          { "script", "src" },
55          { "script", "for" }
56      };
57  
58      private Map   linkAttributesByTagName = new HashMap  ();
59  
60      public ExtractLinksDOMVisitor() {
61          for (int i = 0; i < tagAndAttributesWithLinks.length; i++) {
62              String   tagName = tagAndAttributesWithLinks[i][0];
63              String   attributeName = tagAndAttributesWithLinks[i][1];
64              Set   tagAttributes = null;
65              if (!linkAttributesByTagName.containsKey(tagName)) {
66                  tagAttributes = new HashSet  ();
67                  linkAttributesByTagName.put(tagName, tagAttributes);
68              } else {
69                  tagAttributes = (Set  ) linkAttributesByTagName.get(tagName);
70              }
71              tagAttributes.add(attributeName);
72          }
73      }
74  
75      public void init(int siteId) {
76      }
77  
78      public Document   parseDOM(Document   doc) {
79          if ( doc != null ){
80              extractNodeLinks(doc.getDocumentElement());
81          }
82          return doc;
83      }
84  
85      private void extractNodeLinks(Node   node){
86  
87          if ( node == null ){
88              return;
89          }
90  
91          if (node.getNodeType() == Node.ELEMENT_NODE) {
92              Element   curElement = (Element  ) node;
93              Set   linkAttributes = (Set  ) linkAttributesByTagName.get(curElement.getTagName().toLowerCase());
94              if (linkAttributes != null) {
95                  Iterator   attributeIter = linkAttributes.iterator();
96                  while (attributeIter.hasNext()) {
97                      String   curLinkAttribute = (String  ) attributeIter.next();
98                      String   curLink = curElement.getAttribute(curLinkAttribute);
99                      if (curLink != null) {
100                         logger.debug("Found link [" + curLink + "] on tag [" + curElement.getTagName() + "] with attribute [" + curLinkAttribute + "]");
101                         documentLinks.add(curLink);
102                     }
103                 }
104             }
105         }
106 
107         NodeList   childNodes = node.getChildNodes();
108         for (int i=0; i < childNodes.getLength(); i++) {
109             extractNodeLinks(childNodes.item(i));
110         }
111     }
112 
113     /**
114      * @return an ArrayList of String objects that contain all the links
115      * in the HTML DOM that we parsed.
116      */
117     public ArrayList   getDocumentLinks() {
118         return documentLinks;
119     }
120 
121 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags