package org.jahia.services.htmlparser;

import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import java.util.ArrayList;
import org.w3c.dom.Element;
import java.util.Map;
import java.util.HashMap;
import java.util.Set;
import java.util.HashSet;
import java.util.Iterator;

/**
 * DOM visitor that walks a document and collects the values of every
 * attribute known to carry a link (for example <code>a/@href</code> or
 * <code>img/@src</code>).
 *
 * <p>The document itself is never modified. Collected values accumulate in
 * this instance across successive calls to {@link #parseDOM(Document)} and
 * are exposed through {@link #getDocumentLinks()}.</p>
 */
public class ExtractLinksDOMVisitor implements HtmlDOMVisitor {

    private static final org.apache.log4j.Logger logger =
            org.apache.log4j.Logger.getLogger(ExtractLinksDOMVisitor.class);

    /** Link values found so far, in document traversal order. */
    private final ArrayList documentLinks = new ArrayList();

    /**
     * Pairs of { lower-case tag name, attribute name } whose attribute value
     * is treated as a link.
     */
    private final String[][] tagAndAttributesWithLinks = {
        { "a", "href" },
        { "img", "src" },
        { "img", "longdesc" },
        { "img", "usemap" },
        { "area", "href" },
        { "link", "href" },
        { "object", "classid" },
        { "object", "codebase" },
        { "object", "data" },
        { "object", "usemap" },
        { "q", "cite" },
        { "blockquote", "cite" },
        { "ins", "cite" },
        { "del", "cite" },
        { "form", "action" },
        { "input", "src" },
        { "input", "usemap" },
        { "head", "profile" },
        { "base", "href" },
        { "script", "src" },
        { "script", "for" }
    };

    /**
     * Index built from {@link #tagAndAttributesWithLinks}: tag name (String)
     * to the Set of attribute names (String) to inspect on that tag.
     */
    private final Map linkAttributesByTagName = new HashMap();

    /**
     * Builds the tag-name to attribute-names index from the static table.
     */
    public ExtractLinksDOMVisitor() {
        for (int i = 0; i < tagAndAttributesWithLinks.length; i++) {
            String tagName = tagAndAttributesWithLinks[i][0];
            String attributeName = tagAndAttributesWithLinks[i][1];
            Set tagAttributes = (Set) linkAttributesByTagName.get(tagName);
            if (tagAttributes == null) {
                // First attribute seen for this tag: create its set.
                tagAttributes = new HashSet();
                linkAttributesByTagName.put(tagName, tagAttributes);
            }
            tagAttributes.add(attributeName);
        }
    }

    /**
     * No-op: this visitor does not use the site identifier.
     *
     * @param siteId ignored
     */
    public void init(int siteId) {
    }

    /**
     * Collects the links found in the given document.
     *
     * @param doc the document to scan; may be <code>null</code>, in which
     *            case nothing is collected
     * @return the same <code>doc</code> reference that was passed in
     */
    public Document parseDOM(Document doc) {
        if (doc != null) {
            extractNodeLinks(doc.getDocumentElement());
        }
        return doc;
    }

    /**
     * Recursively visits <code>node</code> and its descendants, appending to
     * {@link #documentLinks} the value of every link attribute that is
     * actually set on a matching element.
     *
     * @param node the subtree root to scan; <code>null</code> is ignored
     */
    private void extractNodeLinks(Node node) {
        if (node == null) {
            return;
        }

        if (node.getNodeType() == Node.ELEMENT_NODE) {
            Element curElement = (Element) node;
            // The index is keyed by lower-case names, so normalise the tag.
            Set linkAttributes = (Set) linkAttributesByTagName.get(
                    curElement.getTagName().toLowerCase());
            if (linkAttributes != null) {
                Iterator attributeIter = linkAttributes.iterator();
                while (attributeIter.hasNext()) {
                    String curLinkAttribute = (String) attributeIter.next();
                    String curLink = curElement.getAttribute(curLinkAttribute);
                    // Element.getAttribute returns "" (not null) when the
                    // attribute is absent, so a null check alone would record
                    // an empty link for every missing attribute. The null
                    // check is kept for non-conforming DOM implementations.
                    if (curLink != null && curLink.length() > 0) {
                        if (logger.isDebugEnabled()) {
                            logger.debug("Found link [" + curLink + "] on tag [" + curElement.getTagName() + "] with attribute [" + curLinkAttribute + "]");
                        }
                        documentLinks.add(curLink);
                    }
                }
            }
        }

        NodeList childNodes = node.getChildNodes();
        for (int i = 0; i < childNodes.getLength(); i++) {
            extractNodeLinks(childNodes.item(i));
        }
    }

    /**
     * Returns the links collected so far.
     *
     * @return the live internal list of link values (Strings); it is not a
     *         copy, so later parsing adds to the same list
     */
    public ArrayList getDocumentLinks() {
        return documentLinks;
    }

}