1 18 19 package org.apache.jmeter.protocol.http.parser; 20 21 import java.io.ByteArrayInputStream ; 22 import java.net.MalformedURLException ; 23 import java.net.URL ; 24 import java.util.Iterator ; 25 26 import org.apache.jorphan.logging.LoggingManager; 27 import org.apache.log.Logger; 28 import org.w3c.dom.Document ; 29 import org.w3c.dom.NamedNodeMap ; 30 import org.w3c.dom.Node ; 31 import org.w3c.dom.NodeList ; 32 import org.w3c.tidy.Tidy; 33 import org.xml.sax.SAXException ; 34 35 40 class JTidyHTMLParser extends HTMLParser 41 { 42 43 transient private static Logger log = LoggingManager.getLoggerForClass(); 44 45 protected JTidyHTMLParser() 46 { 47 super(); 48 } 49 50 protected boolean isReusable() 51 { 52 return true; 53 } 54 55 58 public Iterator getEmbeddedResourceURLs(byte[] html, URL baseUrl, URLCollection urls) 59 throws HTMLParseException 60 { 61 Document dom = null; 62 try 63 { 64 dom = (Document )getDOM(html); 65 } 66 catch(SAXException se) 67 { 68 throw new HTMLParseException(se); 69 } 70 71 73 scanNodes(dom,urls, baseUrl); 74 75 return urls.iterator(); 76 } 77 78 86 private URL scanNodes(Node node, URLCollection urls, URL baseUrl) throws HTMLParseException 87 { 88 if ( node == null ) { 89 return baseUrl; 90 } 91 92 String name = node.getNodeName(); 93 94 int type = node.getNodeType(); 95 96 switch ( type ) { 97 98 case Node.DOCUMENT_NODE: 99 scanNodes(((Document )node).getDocumentElement(),urls,baseUrl); 100 break; 101 102 case Node.ELEMENT_NODE: 103 104 NamedNodeMap attrs = node.getAttributes(); 105 if (name.equalsIgnoreCase("base")) 106 { 107 String tmp=getValue(attrs,"href"); 108 if (tmp!=null) try 109 { 110 baseUrl= new URL (baseUrl, tmp); 111 } 112 catch (MalformedURLException e) 113 { 114 throw new HTMLParseException(e); 115 } 116 break; 117 } 118 119 if (name.equalsIgnoreCase("img")) 120 { 121 urls.addURL(getValue(attrs,"src"),baseUrl); 122 break; 123 } 124 125 if (name.equalsIgnoreCase("applet")) 126 { 127 urls.addURL(getValue(attrs,"code"),baseUrl); 128 break; 129 } 130 if (name.equalsIgnoreCase("input")) 131 { 132 String SRC=getValue(attrs,"src"); 133 String typ=getValue(attrs,"type"); 134 if ((src!=null) &&(typ.equalsIgnoreCase("image")) ){ 135 urls.addURL(src,baseUrl); 136 } 137 break; 138 } 139 if (name.equalsIgnoreCase("link") 140 && getValue(attrs,"rel").equalsIgnoreCase("stylesheet")) 141 { 142 urls.addURL(getValue(attrs,"href"),baseUrl); 143 break; 144 } 145 if (name.equalsIgnoreCase("script")) 146 { 147 urls.addURL(getValue(attrs,"src"),baseUrl); 148 break; 149 } 150 if (name.equalsIgnoreCase("frame")) 151 { 152 urls.addURL(getValue(attrs,"src"),baseUrl); 153 break; 154 } 155 String back=getValue(attrs,"background"); 156 if (back != null){ 157 urls.addURL(back,baseUrl); 158 break; 159 } 160 161 NodeList children = node.getChildNodes(); 162 if ( children != null ) { 163 int len = children.getLength(); 164 for ( int i = 0; i < len; i++ ) { 165 baseUrl= scanNodes(children.item(i),urls,baseUrl); 166 } 167 } 168 break; 169 170 173 } 174 175 return baseUrl; 176 177 } 178 179 185 private String getValue(NamedNodeMap attrs, String attname) 186 { 187 String v=null; 188 Node n = attrs.getNamedItem(attname); 189 if (n != null) v=n.getNodeValue(); 190 return v; 191 } 192 193 198 private static Tidy getTidyParser() 199 { 200 log.debug("Start : getParser"); 201 Tidy tidy = new Tidy(); 202 tidy.setCharEncoding(org.w3c.tidy.Configuration.UTF8); 203 tidy.setQuiet(true); 204 tidy.setShowWarnings(false); 205 if(log.isDebugEnabled()) 206 { 207 log.debug("getParser : tidy parser created - " + tidy); 208 } 209 log.debug("End : getParser"); 210 return tidy; 211 } 212 213 221 private static Node getDOM(byte [] text) throws SAXException 222 { 223 log.debug("Start : getDOM"); 224 Node node = getTidyParser().parseDOM(new 225 ByteArrayInputStream (text), null); 226 if(log.isDebugEnabled()) 227 { 228 log.debug("node : " + node); 229 } 230 log.debug("End : getDOM"); 231 return node; 232 } 233 } 234 | Popular Tags |