package net.matuschek.spider.docfilter;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.StringTokenizer;

import net.matuschek.http.HttpDoc;
import net.matuschek.util.NullWriter;

import org.w3c.dom.Node;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.w3c.dom.Document;

import org.w3c.tidy.Tidy;


/**
 * Document filter that rewrites links inside HTML documents so that
 * links pointing to the same host as the document become relative
 * paths. Links to other hosts, non-HTTP links and links that cannot
 * be parsed are left untouched.
 *
 * <p>The document is parsed with JTidy, the <code>href</code> attribute
 * of <code>a</code>/<code>area</code> elements and the <code>src</code>
 * attribute of <code>img</code>/<code>frame</code> elements are
 * rewritten, and the pretty-printed result replaces the content of
 * the document.</p>
 */
public class LinkLocalizer implements DocumentFilter
{

  /** While <code>false</code>, {@link #process} returns documents unchanged. */
  protected boolean enabled=true;


  /**
   * Rewrites the links of the given document in place.
   *
   * @param input the document to process, may be <code>null</code>
   * @return <code>null</code> if <code>input</code> is <code>null</code>;
   * otherwise the same <code>HttpDoc</code> object. Its content is only
   * replaced if the document is HTML, has content and the filter is
   * enabled.
   * @throws FilterException if the parsed DOM tree contains a
   * <code>null</code> node (this includes the parser returning no
   * document at all)
   */
  public HttpDoc process(HttpDoc input)
    throws FilterException
  {
    if (input == null) {
      return null;
    }

    if (! input.isHTML()) {
      return input;
    }

    if (! enabled) {
      return input;
    }

    // a document without content cannot be parsed; pass it through
    // instead of failing with a NullPointerException in the stream
    byte[] content = input.getContent();
    if (content == null) {
      return input;
    }

    ByteArrayInputStream bis = new ByteArrayInputStream(content);

    Tidy tidy = new Tidy();
    tidy.setUpperCaseTags(false);
    tidy.setUpperCaseAttrs(false);
    // discard the parser's diagnostics
    tidy.setErrout(new PrintWriter(new NullWriter()));

    Document doc = tidy.parseDOM(bis,null);

    rewriteDOM(doc,input.getURL());

    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    tidy.pprint(doc,bos);

    input.setContent(bos.toByteArray());

    return input;
  }


  /**
   * Enables this filter.
   */
  public void enable() {
    this.enabled=true;
  }


  /**
   * Disables this filter; {@link #process} will then return its
   * input unchanged.
   */
  public void disable() {
    this.enabled=false;
  }


  /**
   * Tells whether this filter is enabled.
   *
   * @return <code>true</code> if the filter is enabled
   */
  public boolean isEnabled() {
    return this.enabled;
  }


  /**
   * Recursively walks the DOM tree below (and including) the given node
   * and localizes the link attribute of every <code>a</code>,
   * <code>area</code>, <code>img</code> and <code>frame</code> element.
   *
   * @param node root of the subtree to rewrite
   * @param url URL of the document, used as context for the links
   * @throws FilterException if <code>node</code> or one of its
   * descendants is <code>null</code>
   */
  private void rewriteDOM(Node node, URL url)
    throws FilterException
  {

    if (node==null) {
      throw new FilterException("Got a null node");
    }

    if (node instanceof Element) {
      // names are compared in lower case; process() configures the
      // parser with setUpperCaseTags(false)
      String name = node.getNodeName();
      if (name.equals("a")
          || name.equals("area")) {
        localizeAttrib(node,"href",url);

      } else if (name.equals("img")
                 || name.equals("frame")) {
        localizeAttrib(node,"src",url);

      }
    }

    NodeList childs = node.getChildNodes();

    for (int i=0; i<childs.getLength(); i++) {
      rewriteDOM(childs.item(i),url);
    }

  }


  /**
   * Replaces the value of the given attribute by its localized form.
   * Values that are empty or do not contain a "/" are left unchanged.
   *
   * @param node the element that carries the attribute; must be an
   * <code>Element</code>
   * @param attribute name of the attribute to rewrite
   * @param context URL of the document that contains the element
   */
  private void localizeAttrib(Node node,
                              String attribute,
                              URL context)
  {
    Element el = (Element)node;
    String oldValue = el.getAttribute(attribute);

    if (!oldValue.equals("") && oldValue.indexOf("/")!=-1) {
      String newValue = localizeURL(oldValue,context);
      el.setAttribute(attribute, newValue);
    }
  }


  /**
   * Converts a link into a path relative to the directory of the
   * context document, if the link is an HTTP link to the same host
   * as the context. A path ending in "/" (or an empty path) gets
   * "index.html" appended, and a fragment ("#...") of the link is
   * preserved. The query part of the link is not included in the
   * result.
   *
   * @param urlStr the link as found in the document
   * @param context URL of the document that contains the link
   * @return the relative link, or <code>urlStr</code> unchanged if the
   * link is malformed, is not an HTTP link, has no context, points to
   * another host or has a path starting with "../"
   */
  private String localizeURL(String urlStr, URL context) {
    URL url;
    try {
      url = new URL(context, urlStr);
    } catch (MalformedURLException e) {
      // not a valid URL, do not touch it
      return urlStr;
    }

    if (! url.getProtocol().equalsIgnoreCase("http")) {
      // only HTTP links are rewritten
      return urlStr;
    }

    if ((context == null)
        || (! context.getHost().equalsIgnoreCase(url.getHost()))) {
      // only links to the host of the document are rewritten
      return urlStr;
    }

    String path = url.getPath();

    // a resolved path should never start with "../"; if it does the
    // link is broken and is better left alone
    if (path.startsWith("../")) {
      return urlStr;
    }

    // "http://host" addresses the same resource as "http://host/"
    if (path.length() == 0) {
      path = "/";
    }

    // a directory link points to the index document of that directory;
    // this must happen before the fragment is appended, otherwise
    // "dir/#top" would not be recognized as a directory
    if (path.charAt(path.length()-1) == '/') {
      path = path+"index.html";
    }

    String ref = url.getRef();
    if ((ref != null) && (! ref.equals(""))) {
      // keep the fragment
      path = path+"#"+ref;
    }

    // use the adjusted path, not url.getPath(), so that the
    // "index.html" suffix and the fragment are not lost
    return localizePath(path,context.getPath());
  }


  /**
   * Builds a relative path to an absolute path of the same host by
   * prepending one ".." for every directory level of the context path.
   * If the context lies in the root directory, only a leading "/" of
   * the path is removed.
   *
   * @param path absolute path of the link target
   * @param context path of the document that contains the link
   * @return <code>path</code> expressed relative to the directory of
   * <code>context</code>
   */
  private String localizePath(String path, String context) {
    // number of directories in the context path; the last token is
    // the document itself unless the context ends with "/"
    StringTokenizer st = new StringTokenizer(context,"/");
    int depth = st.countTokens();
    if (! context.endsWith("/")) {
      depth--;
    }

    StringBuffer sb = new StringBuffer();
    if (depth>0) {
      for (int i=0; i<depth; i++) {
        sb.append("/..");
      }
      // drop the leading "/" of the "/../.." sequence
      sb.deleteCharAt(0);
    } else {
      if (path.startsWith("/")) {
        path=path.substring(1);
      }
    }
    sb.append(path);

    return sb.toString();
  }

}