1 5 package net.matuschek.http; 6 7 import java.io.*; 8 import java.net.*; 9 import java.util.*; 10 import java.util.zip.ZipEntry ; 11 import java.util.zip.ZipFile ; 12 import java.util.zip.ZipOutputStream ; 13 14 import net.matuschek.util.MD5; 15 import org.apache.log4j.Category; 16 17 26 public class HttpDocCache implements HttpDocManager { 27 28 29 protected final static String CONTENT_DUPLICATE = "Content-Duplicate"; 30 31 32 public boolean useMD5 = true; 33 34 35 protected static Category log = 36 Category.getInstance(HttpDocCache.class.getName()); 37 38 39 private Collection urls = new LinkedList(); 40 41 42 protected String storagedir; 43 44 45 protected File storageDirectoryFile = null; 46 47 48 protected final static String LINKS = "links" + File.separator; 49 50 51 protected final static String CONTENT = "content" + File.separator; 52 53 54 protected final static String DOCUMENTS = "documents" + File.separator; 55 56 60 public HttpDocCache(String storageDirectory) { 61 setStorageDir(storageDirectory); 62 } 63 64 private FileOutputStream storageDirectoryStream = null; 65 66 70 private void setStorageDir(String newStoragedir) { 71 storagedir = newStoragedir; 72 73 if (!storagedir.endsWith(File.separator)) { 74 storagedir = storagedir + File.separator; 75 } 76 77 File storagedirFile = new File (storagedir + DOCUMENTS); 79 if (!storagedirFile.exists()) { 80 storagedirFile.mkdirs(); 81 } 82 File contentFile = new File (storagedir + CONTENT); 83 if (!contentFile.exists()) { 84 contentFile.mkdirs(); 85 } 86 87 if (useMD5) { 88 storageDirectoryFile = new File (storagedir + "directory.csv"); 89 try { 90 storageDirectoryStream = new FileOutputStream(storageDirectoryFile.getPath(), true); 91 if (!storageDirectoryFile.exists()) { 92 storageDirectoryStream.write(("Path,URL" + LF).getBytes()); 93 } 94 } catch (Exception e) { 95 log.error(e.getMessage()); 96 } 97 } 98 } 99 100 final static String QUOTE = "\""; 101 final static String LF = System.getProperty("line.separator"); 102 103 111 public void storeDocument(HttpDoc doc) throws DocManagerException { 112 List links = doc.getLinks(); 113 114 if (doc.isCached()) { 116 return; 117 } 118 119 String filename = generateFilename(doc.getURL().toExternalForm()); 121 122 String filepath = storagedir + DOCUMENTS + filename; 123 checkStoragePathFor(DOCUMENTS, filename); 124 125 try { 126 File f = new File (filepath + ".zip"); 127 if (!f.exists()) { 128 writeDirectoryInfo(doc, filename); 129 } 130 131 OutputStream fs = new BufferedOutputStream(new FileOutputStream(f)); 133 ZipOutputStream zos = new ZipOutputStream (fs); 134 zos.setLevel(9); 135 136 try { 137 storeContent(doc); 139 writeHeadersToZipFile(doc, zos); 140 writeUrlToZipFile(doc, zos); 141 if (links != null) { 142 writeLinksToZipFile(links, zos); 143 } 144 } catch (Throwable e){ 145 System.out.println(e); 146 } finally { 147 zos.close(); 148 fs.close(); 149 long date = doc.getDateAsMilliSeconds(); 150 f.setLastModified(date > 0 ? date : System.currentTimeMillis()); 151 } 152 } catch (IOException ioex) { 153 DocManagerException ex = new DocManagerException(ioex.getMessage()); 154 throw ex; 155 } 156 } 157 158 164 protected void writeDirectoryInfo(HttpDoc doc, String filename) 165 throws IOException { 166 if (storageDirectoryFile != null) { 167 synchronized(storageDirectoryFile) { 168 try { 169 String directoryInfo = QUOTE + filename + QUOTE + "," + QUOTE + doc.getURL() + QUOTE + LF; 170 storageDirectoryStream.write(directoryInfo.getBytes()); 171 } catch (Exception e) { 172 log.warn(e.getMessage()); 173 storageDirectoryStream.close(); 174 } 175 } 176 } 177 } 178 179 185 protected void writeContentToZipFile(HttpDoc doc, ZipOutputStream zos) 186 throws IOException { 187 String contenttype = doc.getHeaderValue(HttpHeader.CONTENT_TYPE); 188 String extension = getExtensionFromContenttype(contenttype); 189 ZipEntry zipEntry = new ZipEntry ("content" + extension); 190 long date = doc.getLastModifiedAsMilliSeconds(); 191 if (date < 0) { 192 date = doc.getDateAsMilliSeconds(); 193 } 194 zipEntry.setTime(date); 195 zos.putNextEntry(zipEntry); 196 zos.write(doc.getContent()); 197 zos.closeEntry(); 198 } 199 200 207 protected ZipEntry writeHeadersToZipFile(HttpDoc doc, ZipOutputStream zos) throws IOException { 208 StringBuffer comment = new StringBuffer (); 209 Vector headers = doc.getHttpHeader(); 210 for (Iterator iter = headers.iterator(); iter.hasNext();) { 211 HttpHeader header = (HttpHeader) iter.next(); 212 if (!header.getName().equals(CONTENT_DUPLICATE)) { 213 comment.append(header.toString()); 214 if (iter.hasNext()) { 215 comment.append(LF); 216 } 217 } 218 } 219 ZipEntry ze = new ZipEntry ("header"); 220 zos.putNextEntry(ze); 221 zos.write(comment.toString().getBytes()); 222 long date = doc.getDateAsMilliSeconds(); 223 ze.setTime(date > 0 ? date : System.currentTimeMillis()); 224 zos.closeEntry(); 225 return ze; 226 } 227 228 235 protected boolean readHeadersFromZipFile(HttpDoc doc, ZipFile zf) throws IOException { 236 ZipEntry ze = zf.getEntry("header"); 237 if (ze != null) { 238 InputStream is = zf.getInputStream(ze); 239 BufferedReader reader = new BufferedReader(new InputStreamReader(is)); 240 while (reader.ready()) { 241 String line = reader.readLine(); 242 int pos = line.indexOf(": "); 243 if (pos >= 0) { 244 String name = line.substring(0, pos); 245 String value = line.substring(pos + 2); 246 HttpHeader header = new HttpHeader(name, value); 247 doc.addHeader(header); 248 } 249 } 250 reader.close(); 251 return true; 252 } 253 return false; 254 } 255 256 263 protected boolean readLinksFromZipFile(HttpDoc doc, ZipFile zf) throws IOException { 264 ZipEntry ze = zf.getEntry("links"); 265 List links = doc.getLinks(); 266 if (links == null) { 267 links = new Vector(); 268 doc.setLinks(links); 269 } else { 270 links.clear(); 271 } 272 273 if (ze != null) { 274 InputStream is = zf.getInputStream(ze); 275 BufferedReader reader = new BufferedReader(new InputStreamReader(is)); 276 while (reader.ready()) { 277 String line = reader.readLine(); 278 if (line != null) { 279 URL url = new URL(line); 280 links.add(url); 281 } 282 } 283 reader.close(); 284 return true; 285 } 286 return false; 287 } 288 289 296 protected ZipEntry writeUrlToZipFile(HttpDoc doc, ZipOutputStream zos) throws IOException { 297 String url = doc.getURL().toString(); 298 ZipEntry ze = new ZipEntry ("url"); 299 zos.putNextEntry(ze); 300 zos.write(url.getBytes()); 301 long date = doc.getDateAsMilliSeconds(); 302 ze.setTime(date > 0 ? date : System.currentTimeMillis()); 303 zos.closeEntry(); 304 return ze; 305 } 306 307 312 private File getContentUsersFile(HttpDoc doc) { 313 File f = null; 314 byte[] content = doc.getContent(); 315 if (content.length != 0) { 316 String md5 = doc.getContentMD5(); 317 f = contentFile(md5, ".txt"); 318 } 319 return f; 320 } 321 322 326 public String findDuplicate(HttpDoc doc) throws IOException { 327 String duplicate = null; 328 File f = getContentUsersFile(doc); 329 if (f != null) { 330 String urlString = doc.getURL().toString(); 331 if (f.exists()) { 332 BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f))); 333 while (reader.ready()) { 334 String line = reader.readLine(); 335 if (line.equals(urlString)) { 336 break; 337 } else if (duplicate == null) { 338 duplicate = line; 339 } 340 } 341 reader.close(); 342 } 343 } 344 return duplicate; 345 } 346 347 351 protected void storeContent(HttpDoc doc) throws IOException { 352 if (doc.getContent().length == 0) 353 return; 354 File f = getContentUsersFile(doc); 355 String urlString = doc.getURL().toString(); 356 String md5 = doc.getContentMD5(); 357 358 boolean found = false; 360 if (f.exists()) { 361 BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f))); 362 try { 363 while (reader.ready()) { 364 String line = reader.readLine(); 365 if (line.equals(urlString)) { 366 found = true; break; 367 } 368 } 369 } finally { 370 reader.close(); 371 } 372 } 373 374 File fzip = contentFile(md5, ".zip"); 376 if (!fzip.exists()) { 377 checkStoragePathFor(CONTENT, useFirstCharactersAsDirectories(md5)); 378 OutputStream fs = new BufferedOutputStream(new FileOutputStream(fzip)); 379 ZipOutputStream zos = null; 380 try { 381 zos = new ZipOutputStream (fs); 382 zos.setLevel(9); 383 writeContentToZipFile(doc, zos); 384 } finally { 385 if (zos != null) { 386 zos.close(); 387 } else { 388 fs.close(); 389 } 390 } 391 } else { 392 fzip.setLastModified(System.currentTimeMillis()); 393 } 394 395 if (!found) { 397 FileOutputStream os = new FileOutputStream(f.getPath(), true); 398 try { 399 os.write((urlString + LF).getBytes()); 400 } finally { 401 os.close(); 402 } 403 } 404 } 405 406 411 protected void writeLinksToZipFile(List links, ZipOutputStream zs) 412 throws IOException { 413 HashSet storedLinks = new HashSet(); 414 ZipEntry zipEntry = new ZipEntry ("links"); 415 zs.putNextEntry(zipEntry); 416 for (Iterator iter = links.iterator(); iter.hasNext();) { 417 URL url = (URL) iter.next(); 418 if (!storedLinks.contains(url)) { 419 zs.write((url.toString() + LF).getBytes()); 420 storedLinks.add(url); 421 } 422 } 423 zs.closeEntry(); 424 } 425 426 434 public void processDocument(HttpDoc doc) throws DocManagerException { 435 log.info( 436 "Processing " 437 + doc.getURL().toExternalForm() 438 + doc.getHttpHeader()); 439 440 HttpHeader duplicate = doc.getHeader(CONTENT_DUPLICATE); 442 if (duplicate == null) { 443 urls.add(doc.getURL()); 444 } 445 } 446 447 452 public HttpDoc retrieveFromCache(java.net.URL url) { 453 HttpDoc doc = null; 454 File f = null; 455 try { 456 String filename0 = url.toExternalForm(); 457 String filename = generateFilename(filename0) + ".zip"; 458 f = new File (storagedir + DOCUMENTS + filename); 459 460 if (f.exists()) { 461 log.info("retrieve " + f); 462 463 doc = new HttpDoc(); 465 doc.setURL(url); 466 ZipFile zf = new ZipFile (f); 467 468 readHeadersFromZipFile(doc, zf); 470 471 readLinksFromZipFile(doc, zf); 473 474 doc.setCached(true); 475 476 String md5 = doc.getContentMD5(); 478 File contentFile = contentFile(md5, ".zip"); 479 if (contentFile.exists()) { 480 ZipFile contentZip = new ZipFile (contentFile); 481 readContentFromZipFile(doc, contentZip); 482 contentZip.close(); 483 } else { 484 doc.setContent(new byte[0]); 485 } 486 zf.close(); 487 } 488 } catch (Exception e) { 489 log.warn("removing invalid file " + f); 490 f.delete(); 491 doc = null; 492 } 493 494 return doc; 495 } 496 497 503 protected void readContentFromZipFile(HttpDoc doc, ZipFile contentZip) 504 throws IOException { 505 byte[] content = null; 506 for (Enumeration enumeration = contentZip.entries(); enumeration.hasMoreElements();) { 507 ZipEntry zipEntry = (ZipEntry ) enumeration.nextElement(); 508 if (zipEntry.getName().startsWith("content")) { 509 InputStream is = contentZip.getInputStream(zipEntry); 510 int length = (int) zipEntry.getSize(); 511 content = new byte[length]; 512 int startPos = 0; 513 while (startPos < length) { 514 startPos += is.read(content, startPos, length - startPos); 515 } 516 is.close(); 517 break; 518 } 519 } 520 doc.setContent(content); 521 } 522 523 528 public void removeDocument(URL url) { 529 HttpDoc doc = retrieveFromCache(url); 530 531 File f = null; 532 try { 533 String filename0 = url.toExternalForm(); 534 String filename = generateFilename(filename0) + ".zip"; 535 536 f = new File (storagedir + LINKS + filename); 537 if (f.exists()) { 538 f.delete(); 539 } 540 541 deleteContent(doc); 542 f = new File (storagedir + DOCUMENTS + filename); 543 if (f.exists()) { 544 f.delete(); 545 } 546 } catch (Exception ex) { 547 log.error(ex); 548 } 549 } 550 551 555 private void deleteContent(HttpDoc doc) throws IOException { 556 byte[] content = doc.getContent(); 557 if (content.length == 0) { 558 return; 559 } 560 String urlString = doc.getURL().toString(); 561 String md5 = doc.getContentMD5(); 562 File f = contentFile(md5, ".txt"); 563 ArrayList entries = new ArrayList(); 564 if (f.exists()) { 565 BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f))); 566 while (reader.ready()) { 567 String line = reader.readLine(); 568 if (!line.equals(urlString)) { 569 entries.add(line); 570 } 571 } 572 reader.close(); 573 } 574 if (entries.size() > 0) { 575 FileOutputStream os = new FileOutputStream(f.getPath(), false); 576 for (Iterator iter = entries.iterator(); iter.hasNext();) { 577 String line = (String ) iter.next(); 578 os.write((line + LF).getBytes()); 579 } 580 os.close(); 581 } else { 582 f.delete(); 583 File fzip = contentFile(md5, ".zip"); 584 if (fzip.exists()) { 585 fzip.delete(); 586 } 587 } 588 } 589 590 594 public String toString() { 595 StringBuffer sb = new StringBuffer (1000); 596 for (Iterator i = urls.iterator(); i.hasNext();) { 597 sb.append(i.next()).append("\n"); 598 } 599 return sb.toString(); 600 } 601 602 606 private final String useFirstCharactersAsDirectories(String filename) { 607 int n = storageDirDepth; 608 if (n > filename.length()) n = filename.length(); 609 char dir[] = new char[n*2]; 610 for (int i=0; i<n; i++) { 611 dir[i*2] = filename.charAt(i); 612 dir[i*2+1] = File.separatorChar; 613 } 614 return new String (dir); 615 } 616 617 622 private final void checkStoragePathFor(String subdirectory, String filename) { 623 if (!subdirectory.endsWith(File.separator)) { 624 subdirectory += File.separator; 625 } 626 String head = filename.substring(0, storageDirDepth*2); 627 File path = new File (storagedir + subdirectory + head); 628 if (!path.exists()) { 629 path.mkdirs(); 630 } 631 } 632 633 638 protected String generateFilename(String docURI) { 639 if (useMD5) { 640 MD5 md5 = new MD5(docURI); 641 String hex = md5.asHex(); 642 if (storageDirDepth > 0) { 643 return useFirstCharactersAsDirectories(hex) + hex.substring(storageDirDepth); 644 } 645 return hex; 646 } else { 647 StringBuffer buf = new StringBuffer (docURI.length()); 648 649 for (int i = 0; i < docURI.length(); i++) { 650 char c = docURI.charAt(i); 651 switch (c) { 652 case '/' : buf.append("&slash;"); break; 653 case '\\' : buf.append("&backslash"); break; 654 case ':' : buf.append(":"); break; 655 case '*' : buf.append("&asterisk;"); break; 656 case '?' : buf.append("&question;"); break; 657 case '\"' : buf.append("""); break; 658 case '<' : buf.append("<"); break; 659 case '>' : buf.append(">"); break; 660 case '|' : buf.append("∨"); break; 661 default : buf.append(c); break; 662 } 663 } 664 docURI = buf.toString(); 665 666 return docURI; 667 } 668 } 669 670 675 protected File contentFile(String hex, String extension) { 676 return new File (storagedir + CONTENT + useFirstCharactersAsDirectories(hex) + hex.substring(storageDirDepth) + extension); 677 } 678 679 683 public void finish() { 684 if (storageDirectoryStream != null) { 685 try { 686 storageDirectoryStream.close(); 687 storageDirectoryStream = null; 688 } catch (IOException e) { 689 e.printStackTrace(); 690 } 691 } 692 } 693 694 698 protected void finalize() throws Throwable { 699 finish(); 700 super.finalize(); 701 } 702 703 709 protected int storageDirDepth = 0; 710 711 717 public void setStorageDirDepth(int depth) { storageDirDepth = depth; } 718 719 725 public int getStorageDirDepth() { return storageDirDepth; } 726 727 732 private String getExtensionFromContenttype(String contenttype) { 733 String extension = null; 734 if (contenttype != null){ 735 String strContentType = null; 736 int pos = contenttype.indexOf(';'); 737 if (pos > 0) { 738 strContentType = contenttype.substring(0, pos).trim(); 739 } else { 740 strContentType = contenttype.trim(); 741 } 742 extension = getDefaultExtension(strContentType); 743 } 744 745 if (extension == null) { 746 extension = ""; 747 } else { 748 extension = "." + extension; 749 } 750 return extension; 751 } 752 753 758 protected String getDefaultExtension(String contentType) { 759 if (contentType == null) { 760 return null; 761 } else if (contentType.indexOf("text/html") >= 0) { 762 return ".html"; 763 } else if (contentType.indexOf("text/") >= 0) { 764 return ".txt"; 765 } else { 766 return null; 767 } 768 } 769 } | Popular Tags |