HttpDocCache


1   //////////////////////////////////////////////////////////////////////////////
2   // Copyright (c) Insiders Wissensbasierte Systeme GmbH, Germany
3   //////////////////////////////////////////////////////////////////////////////
4   
5   package net.matuschek.http;
6   
7   import java.io.*;
8   import java.net.*;
9   import java.util.*;
10  import java.util.zip.ZipEntry  ;
11  import java.util.zip.ZipFile  ;
12  import java.util.zip.ZipOutputStream  ;
13  
14  import net.matuschek.util.MD5;
15  import org.apache.log4j.Category;
16  
17  /**
18   * Full implementation of HttpDocManager interface.
19   * Caches documents, links and headers in ZIP-files.
20   * Documents with same content will be detected 
21   * and share the same content-storage.
22   *
23   * @author Oliver Schmidt
24   * @version $Revision: 1.2 $
25   */
26  public class HttpDocCache implements HttpDocManager {
27  
28      /** internally used header name to mark duplicates */
29      protected final static String   CONTENT_DUPLICATE = "Content-Duplicate";
30      
31      /** use MD5 encoding for filenames */
32      public boolean useMD5 = true;
33      
34      /** log4j logging instance */
35      protected static Category log =
36          Category.getInstance(HttpDocCache.class.getName());
37  
38      /** collection of visited URLs */
39      private Collection urls = new LinkedList();
40  
41      /** storage main directory */
42      protected String   storagedir;
43      
44      /** file that holds directory information */
45      protected File   storageDirectoryFile = null;
46      
47      /** subdirectory name for links */
48      protected final static String   LINKS = "links" + File.separator;
49      
50      /** subdirectory name for content */
51      protected final static String   CONTENT = "content" + File.separator;
52      
53      /** subdirectory name for document information */
54      protected final static String   DOCUMENTS = "documents" + File.separator;
55      
56      /**
57       * Constructor
58       * @param storageDirectory
59       */
60      public HttpDocCache(String   storageDirectory) {
61          setStorageDir(storageDirectory);
62      }
63      
64      private FileOutputStream storageDirectoryStream = null;
65      
66      /**
67       * Set storage directory and create directories if necessary.
68       * @param newStoragedir
69       */
70      private void setStorageDir(String   newStoragedir) {
71          storagedir = newStoragedir;
72          
73          if (!storagedir.endsWith(File.separator)) {
74              storagedir = storagedir + File.separator;
75          }
76          
77          // create the directories, if they do not exist yet.
78          File   storagedirFile = new File  (storagedir + DOCUMENTS);
79          if (!storagedirFile.exists()) {
80              storagedirFile.mkdirs();
81          }
82          File   contentFile = new File  (storagedir + CONTENT);
83          if (!contentFile.exists()) {
84              contentFile.mkdirs();
85          }
86          
87          if (useMD5) {
88              storageDirectoryFile = new File  (storagedir + "directory.csv");
89              try {
90                  storageDirectoryStream = new FileOutputStream(storageDirectoryFile.getPath(), true);
91                  if (!storageDirectoryFile.exists()) {
92                      storageDirectoryStream.write(("Path,URL" + LF).getBytes());
93                  }
94              } catch (Exception   e) {
95                  log.error(e.getMessage());
96              }
97          }
98      }
99      
100     final static String   QUOTE = "\"";
101     final static String   LF = System.getProperty("line.separator");
102 
103     /**
104      * Method store.
105      * stores the document to the storage directory
106      * @param doc the document to be stored
107      * @param links to be stored (optional)
108      * @return String
109      * @throws DocManagerException if the document cannot be written to the directory
110      */
111     public void storeDocument(HttpDoc doc) throws DocManagerException {
112         List links = doc.getLinks();
113          
114         // don�t store cached documents
115         if (doc.isCached()) {
116             return;
117         }
118         
119         // get the content type
120         String   filename = generateFilename(doc.getURL().toExternalForm());
121         
122         String   filepath = storagedir + DOCUMENTS + filename;
123         checkStoragePathFor(DOCUMENTS, filename);
124                     
125         try {
126             File   f = new File  (filepath + ".zip");
127             if (!f.exists()) {
128                 writeDirectoryInfo(doc, filename);
129             }
130     
131             // write it to the file
132             OutputStream   fs = new BufferedOutputStream(new FileOutputStream(f));
133             ZipOutputStream   zos = new ZipOutputStream  (fs);
134             zos.setLevel(9);
135             
136             try {
137     //          writeContentToZipFile(doc, zos);
138                 storeContent(doc);
139                 writeHeadersToZipFile(doc, zos);
140                 writeUrlToZipFile(doc, zos);
141                 if (links != null) {
142                     writeLinksToZipFile(links, zos);
143                 }
144             } catch (Throwable   e){
145                 System.out.println(e);
146             } finally {
147                 zos.close();
148                 fs.close();
149                 long date = doc.getDateAsMilliSeconds();
150                 f.setLastModified(date > 0 ? date : System.currentTimeMillis());
151             }
152         } catch (IOException ioex) {
153             DocManagerException ex = new DocManagerException(ioex.getMessage());
154             throw ex;
155         }
156     }
157 
158     /**
159      * Write Directory info.
160      * @param doc
161      * @param filename in cache
162      * @throws IOException
163      */
164     protected void writeDirectoryInfo(HttpDoc doc, String   filename)
165         throws IOException {
166         if (storageDirectoryFile != null) {
167             synchronized(storageDirectoryFile) {
168                 try {
169                     String   directoryInfo = QUOTE + filename + QUOTE + "," + QUOTE + doc.getURL() + QUOTE + LF;
170                     storageDirectoryStream.write(directoryInfo.getBytes());
171                 } catch (Exception   e) {
172                     log.warn(e.getMessage());
173                     storageDirectoryStream.close();
174                 }
175             }
176         }
177     }
178 
179     /**
180      * Write content to zipFile
181      * @param doc
182      * @param zos
183      * @throws IOException
184      */
185     protected void writeContentToZipFile(HttpDoc doc, ZipOutputStream   zos)
186         throws IOException {
187         String   contenttype = doc.getHeaderValue(HttpHeader.CONTENT_TYPE);
188         String   extension = getExtensionFromContenttype(contenttype);
189         ZipEntry   zipEntry = new ZipEntry  ("content" + extension);
190         long date = doc.getLastModifiedAsMilliSeconds();
191         if (date < 0) {
192             date = doc.getDateAsMilliSeconds();
193         }
194         zipEntry.setTime(date);
195         zos.putNextEntry(zipEntry);
196         zos.write(doc.getContent());
197         zos.closeEntry();
198     }
199 
200     /**
201      * Write headers to zipFile.
202      * @param doc
203      * @param zos
204      * @return ZipEntry
205      * @throws IOException
206      */
207     protected ZipEntry   writeHeadersToZipFile(HttpDoc doc, ZipOutputStream   zos) throws IOException {
208         StringBuffer   comment = new StringBuffer  ();
209         Vector headers = doc.getHttpHeader();
210         for (Iterator iter = headers.iterator(); iter.hasNext();) {
211             HttpHeader header = (HttpHeader) iter.next();
212             if (!header.getName().equals(CONTENT_DUPLICATE)) {
213                 comment.append(header.toString());
214                 if (iter.hasNext()) {
215                     comment.append(LF);
216                 }
217             }
218         }
219         ZipEntry   ze = new ZipEntry  ("header");
220         zos.putNextEntry(ze);
221         zos.write(comment.toString().getBytes());
222         long date = doc.getDateAsMilliSeconds();
223         ze.setTime(date > 0 ? date : System.currentTimeMillis());
224         zos.closeEntry();
225         return ze;
226     }
227     
228     /**
229      * Read headers from ZipFile
230      * @param doc
231      * @param zf
232      * @return boolean
233      * @throws IOException
234      */
235     protected boolean readHeadersFromZipFile(HttpDoc doc, ZipFile   zf) throws IOException {
236         ZipEntry   ze = zf.getEntry("header");
237         if (ze != null) {
238             InputStream is = zf.getInputStream(ze);
239             BufferedReader reader = new BufferedReader(new InputStreamReader(is));
240             while (reader.ready()) {
241                 String   line = reader.readLine();
242                 int pos = line.indexOf(": ");
243                 if (pos >= 0) {
244                     String   name = line.substring(0, pos);
245                     String   value = line.substring(pos + 2);
246                     HttpHeader header = new HttpHeader(name, value);
247                     doc.addHeader(header);
248                 }
249             }
250             reader.close();
251             return true;
252         }
253         return false;
254     }
255     
256     /**
257      * Read links from ZipFile
258      * @param doc
259      * @param zf
260      * @return boolean
261      * @throws IOException
262      */
263     protected boolean readLinksFromZipFile(HttpDoc doc, ZipFile   zf) throws IOException {
264         ZipEntry   ze = zf.getEntry("links");
265         List links = doc.getLinks();
266         if (links == null) {
267             links = new Vector();
268             doc.setLinks(links);
269         } else {
270             links.clear();
271         }
272         
273         if (ze != null) {
274             InputStream is = zf.getInputStream(ze);
275             BufferedReader reader = new BufferedReader(new InputStreamReader(is));
276             while (reader.ready()) {
277                 String   line = reader.readLine();
278                 if (line != null) {
279                     URL url = new URL(line);
280                     links.add(url);
281                 }
282             }
283             reader.close();
284             return true;
285         }
286         return false;
287     }
288     
289     /**
290      * Write Url to ZipFile.
291      * @param doc
292      * @param zos
293      * @return ZipEntry
294      * @throws IOException
295      */
296     protected ZipEntry   writeUrlToZipFile(HttpDoc doc, ZipOutputStream   zos) throws IOException {
297         String   url = doc.getURL().toString();
298         ZipEntry   ze = new ZipEntry  ("url");
299         zos.putNextEntry(ze);
300         zos.write(url.getBytes());
301         long date = doc.getDateAsMilliSeconds();
302         ze.setTime(date > 0 ? date : System.currentTimeMillis());
303         zos.closeEntry();
304         return ze;
305     }
306     
307     /**
308      * Get File of document content users.
309      * @param doc
310      * @return File
311      */
312     private File   getContentUsersFile(HttpDoc doc) {
313         File   f = null;
314         byte[] content = doc.getContent();
315         if (content.length != 0) {
316             String   md5 = doc.getContentMD5();
317             f = contentFile(md5, ".txt");
318         }
319         return f;
320     }
321     
322     /**
323      * Returns URL-String of duplicate content (if found).
324      * @see net.matuschek.http.HttpDocManager#findDuplicate(HttpDoc)
325      */
326     public String   findDuplicate(HttpDoc doc) throws IOException {
327         String   duplicate = null;
328         File   f = getContentUsersFile(doc);
329         if (f != null) {
330             String   urlString = doc.getURL().toString();
331             if (f.exists()) {
332                 BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f)));
333                 while (reader.ready()) {
334                     String   line = reader.readLine();
335                     if (line.equals(urlString)) {
336                         break;
337                     } else if (duplicate == null) {
338                         duplicate = line; 
339                     }
340                 }
341                 reader.close();
342             } 
343         }
344         return duplicate;
345     }
346     
347     /**
348      * Creates a file with a name created by the content, containing the URL.
349      * @param doc
350      */ 
351     protected void storeContent(HttpDoc doc) throws IOException {
352         if (doc.getContent().length == 0) 
353             return;
354         File   f = getContentUsersFile(doc);
355         String   urlString = doc.getURL().toString();
356         String   md5 = doc.getContentMD5();
357         
358         // is content user?
359         boolean found = false;
360         if (f.exists()) {
361             BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f)));
362             try {
363                 while (reader.ready()) {
364                     String   line = reader.readLine();
365                     if (line.equals(urlString)) {
366                         found = true; break;
367                     }
368                 }
369             } finally {
370                 reader.close();
371             }
372         } 
373         
374         // write content
375         File   fzip = contentFile(md5, ".zip");
376         if (!fzip.exists()) {
377             checkStoragePathFor(CONTENT, useFirstCharactersAsDirectories(md5));
378             OutputStream   fs = new BufferedOutputStream(new FileOutputStream(fzip));
379             ZipOutputStream   zos = null;
380             try {
381                 zos = new ZipOutputStream  (fs);
382                 zos.setLevel(9);
383                 writeContentToZipFile(doc, zos);
384             } finally {
385                 if (zos != null) {
386                     zos.close();
387                 } else {
388                     fs.close();
389                 }
390             }
391         } else {
392             fzip.setLastModified(System.currentTimeMillis());
393         }
394         
395         // append user
396         if (!found) {
397             FileOutputStream os = new FileOutputStream(f.getPath(), true);
398             try {
399                 os.write((urlString + LF).getBytes());
400             } finally {
401                 os.close();
402             }
403         }
404     }
405 
406     /**
407      * Write links to ZipFile.
408      * @param links
409      * @param ZipOutputStream
410      */ 
411     protected void writeLinksToZipFile(List links, ZipOutputStream   zs)
412         throws IOException {
413         HashSet storedLinks = new HashSet();
414         ZipEntry   zipEntry = new ZipEntry  ("links");
415         zs.putNextEntry(zipEntry);
416         for (Iterator iter = links.iterator(); iter.hasNext();) {
417             URL url = (URL) iter.next();
418             if (!storedLinks.contains(url)) {
419                 zs.write((url.toString() + LF).getBytes());
420                 storedLinks.add(url);
421             }
422         }
423         zs.closeEntry();
424     }
425     
426     /**
427      * Collects Urls (duplicates will be skipped).
428      * 
429      * @param doc a HttpDoc object to process. This may also be null
430      * @exception DocManagerException will be thrown if an error occurs
431      * while processing the document.
432      * @see net.matuschek.http.HttpDocManager#processDocument(net.matuschek.http.HttpDoc)
433      */
434     public void processDocument(HttpDoc doc) throws DocManagerException {
435         log.info(
436             "Processing "
437                 + doc.getURL().toExternalForm()
438                 + doc.getHttpHeader());
439                 
440         // collect URL (only if content is no duplicate)
441         HttpHeader duplicate = doc.getHeader(CONTENT_DUPLICATE);
442         if (duplicate == null) {
443             urls.add(doc.getURL());
444         }
445     }
446 
447     /**
448      * retrieves a document from the cache.
449      * @param url
450      * @see net.matuschek.http.HttpDocManager#retrieveFromCache(java.net.URL)
451      */
452     public HttpDoc retrieveFromCache(java.net.URL   url) {
453         HttpDoc doc = null;
454         File   f = null;
455         try {
456             String   filename0 = url.toExternalForm(); 
457             String   filename = generateFilename(filename0) + ".zip";
458             f = new File  (storagedir + DOCUMENTS + filename);
459                     
460             if (f.exists()) {
461                 log.info("retrieve " + f);
462                 
463                 // create document and read it from file
464                 doc = new HttpDoc();
465                 doc.setURL(url);
466                 ZipFile   zf = new ZipFile  (f);
467                 
468                 // read headers
469                 readHeadersFromZipFile(doc, zf);
470                 
471                 // read links
472                 readLinksFromZipFile(doc, zf);
473                 
474                 doc.setCached(true);
475                 
476                 // read content
477                 String   md5 = doc.getContentMD5();
478                 File   contentFile = contentFile(md5, ".zip");
479                 if (contentFile.exists()) {
480                     ZipFile   contentZip = new ZipFile  (contentFile);
481                     readContentFromZipFile(doc, contentZip);
482                     contentZip.close();
483                 } else {
484                     doc.setContent(new byte[0]);
485                 }
486                 zf.close();
487             } 
488         } catch (Exception   e) {
489             log.warn("removing invalid file " + f);
490             f.delete();
491             doc = null;
492         }
493                 
494         return doc;
495     }
496     
497     /**
498      * Read content from ZipFile
499      * @param doc
500      * @param contentZip
501      * @throws IOException
502      */
503     protected void readContentFromZipFile(HttpDoc doc, ZipFile   contentZip)
504         throws IOException {
505         byte[] content = null;
506         for (Enumeration enumeration = contentZip.entries(); enumeration.hasMoreElements();) {
507             ZipEntry   zipEntry = (ZipEntry  ) enumeration.nextElement();
508             if (zipEntry.getName().startsWith("content")) {
509                 InputStream is = contentZip.getInputStream(zipEntry);
510                 int length = (int) zipEntry.getSize();
511                 content = new byte[length]; 
512                 int startPos = 0;
513                 while (startPos < length) {
514                     startPos += is.read(content, startPos, length - startPos);
515                 }
516                 is.close();
517                 break;
518             }
519         }
520         doc.setContent(content);
521     }
522     
523     /**
524      * Remove document from cache.
525      * @param url
526      * @see net.matuschek.http.HttpDocManager#removeDocument(URL)
527      */
528     public void removeDocument(URL url) {
529         HttpDoc doc = retrieveFromCache(url);
530         
531         File   f = null;
532         try {
533             String   filename0 = url.toExternalForm(); 
534             String   filename = generateFilename(filename0) + ".zip";
535             
536             f = new File  (storagedir + LINKS + filename);
537             if (f.exists()) {
538                 f.delete();
539             }
540             
541             deleteContent(doc);
542             f = new File  (storagedir + DOCUMENTS + filename);
543             if (f.exists()) {
544                 f.delete();
545             }
546         } catch (Exception   ex) {
547             log.error(ex);
548         }
549     }
550     
551     /**
552      * Deletes stored content for the given document
553      * @param document
554      */ 
555     private void deleteContent(HttpDoc doc) throws IOException {
556         byte[] content = doc.getContent();
557         if (content.length == 0) {
558             return;
559         }
560         String   urlString = doc.getURL().toString();
561         String   md5 = doc.getContentMD5();
562         File   f = contentFile(md5, ".txt");
563         ArrayList entries = new ArrayList();
564         if (f.exists()) {
565             BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f)));
566             while (reader.ready()) {
567                 String   line = reader.readLine();
568                 if (!line.equals(urlString)) {
569                     entries.add(line);
570                 }
571             }
572             reader.close();
573         }
574         if (entries.size() > 0) {
575             FileOutputStream os = new FileOutputStream(f.getPath(), false);
576             for (Iterator iter = entries.iterator(); iter.hasNext();) {
577                 String   line = (String  ) iter.next();
578                 os.write((line + LF).getBytes());
579             }
580             os.close();
581         } else {
582             f.delete();
583             File   fzip = contentFile(md5, ".zip");
584             if (fzip.exists()) {
585                 fzip.delete();
586             }
587         }
588     }
589     
590     /**
591      * List collected URLs.
592      * @see java.lang.Object#toString()
593      */
594     public String   toString() {
595         StringBuffer   sb = new StringBuffer  (1000);
596         for (Iterator i = urls.iterator(); i.hasNext();) {
597             sb.append(i.next()).append("\n");
598         }
599         return sb.toString();
600     }
601 
602     /**
603      * Uses the first storageDirDepth characters of filename as paths
604      * @param filename
605      */
606     private final String   useFirstCharactersAsDirectories(String   filename) {
607         int n = storageDirDepth;
608         if (n > filename.length()) n = filename.length();
609         char dir[] = new char[n*2];
610         for (int i=0; i<n; i++) {
611             dir[i*2] = filename.charAt(i);
612             dir[i*2+1] = File.separatorChar;
613         }
614         return new String  (dir);
615     }
616     
617     /**
618      * Checks if the storage path for the given file exists and creates it if necessary.
619      * @param subdirectory
620      * @param filename
621      */
622     private final void checkStoragePathFor(String   subdirectory, String   filename) {
623         if (!subdirectory.endsWith(File.separator)) {
624             subdirectory += File.separator;
625         }
626         String   head = filename.substring(0, storageDirDepth*2);
627         File   path = new File  (storagedir + subdirectory + head);
628         if (!path.exists()) {
629             path.mkdirs();
630         }
631     }
632     
633     /**
634      * Generate a valid filename for the given docURI.
635      * @param docURI
636      * @return String
637      */
638     protected String   generateFilename(String   docURI) {
639         if (useMD5) {
640             MD5 md5 = new MD5(docURI);
641             String   hex = md5.asHex();
642             if (storageDirDepth > 0) {
643                 return useFirstCharactersAsDirectories(hex) + hex.substring(storageDirDepth);
644             }
645             return hex;
646         } else {
647             StringBuffer   buf = new StringBuffer  (docURI.length());
648             
649             for (int i = 0; i < docURI.length(); i++) {
650                 char c = docURI.charAt(i);
651                 switch (c) {
652                     case '/' : buf.append("&slash;"); break;
653                     case '\\' : buf.append("&backslash"); break;
654                     case ':' : buf.append("&colon;"); break;
655                     case '*' : buf.append("&asterisk;"); break;
656                     case '?' : buf.append("&question;"); break;
657                     case '\"' : buf.append("&quot;"); break;
658                     case '<' : buf.append("&lt;"); break;
659                     case '>' : buf.append("&gt;"); break;
660                     case '|' : buf.append("&or;"); break;
661                     default : buf.append(c); break;
662                 }
663             }
664             docURI = buf.toString();
665             
666             return docURI;
667         }
668     }
669 
670     /**
671      * Returns a File with the mapping of this content to its URLs.
672      * @param content
673      * @return long
674      */
675     protected File   contentFile(String   hex, String   extension) {
676         return new File  (storagedir + CONTENT + useFirstCharactersAsDirectories(hex) + hex.substring(storageDirDepth) + extension);
677     }
678     
679     /**
680      * Close storageDirectory File.
681      * @see net.matuschek.http.HttpDocManager#finish()
682      */
683     public void finish() {
684         if (storageDirectoryStream != null) {
685             try {
686                 storageDirectoryStream.close();
687                 storageDirectoryStream = null;
688             } catch (IOException e) {
689                 e.printStackTrace();
690             }
691         }
692     }
693     
694     /**
695      * Calls finish and super.finalize().
696      * @see java.lang.Object#finalize()
697      */
698     protected void finalize() throws Throwable   { 
699         finish();
700         super.finalize();
701     }
702     
703     /**
704      * Depth of source set directory.
705      * (depth = number of used subdirectory levels)
706      * The first storageDirDepth characters of file will be used
707      * as directories.
708      */
709     protected int storageDirDepth = 0;
710     
711     /**
712      * Sets the desired directory depth of the source set directory
713      * (depth = number of used subdirectory levels)
714      * 
715      * @param desired depth of source set directory.
716      */
717     public void setStorageDirDepth(int depth) { storageDirDepth = depth; }
718     
719     /**
720      * Method getstorageDirDepth.
721      * returns the directory depth of the source set directory
722      * @param desired depth of source set directory.
723      * @return the directory depth of the source set directory
724      */
725     public int getStorageDirDepth() { return storageDirDepth; }
726     
727     /**
728      * Get relevant part of contenttype and get default extension for it.
729      * @param contenttype
730      * @return extension
731      */
732     private String   getExtensionFromContenttype(String   contenttype) {
733         String   extension = null;
734         if (contenttype != null){
735             String   strContentType = null;
736             int pos = contenttype.indexOf(';');
737             if (pos > 0) {
738                 strContentType = contenttype.substring(0, pos).trim();
739             } else {
740                 strContentType = contenttype.trim();
741             }
742             extension = getDefaultExtension(strContentType);
743         }
744         
745         if (extension == null) {
746             extension = "";
747         } else {
748             extension = "." + extension;
749         }
750         return extension;
751     }
752 
753     /**
754      * Get default extension for given contentType.
755      * @param contentType
756      * @return default extension or null
757      */
758     protected String   getDefaultExtension(String   contentType) {
759         if (contentType == null) {
760             return null;
761         } else if (contentType.indexOf("text/html") >= 0) {
762             return ".html";
763         } else if (contentType.indexOf("text/") >= 0) {
764             return ".txt";
765         } else {
766             return null;
767         }
768     }
769 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags