HttpDocToFile


1   package net.matuschek.http;
2   
3   /************************************************
4       Copyright (c) 2001/2002 by Daniel Matuschek
5   *************************************************/
6   
7   
8   import java.io.BufferedOutputStream  ;
9   import java.io.File  ;
10  import java.io.FileInputStream  ;
11  import java.io.FileOutputStream  ;
12  import java.io.IOException  ;
13  import java.net.URL  ;
14  import java.util.StringTokenizer  ;
15  
16  import org.apache.log4j.Category;
17  
18  /**
19   * DocumentManager that will store document contents in a file.
20   *
21   * @author Daniel Matuschek 
22   * @version $Revision: 1.11 $
23   */
24  public class HttpDocToFile extends AbstractHttpDocManager
25  {
26    /**
27     * directory where the files will be created
28     */
29    private String   baseDir;
30  
31  
32    /**
33     * the object will not store files smaller then this size !
34     */
35    private int minFileSize;
36    
37  
38    /**
39     * defines if special characters in the URL should be replaced
40     * by "normal" characters
41     * @see #setReplaceAllSpecials(boolean)
42     */
43    private boolean replaceAllSpecials = false;
44  
45  
46    /**
47     * defines, if CGIs should be stored on disc. 
48     *
49     * @see #setStoreCGI
50     */
51    private boolean storeCGI = true;
52  
53    /** Log4J logging */
54    private Category log;
55  
56  
57    
58    /**
59     * creates a new HttpDocToFile object that will store the
60     * documents in the given directory
61     */
62    public HttpDocToFile(String   baseDir) {
63      this.baseDir = baseDir;
64      log = Category.getInstance(getClass().getName());
65    }
66    
67  
68    /**
69     * store document (that means write it to disk)
70     * @param doc the document to store
71     * @exception DocManagerException if the document can't be stored
72     * (some IO error occured)
73     */
74    public void storeDocument(HttpDoc doc) 
75      throws DocManagerException
76    {
77      if ((doc == null) || (doc.getContent() == null)) {
78        return;
79      }
80      
81      /* 
82       * write file only, if this was NOT a cached document
83       * (in this case we have it already on harddisk)
84       */
85      if (doc.isCached()) {
86          return;
87      }
88  
89  
90      if ((! storeCGI)
91      && (doc.getURL().toString().indexOf('?') >= 0)) {
92        // do not store dynamic pages, because storeCGI is false
93        // and the URL contains a "?"
94        return;
95      }
96  
97  
98      String   filename = url2Filename(doc.getURL());
99      if (doc.getContent().length >= minFileSize) {
100       try {
101         createDirs(filename);
102         BufferedOutputStream   os = 
103           new BufferedOutputStream  (new FileOutputStream  (filename));
104         os.write(doc.getContent());
105         os.flush();
106         os.close();
107       } catch (IOException   e) {
108         throw new DocManagerException(e.getMessage());
109       }
110     }
111   }
112 
113 
114   /**
115    * Gets the cacheFile of the given URL if its document was stored.
116    * @param url
117    * @return cacheFile
118    */
119   protected File   getCacheFile(URL   url) {
120       // does the file exists on the filesystem ?
121       File   cacheFile = new File  (url2Filename(url));
122       if (! (cacheFile.exists() && (cacheFile.isFile()))) {
123         return null;
124       }
125       return cacheFile;
126   }
127 
128     /**
129      * Gets the extension of the given URL if its document was stored.
130      * @param url
131      * @return String
132      */
133     protected String   getExtension(URL   url) {
134         // is it dynamic ?
135         if ((url.toString().indexOf('?') >= 0) 
136         || (url.toString().indexOf("cgi") >= 0)) {
137           return null;
138         }
139         
140         // do we have an filename extension ?
141         // without it is not possible to guess the MIME type.
142         String   path = url.getPath();
143         String   ext = null;
144     
145         if (path.indexOf(".") < 0) {
146             return null;
147         }
148     
149         StringTokenizer   st = new StringTokenizer  (path,".");
150         while (st.hasMoreTokens()) {
151             ext = st.nextToken();
152         }
153         // no extension if ext contains a "/"
154         if (ext.indexOf("/") >= 0) {
155             return null;
156         }
157         
158         return ext;
159     }
160     
161   /**
162    * Removes a document that was stored previous from the file system. Because
163    * the HttpDocToFile does not store the HTTP headers, only the Content-Type
164    * header will exists. Even this header may not be correct. It will only use a
165    * simple heuristic to determine the possible MIME type.
166    */
167   public void removeDocument(URL   u) {
168     String   ext = getExtension(u);
169     if (ext == null) return;
170     File   cacheFile = getCacheFile(u);
171     if (cacheFile == null) return ;
172     
173     cacheFile.delete();
174   }
175 
176   /**
177    * Gets a document that was stored previous from the file system.
178    * Because the HttpDocToFile does not store the HTTP headers, only
179    * the Content-Type header will exists. Even this header may not 
180    * be correct. It will only use a simple heuristic to determine the
181    * possible MIME type.
182    *
183    * @return null, if this document was not stored before or it seems
184    * to be a dynamic document.
185    */
186   public HttpDoc retrieveFromCache(URL   u) {
187     String   ext = getExtension(u);
188     if (ext == null) return null;
189     File   cacheFile = getCacheFile(u);
190     if (cacheFile == null) return null;
191     
192     // create a buffer;
193     long size = cacheFile.length();
194     if (size > Integer.MAX_VALUE) {
195       log.info("File too large");
196       return null;
197     }
198 
199     byte[] buff = new byte[(int) size];
200 
201     // read the file
202     try {
203       FileInputStream   fi = new FileInputStream  (cacheFile);
204       fi.read(buff);
205     } catch (IOException   e) {
206       log.info("Could not read cached document "+e.getMessage());
207       return null;
208     }
209     
210     // create a new HttpDoc object
211     HttpDoc doc = new HttpDoc();
212 
213     // and set the content and the header
214     doc.setHttpCode("HTTP/1.0 200 OK");
215     doc.setContent(buff);
216     
217    
218     // now guess the MIME type
219     String   mimetype = null;
220 
221     if (ext.equals("html") 
222     || ext.equals("htm")
223     || ext.equals("shtml")
224     || ext.equals("asp")
225     || ext.equals("php")
226     || ext.equals("jsp")) {
227       mimetype="text/html";
228     } else {
229       mimetype="application/unknown";
230     }
231 
232     doc.addHeader(new HttpHeader("Content-Type",mimetype));    
233     doc.setURL(u);
234     doc.setCached(true);
235     
236     return doc;
237   }
238   
239 
240   /**
241    * gets the value of baseDir
242    * @return the value of baseDir
243    */
244   public String   getBaseDir() {
245     return baseDir;
246   }
247   
248 
249   /**
250    * sets the value of basedir
251    * @param baseDir the new value of baseDir
252    */
253   public void setBaseDir(String   baseDir) {
254     this.baseDir = baseDir;
255   }
256   
257 
258   /**
259    * converts an URL to a filename http://host/path will 
260    * be converted to basedir/host/path
261    * @param URL a URL to convert, must not be null
262    * @return a pathname
263    */
264   protected String   url2Filename(URL   u) {
265     StringBuffer   sb = new StringBuffer  ();
266 
267     sb.append(baseDir);
268     sb.append(File.separatorChar);
269     sb.append(u.getHost());
270     sb.append(u.getFile());
271 
272     // is there a query part ?
273     // that is something after the file name seperated by ?
274     String   query = u.getQuery();
275     if ((query != null) &&
276     (!query.equals(""))) {
277       sb.append(File.separatorChar);
278       sb.append(query);
279     }
280 
281     // filename that ends with /
282     // are directories, we will name the file "index.html"
283     if (sb.charAt(sb.length()-1) == '/') {
284       sb.append("index.html");
285     } 
286 
287     // postprocess filename (replace special characters)
288     for (int i=0; i<sb.length(); i++) {
289       char c=sb.charAt(i);
290       char newc=(char)0;
291 
292       // replace / by operating system file name separator
293       if (c == '/') {
294     newc = File.separatorChar;
295       }
296       
297       // replace special characters from CGIs
298       if (replaceAllSpecials) {
299     if ((c == '?')
300         || (c == '=')
301         || (c == '&')) {
302       newc = '-';
303     }
304       }
305 
306       if ((newc != (char)0) 
307       && (newc != c)) {
308     sb.setCharAt(i,newc);
309       }
310     }
311 
312     return sb.toString();
313   }
314   
315 
316   /** 
317    * creates all directories that are needed to place the 
318    * file filename if they don't exists 
319    * @param filename the full path name of a file
320    */
321   protected void createDirs(String   filename) throws IOException   {
322     int pos = -1;
323     // look for the last directory separator in the filename
324     for (int i = filename.length() - 1; i >= 0; i--) {
325       if (filename.charAt(i) == File.separatorChar) {
326     pos = i;
327     i = -1;
328       }
329     }
330     File   dir = new File  (filename.substring(0, pos));
331     dir.mkdirs();
332   }
333   
334 
335   /**
336    * gets the value of minFileSize. Files smaller then this size
337    * (in Bytes) will not be saved to disk !
338    * @return the value of minFileSize 
339    */
340   public int getMinFileSize() {
341     return minFileSize;
342   }
343 
344   
345   /**
346    * sets the value of minFileSize
347    * @param minFileSize the new value of minFileSize
348    * @see #getMinFileSize()
349    */
350   public void setMinFileSize(int minFileSize) {
351     this.minFileSize = minFileSize;
352   }
353 
354 
355   /**
356    * Get the value of replaceAllSpecials.
357    *
358    * if replaceAllSpecials is true, all sepcial characters in the URL
359    * will be replaced by "-". This is useful for operating system that
360    * can't handle files with special characters in the filename (e.g.
361    * Windows)
362    *
363    * @return value of replaceAllSpecials.
364    */
365   public boolean isReplaceAllSpecials() {
366     return replaceAllSpecials;
367   }
368   
369 
370   /**
371    * Set the value of replaceAllSpecials.
372    *
373    * if replaceAllSpecials is true, all sepcial characters in the URL
374    * will be replaced by "-". This is useful for operating system that
375    * can't handle files with special characters in the filename (e.g.
376    * Windows)
377    *
378    * @param v  Value to assign to replaceAllSpecials.
379    */
380   public void setReplaceAllSpecials(boolean  v) {
381     this.replaceAllSpecials = v;
382   } 
383 
384 
385   /**
386    * Get the value of storeCGI
387    *
388    * If this is true, the object will store ALL retrieved documents,
389    * otherwise it will store only documents from URLs that do not
390    * have a "?" in the URL
391    */
392   public boolean getStoreCGI() {
393     return storeCGI;
394   }
395   
396 
397   /**
398    * Set the value of storeCGI.
399    *
400    * If this is true, the object will store ALL retrieved documents,
401    * otherwise it will store only documents from URLs that do not
402    * have a "?" in the URL
403    *
404    * @param v  Value to assign to storeCGI.
405    */
406   public void setStoreCGI(boolean v) {
407     this.storeCGI = v;
408   } 
409 
410 }
411 
412 
413
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags