KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > writer > MirrorWriterProcessor


1 /* MirrorWriter
2  *
3  * $Id: MirrorWriterProcessor.java,v 1.5.16.1 2007/01/13 01:31:30 stack-sf Exp $
4  *
5  * Created on 2004 October 26
6  *
7  * Copyright (C) 2004 Internet Archive.
8  *
9  * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24  */

25 package org.archive.crawler.writer;
26
27 import java.io.File JavaDoc;
28 import java.io.FileOutputStream JavaDoc;
29 import java.io.FilenameFilter JavaDoc;
30 import java.io.IOException JavaDoc;
31 import java.text.NumberFormat JavaDoc;
32 import java.util.Collections JavaDoc;
33 import java.util.HashMap JavaDoc;
34 import java.util.HashSet JavaDoc;
35 import java.util.Iterator JavaDoc;
36 import java.util.Map JavaDoc;
37 import java.util.Set JavaDoc;
38 import java.util.TreeMap JavaDoc;
39 import java.util.logging.Level JavaDoc;
40 import java.util.logging.Logger JavaDoc;
41
42 import javax.management.AttributeNotFoundException JavaDoc;
43
44 import org.archive.crawler.datamodel.CoreAttributeConstants;
45 import org.archive.crawler.datamodel.CrawlURI;
46 import org.archive.crawler.framework.Processor;
47 import org.archive.crawler.settings.ListType;
48 import org.archive.crawler.settings.RegularExpressionConstraint;
49 import org.archive.crawler.settings.SimpleType;
50 import org.archive.crawler.settings.StringList;
51 import org.archive.crawler.settings.Type;
52 import org.archive.io.RecordingInputStream;
53 import org.archive.io.ReplayInputStream;
54 import org.archive.net.UURI;
55 import org.archive.util.IoUtils;
56
57 /**
58    Processor module that writes the results of successful fetches to
59    files on disk.
60    
61    Writes contents of one URI to one file on disk. The files are
62    arranged in a directory hierarchy based on the URI paths. In that sense
63    they mirror the file hierarchy that might exist on the servers.
64    <p>
65    There are a number of issues involved:
66    <ul>
67    <li>
68    URIs can have arbitrary length, but file systems have length constraints.
69    </li>
70    <li>
71    URIs can contain characters that file systems prohibit.
72    </li>
73    <li>
74    URI paths are case-sensitive, but some file systems are case-insensitive.
75    </li>
76    </ul>
77    This class tries very hard to map each URI into a file system path that
78    obeys all file system constraints and yet reasonably represents
79    the original URI.
80    <p>
81    There would normally be a single instance of this class per Heritrix
82    instance. This class is thread-safe; any number of threads can be in its
83    innerProcess method at once. However, conflicts can still arise in the file
84    system. For example, if several threads try to create the same directory at
85    the same time, only one can win. Therefore, there should be at most one
86    access to a server at a given time.
87    
88    @author Howard Lee Gayle
89 */

90 public class MirrorWriterProcessor
91 extends Processor implements CoreAttributeConstants {
92
93     private static final long serialVersionUID = 301407556928389168L;
94
95     /**
96      * Key to use asking settings for case sensitive option.
97      */

98     public static final String JavaDoc ATTR_CASE_SENSITIVE = "case-sensitive";
99
100     /**
101      * Key to use asking settings for character map.
102      */

103     public static final String JavaDoc ATTR_CHAR_MAP = "character-map";
104
105     /**
106      * Key to use asking settings for content type map.
107      */

108     public static final String JavaDoc ATTR_CONTENT_TYPE_MAP = "content-type-map";
109
110     /**
111      * Key to use asking settings for dot begin replacement.
112      */

113     public static final String JavaDoc ATTR_DOT_BEGIN = "dot-begin";
114
115     /**
116      * Key to use asking settings for dot end replacement.
117      */

118     public static final String JavaDoc ATTR_DOT_END = "dot-end";
119
120     /**
121      * Key to use asking settings for directory file.
122      */

123     public static final String JavaDoc ATTR_DIRECTORY_FILE = "directory-file";
124
125     /**
126      * Key to use asking settings for host directory option.
127      */

128     public static final String JavaDoc ATTR_HOST_DIRECTORY = "host-directory";
129
130     /**
131      * Key to use asking settings for host map.
132      */

133     public static final String JavaDoc ATTR_HOST_MAP = "host-map";
134
135     /**
136      * Key to use asking settings for maximum file system path length.
137      */

138     public static final String JavaDoc ATTR_MAX_PATH_LEN = "max-path-length";
139
140     /**
141      * Key to use asking settings for maximum file system path segment length.
142      */

143     public static final String JavaDoc ATTR_MAX_SEG_LEN = "max-segment-length";
144
145     /**
146      * Key to use asking settings for base directory path value.
147      */

148     public static final String JavaDoc ATTR_PATH = "path";
149
150     /**
151      * Key to use asking settings for port directory option.
152      */

153     public static final String JavaDoc ATTR_PORT_DIRECTORY = "port-directory";
154
155     /**
156      * Key to use asking settings for suffix at end option.
157      */

158     public static final String JavaDoc ATTR_SUFFIX_AT_END = "suffix-at-end";
159
160     /**
161      * Key to use asking settings for too-long directory.
162      */

163     public static final String JavaDoc ATTR_TOO_LONG_DIRECTORY = "too-long-directory";
164
165     /**
166      * Key to use asking settings for underscore set.
167      */

168     public static final String JavaDoc ATTR_UNDERSCORE_SET = "underscore-set";
169
170     /** Default value for ATTR_DOT_BEGIN.*/
171     private static final String JavaDoc DEFAULT_DOT_BEGIN = "%2E";
172
173     /** Default maximum file system path length.*/
174     private static final int DEFAULT_MAX_PATH_LEN = 1023;
175
176     /** Default maximum file system path segment length.*/
177     private static final int DEFAULT_MAX_SEG_LEN = 255;
178
179     /** Default value for ATTR_TOO_LONG_DIRECTORY.*/
180     private static final String JavaDoc DEFAULT_TOO_LONG_DIRECTORY = "LONG";
181
182     /** An empty Map.*/
183     private static final Map JavaDoc<String JavaDoc,String JavaDoc> EMPTY_MAP
184      = Collections.unmodifiableMap(new TreeMap JavaDoc<String JavaDoc,String JavaDoc>());
185
186     /**
187        Regular expression matching a file system path segment.
188        The intent is one or more non-file-separator characters.
189        The backslash is to quote File.separator if it's also backslash.
190     */

191     private static final String JavaDoc PATH_SEGMENT_RE =
192         "[^\\" + File.separator + "]+";
193
194     /**
195        Regular expression constraint on ATTR_TOO_LONG_DIRECTORY.
196        The intent is one non-file-separator character,
197        followed by zero or more characters.
198        The backslash is to quote File.separator if it's also backslash.
199     */

200     private static final String JavaDoc TOO_LONG_DIRECTORY_RE =
201         "[^\\" + File.separator + "].*";
202
203     /**
204      * Logger.
205      */

206     private static final Logger JavaDoc logger =
207         Logger.getLogger(MirrorWriterProcessor.class.getName());
208
209     /**
210      * @param name Name of this processor.
211      */

212     public MirrorWriterProcessor(String JavaDoc name) {
213         super(name, "MirrorWriter processor. " +
214             "A writer that writes each URL to a file on disk named for " +
215             "a derivative of the URL.");
216         Type e; // Current element.
217
addElementToDefinition(new SimpleType(ATTR_CASE_SENSITIVE,
218             "True if the file system is case-sensitive, like UNIX. "
219             + "False if the file system is case-insensitive, "
220             + "like Macintosh HFS+ and Windows.",
221             Boolean.TRUE));
222         addElementToDefinition(new StringList(ATTR_CHAR_MAP,
223             "This list is grouped in pairs. "
224             + "The first string in each pair must have a length of one. "
225             + "If it occurs in a URI path, "
226             + "it is replaced by the second string in the pair. "
227             + "For UNIX, no character mapping is normally needed. "
228             + "For Macintosh, the recommended value is [: %%3A]. "
229             + "For Windows, the recommended value is "
230             + "[' ' %%20 &quot; %%22 * %%2A : %%3A < %%3C "
231             + "\\> %%3E ? %%3F \\\\ %%5C ^ %%5E | %%7C]."));
232         addElementToDefinition(new StringList(ATTR_CONTENT_TYPE_MAP,
233             "This list is grouped in pairs. "
234             + "If the content type of a resource begins (case-insensitive) "
235             + "with the first string in a pair, the suffix is set to "
236             + "the second string in the pair, replacing any suffix that may "
237             + "have been in the URI. For example, to force all HTML files "
238             + "to have the same suffix, use [text/html html]."));
239         e = addElementToDefinition(new SimpleType(ATTR_DIRECTORY_FILE,
240             "Implicitly append this to a URI ending with '/'.",
241             "index.html"));
242         e.addConstraint(new RegularExpressionConstraint(PATH_SEGMENT_RE,
243             Level.SEVERE, "This must be a simple file name."));
244         e = addElementToDefinition(new SimpleType(ATTR_DOT_BEGIN,
245             "If a segment starts with '.', the '.' is replaced by this.",
246             DEFAULT_DOT_BEGIN));
247         e.addConstraint(new RegularExpressionConstraint(PATH_SEGMENT_RE,
248             Level.SEVERE,
249             "This must not be empty, and must not contain " + File.separator));
250         addElementToDefinition(new SimpleType(ATTR_DOT_END,
251             "If a directory name ends with '.' it is replaced by this. "
252             + "For all file systems except Windows, '.' is recommended. "
253             + "For Windows, %%2E is recommended.",
254             "."));
255         addElementToDefinition(new StringList(ATTR_HOST_MAP,
256             "This list is grouped in pairs. "
257             + "If a host name matches (case-insensitive) the first string "
258             + "in a pair, it is replaced by the second string in the pair. "
259             + "This can be used for consistency when several names are used "
260             + "for one host, for example "
261             + "[12.34.56.78 www42.foo.com]."));
262         addElementToDefinition(new SimpleType(ATTR_HOST_DIRECTORY,
263             "Create a subdirectory named for the host in the URI.",
264             Boolean.TRUE));
265         addElementToDefinition(new SimpleType(ATTR_PATH,
266             "Top-level directory for mirror files.", "mirror"));
267
268         // TODO: Add a new Constraint subclass so ATTR_MAX_PATH_LEN and
269
// ATTR_MAX_SEG_LEN can be constained to reasonable values.
270
addElementToDefinition(new SimpleType(ATTR_MAX_PATH_LEN,
271             "Maximum file system path length.",
272             new Integer JavaDoc(DEFAULT_MAX_PATH_LEN)));
273         addElementToDefinition(new SimpleType(ATTR_MAX_SEG_LEN,
274             "Maximum file system path segment length.",
275             new Integer JavaDoc(DEFAULT_MAX_SEG_LEN)));
276         addElementToDefinition(new SimpleType(ATTR_PORT_DIRECTORY,
277             "Create a subdirectory named for the port in the URI.",
278             Boolean.FALSE));
279         addElementToDefinition(new SimpleType(ATTR_SUFFIX_AT_END,
280             "If true, the suffix is placed at the end of the path, "
281             + "after the query (if any). If false, the suffix is placed "
282             + "before the query.",
283             Boolean.TRUE));
284         e = addElementToDefinition(new SimpleType(ATTR_TOO_LONG_DIRECTORY,
285             "If all the directories in the URI would exceed, "
286             + "or come close to exceeding, the file system maximum "
287             + "path length, then they are all replaced by this.",
288             DEFAULT_TOO_LONG_DIRECTORY));
289         e.addConstraint(new RegularExpressionConstraint(TOO_LONG_DIRECTORY_RE,
290             Level.SEVERE, "This must be relative and not empty."));
291         addElementToDefinition(new StringList(ATTR_UNDERSCORE_SET,
292             "If a directory name appears (case-insensitive) in this list "
293             + "then an underscore is placed before it. "
294             + "For all file systems except Windows, this is not needed. "
295             + "For Windows, the following is recommended: "
296             + "[com1 com2 com3 com4 com5 com6 com7 com8 com9 "
297             + "lpt1 lpt2 lpt3 lpt4 lpt5 lpt6 lpt7 lpt8 lpt9 "
298             + "con nul prn]."));
299     }
300
301     protected void innerProcess(CrawlURI curi) {
302         if (!curi.isSuccess()) {
303             return;
304         }
305         UURI uuri = curi.getUURI(); // Current URI.
306

307         // Only http and https schemes are supported.
308
String JavaDoc scheme = uuri.getScheme();
309         if (!"http".equalsIgnoreCase(scheme)
310                 && !"https".equalsIgnoreCase(scheme)) {
311             return;
312         }
313         RecordingInputStream recis = curi.getHttpRecorder().getRecordedInput();
314         if (0L == recis.getResponseContentLength()) {
315             return;
316         }
317
318         String JavaDoc baseDir = null; // Base directory.
319
String JavaDoc baseSeg = null; // ATTR_PATH value.
320
try {
321             baseSeg = (String JavaDoc) getAttribute(ATTR_PATH, curi);
322         } catch (AttributeNotFoundException JavaDoc e) {
323             logger.warning(e.getLocalizedMessage());
324             return;
325         }
326
327         // Trim any trailing File.separatorChar characters from baseSeg.
328
while ((baseSeg.length() > 1) && baseSeg.endsWith(File.separator)) {
329             baseSeg = baseSeg.substring(0, baseSeg.length() - 1);
330         }
331         if (0 == baseSeg.length()) {
332             baseDir = getController().getDisk().getPath();
333         } else if ((new File JavaDoc(baseSeg)).isAbsolute()) {
334             baseDir = baseSeg;
335         } else {
336             baseDir = getController().getDisk().getPath() + File.separator
337                 + baseSeg;
338         }
339
340         // Already have a path for this URI.
341
boolean reCrawl = curi.containsKey(A_MIRROR_PATH);
342
343         /*
344           The file system path, relative to the value of ATTR_PATH, where
345           this resource should be written. The intent is to
346           add later a persistent mapping from URI to path.
347           This will allow a URI to be re-crawled and updated
348           if it has changed. If the resource has already been fetched
349           and written to a file before, the path to that file
350           has already been obtained from the persistent mapping
351           and placed on the AList by some other module,
352           such as the frontier.
353         */

354         String JavaDoc mps = null;
355         File JavaDoc destFile = null; // Write resource contents to this file.
356
try {
357             if (reCrawl) {
358                 mps = curi.getString(A_MIRROR_PATH);
359                 destFile = new File JavaDoc(baseDir + File.separator + mps);
360                 File JavaDoc parent = destFile.getParentFile();
361                 if (null != parent) {
362                     IoUtils.ensureWriteableDirectory(parent);
363                 }
364             } else {
365                 URIToFileReturn r = null; // Return from uriToFile().
366
try {
367                      r = uriToFile(baseDir, curi);
368                 } catch (AttributeNotFoundException JavaDoc e) {
369                     logger.warning(e.getLocalizedMessage());
370                     return;
371                 }
372                 destFile = r.getFile();
373                 mps = r.getRelativePath();
374             }
375             logger.info(uuri.toString() + " -> " + destFile.getPath());
376             writeToPath(recis, destFile);
377             if (!reCrawl) {
378                 curi.putString(A_MIRROR_PATH, mps);
379             }
380         } catch (IOException JavaDoc e) {
381             curi.addLocalizedError(this.getName(), e, "Mirror");
382         }
383     }
384
385     /**
386        Gets the directory in which the file will reside.
387        Any directories needed are created.
388        @param baseDir the path to the starting directory
389        @param host the host part of the URI, or null if the host name
390        should not be part of the returned path
391        @param port the port part of the URI, or -1 if the port
392        should not be part of the returned path
393        @param segs all the segments in the URI
394        @param maxLen the maximum path length allowed to the directory;
395        this must leave some room for the file itself
396        @return the directory, or null if maxLen would be exceeded
397        @throws IOException
398        if a needed directory could not be created
399        @throws IOException
400        if a needed directory is not writeable
401        @throws IOException
402        if a non-directory file exists with the same path as a needed directory
403     */

404     private URIToFileReturn dirPath(String JavaDoc baseDir, String JavaDoc host, int port,
405                                     PathSegment[] segs, int maxLen)
406         throws IOException JavaDoc {
407
408         // Return value.
409
URIToFileReturn r = new URIToFileReturn(baseDir, host, port);
410         r.mkdirs();
411         for (int i = 0; (segs.length - 1) != i; ++i) {
412             segs[i].addToPath(r);
413             if (r.longerThan(maxLen)) {
414                 return null;
415             }
416         }
417         return r;
418     }
419
420     /**
421        Ensures that a list contains an even number of elements.
422        If not, the last element is removed.
423        @param list the list
424     */

425     private void ensurePairs(ListType list) {
426         if (1 == (list.size() % 2)) {
427             list.remove(list.size() - 1);
428         }
429     }
430
431     /**
432        Makes a path in which a resource can be stored.
433        @param baseDir the path to the starting directory
434        @param curi the URI
435        @return a path to the file in which to store the resource
436        @throws AttributeNotFoundException
437        if a needed setting is missing
438        @throws IOException
439        if a needed directory could not be created
440        @throws IOException
441        if a needed directory is not writeable
442        @throws IOException
443        if a non-directory file exists with the same path as a needed directory
444     */

445     private URIToFileReturn uriToFile(String JavaDoc baseDir, CrawlURI curi)
446         throws AttributeNotFoundException JavaDoc, IOException JavaDoc {
447         UURI uuri = curi.getUURI(); // Current URI.
448
String JavaDoc host = null;
449         Boolean JavaDoc hd = (Boolean JavaDoc) getAttribute(ATTR_HOST_DIRECTORY, curi);
450         if (hd.booleanValue()) {
451             host = uuri.getHost();
452             StringList hostMap = (StringList) getAttribute(ATTR_HOST_MAP, curi);
453             if ((null != hostMap) && (hostMap.size() > 1)) {
454                 ensurePairs(hostMap);
455                 Iterator JavaDoc<String JavaDoc> i = hostMap.typesafe().iterator();
456                 for (boolean more = true; more && i.hasNext();) {
457                     String JavaDoc h1 = i.next();
458                     String JavaDoc h2 = i.next();
459                     if (host.equalsIgnoreCase(h1)) {
460                         more = false;
461                         if ((null != h2) && (0 != h2.length())) {
462                             host = h2;
463                         }
464                     }
465                 }
466             }
467         }
468
469         int port =
470             ((Boolean JavaDoc) getAttribute(ATTR_PORT_DIRECTORY, curi)).booleanValue()
471             ? uuri.getPort()
472             : -1;
473
474         String JavaDoc suffix = null; // Replacement suffix.
475
StringList ctm = (StringList) getAttribute(ATTR_CONTENT_TYPE_MAP, curi);
476         if ((null != ctm) && (ctm.size() > 1)) {
477             ensurePairs(ctm);
478             String JavaDoc contentType = curi.getContentType().toLowerCase();
479             Iterator JavaDoc i = ctm.iterator();
480             for (boolean more = true; more && i.hasNext();) {
481                 String JavaDoc ct = (String JavaDoc) i.next();
482                 String JavaDoc suf = (String JavaDoc) i.next();
483                 if ((null != ct) && contentType.startsWith(ct.toLowerCase())) {
484                     more = false;
485                     if ((null != suf) && (0 != suf.length())) {
486                         suffix = suf;
487                     }
488                 }
489             }
490         }
491
492         int maxSegLen =
493             ((Integer JavaDoc) getAttribute(ATTR_MAX_SEG_LEN, curi)).intValue();
494         if (maxSegLen < 2) {
495             maxSegLen = DEFAULT_MAX_SEG_LEN;
496         }
497
498         int maxPathLen =
499             ((Integer JavaDoc) getAttribute(ATTR_MAX_PATH_LEN, curi)).intValue();
500         if (maxPathLen < 2) {
501             maxPathLen = DEFAULT_MAX_PATH_LEN;
502         }
503
504         Map JavaDoc<String JavaDoc,String JavaDoc> characterMap = EMPTY_MAP;
505         StringList cm = (StringList) getAttribute(ATTR_CHAR_MAP, curi);
506         if ((null != cm) && (cm.size() > 1)) {
507             ensurePairs(cm);
508             characterMap = new HashMap JavaDoc<String JavaDoc,String JavaDoc>(cm.size());
509             // Above will be half full.
510
for (Iterator JavaDoc i = cm.iterator(); i.hasNext();) {
511                 String JavaDoc s1 = (String JavaDoc) i.next();
512                 String JavaDoc s2 = (String JavaDoc) i.next();
513                 if ((null != s1) && (1 == s1.length()) && (null != s2)
514                         && (0 != s2.length())) {
515                     characterMap.put(s1, s2);
516                 }
517             }
518         }
519
520         String JavaDoc dotBegin = (String JavaDoc) getAttribute(ATTR_DOT_BEGIN, curi);
521         if (".".equals(dotBegin)) {
522             dotBegin = null;
523         }
524
525         String JavaDoc dotEnd = (String JavaDoc) getAttribute(ATTR_DOT_END, curi);
526         if (".".equals(dotEnd)) {
527             dotEnd = null;
528         }
529
530         String JavaDoc tld = (String JavaDoc) getAttribute(ATTR_TOO_LONG_DIRECTORY, curi);
531         if ((null == tld) || (0 == tld.length())
532                 || (-1 != tld.indexOf(File.separatorChar))) {
533             tld = DEFAULT_TOO_LONG_DIRECTORY;
534         }
535
536         Set JavaDoc<String JavaDoc> underscoreSet = null;
537         StringList us = (StringList) getAttribute(ATTR_UNDERSCORE_SET, curi);
538         if ((null != us) && (0 != us.size())) {
539             underscoreSet = new HashSet JavaDoc<String JavaDoc>(us.size(), 0.5F);
540             for (String JavaDoc s: us.typesafe()) {
541                 if ((null != s) && (0 != s.length())) {
542                     underscoreSet.add(s.toLowerCase());
543                 }
544             }
545         }
546
547         return uriToFile(curi, host, port, uuri.getPath(), uuri.getQuery(),
548             suffix, baseDir, maxSegLen, maxPathLen,
549             ((Boolean JavaDoc) getAttribute(ATTR_CASE_SENSITIVE, curi)).booleanValue(),
550             (String JavaDoc) getAttribute(ATTR_DIRECTORY_FILE, curi),
551             characterMap, dotBegin, dotEnd, tld,
552             ((Boolean JavaDoc) getAttribute(ATTR_SUFFIX_AT_END, curi)).booleanValue(),
553             underscoreSet);
554     }
555
556     /**
557        Makes a path in which a resource can be stored.
558        @param curi the URI
559        @param host the host part of the URI, or null if the host name
560        should not be part of the returned path
561        @param port the port part of the URI, or -1 if the port
562        should not be part of the returned path
563        @param uriPath the path part of the URI (must be absolute)
564        @param query the query part of the URI, or null if none
565        @param suffix if non-null, use this as the suffix in preference to
566        any suffix that uriPath might have
567        @param baseDir the path to the starting directory
568        @param maxSegLen the maximum number of characters allowed in one
569        file system path segment (component)
570        @param maxPathLen the maximum number of characters allowed in a
571        file system path
572        @param caseSensitive if true, the file system is assumed to be
573        case-sensitive; otherwise the file system is assumed to be
574        case-insensitive but case-preserving
575        @param dirFile the simple file name to append to a URI path
576        ending in '/'
577        @param characterMap a map from characters (as length-1 String values) in
578        the URI path and query to replacement String values
579        @param dotBegin if non-null, this replaces a '.' at
580        the beginning of a segment
581        @param dotEnd if non-null, this replaces a '.' that appears at the end
582        of a directory name
583        @param tooLongDir if the path length would exceed or be close to
584        exceeding maxPathLen then this simple name is used as a directory
585        under baseDir instead
586        @param suffixAtEnd if true, the suffix is placed at the end of the
587        path, after the query (if any); otherwise, the suffix is placed
588        before the query
589        @param underscoreSet if non-null and a segment, after conversion
590        to lower case, is in this set, then prepend an underscore
591        to the segment
592        @return a path to the file in which to store the resource
593        @throws IOException
594        if a needed directory could not be created
595        @throws IOException
596        if a needed directory is not writeable
597        @throws IOException
598        if a non-directory file exists with the same path as a needed directory
599     */

600     private URIToFileReturn uriToFile(CrawlURI curi, String JavaDoc host, int port,
601             String JavaDoc uriPath, String JavaDoc query, String JavaDoc suffix, String JavaDoc baseDir,
602             int maxSegLen, int maxPathLen, boolean caseSensitive,
603             String JavaDoc dirFile, Map JavaDoc characterMap, String JavaDoc dotBegin, String JavaDoc dotEnd,
604             String JavaDoc tooLongDir, boolean suffixAtEnd, Set JavaDoc underscoreSet)
605             throws IOException JavaDoc {
606         assert (null == host) || (0 != host.length());
607         assert 0 != uriPath.length();
608         assert '/' == uriPath.charAt(0) : "uriPath: " + uriPath;
609         assert -1 == uriPath.indexOf("//") : "uriPath: " + uriPath;
610         assert -1 == uriPath.indexOf("/./") : "uriPath: " + uriPath;
611         assert !uriPath.endsWith("/.") : "uriPath: " + uriPath;
612         assert (null == query) || (-1 == query.indexOf('/'))
613             : "query: " + query;
614         assert (null == suffix)
615             || ((0 != suffix.length()) && (-1 == suffix.indexOf('/')))
616             : "suffix: " + suffix;
617         assert 0 != baseDir.length();
618         assert maxSegLen > 2 : "maxSegLen: " + maxSegLen;
619         assert maxPathLen > 1;
620         assert maxPathLen >= maxSegLen
621             : "maxSegLen: " + maxSegLen + " maxPathLen: " + maxPathLen;
622         assert 0 != dirFile.length();
623         assert -1 == dirFile.indexOf("/") : "dirFile: " + dirFile;
624         assert null != characterMap;
625         assert (null == dotBegin) || (0 != dotBegin.length());
626         assert (null == dotEnd) || !dotEnd.endsWith(".") : "dotEnd: " + dotEnd;
627         assert 0 != tooLongDir.length();
628         assert '/' != tooLongDir.charAt(0) : "tooLongDir: " + tooLongDir;
629
630         int nSegs = 0; // Number of segments in the URI path.
631
for (int i = 0; uriPath.length() != i; ++i) {
632             if ('/' == uriPath.charAt(i)) {
633                 ++nSegs; // Just count slashes.
634
}
635         }
636         assert nSegs > 0 : "uriPath: " + uriPath;
637         PathSegment[] segs = new PathSegment[nSegs]; // The segments.
638
int slashIndex = 0; // Index in uriPath of current /.
639
for (int i = 0; (segs.length - 1) != i; ++i) {
640             int nsi = uriPath.indexOf('/', slashIndex + 1); // Next index.
641
assert nsi > slashIndex : "uriPath: " + uriPath;
642             segs[i] = new DirSegment(uriPath, slashIndex + 1, nsi,
643                                      maxSegLen, caseSensitive, curi,
644                                      characterMap, dotBegin, dotEnd,
645                                      underscoreSet);
646             slashIndex = nsi;
647         }
648         if (slashIndex < (uriPath.length() - 1)) {
649
650             // There's something after the last /.
651
segs[segs.length - 1] = new EndSegment(uriPath, slashIndex + 1,
652                     uriPath.length(), maxSegLen, caseSensitive, curi,
653                     characterMap, dotBegin, query, suffix, maxPathLen,
654                     suffixAtEnd);
655         } else {
656
657             // The URI ends with a /.
658
segs[segs.length - 1] = new EndSegment(dirFile, 0, dirFile.length(),
659                     maxSegLen, caseSensitive, curi, characterMap, null,
660                     query, suffix, maxPathLen, suffixAtEnd);
661         }
662         URIToFileReturn r = dirPath(baseDir, host, port, segs,
663                                     maxPathLen - maxSegLen);
664         if (null == r) {
665
666             // The path is too long.
667
// Replace all the segment directories by tooLongDir.
668
PathSegment endSegment = segs[segs.length - 1];
669             segs = new PathSegment[2];
670             segs[0] = new DirSegment(tooLongDir, 0, tooLongDir.length(),
671                                      maxSegLen, caseSensitive, curi, EMPTY_MAP,
672                                      null, null, null);
673             segs[1] = endSegment;
674             r = dirPath(baseDir, host, port, segs, maxPathLen - maxSegLen);
675         }
676         segs[segs.length - 1].addToPath(r);
677         return r;
678     }
679
680     /**
681        Copies a resource into a file.
682        A temporary file is created and then atomically renamed to
683        the destination file.
684        This prevents leaving a partial file in case of a crash.
685        @param recis the RecordingInputStream that recorded the contents
686        of the resource
687        @param dest the destination file
688        @throws IOException on I/O error
689        @throws IOException if
690        the file rename fails
691     */

692     private void writeToPath(RecordingInputStream recis, File JavaDoc dest)
693         throws IOException JavaDoc {
694         ReplayInputStream replayis = recis.getContentReplayInputStream();
695         File JavaDoc tf = new File JavaDoc (dest.getPath() + "N");
696         FileOutputStream JavaDoc fos = new FileOutputStream JavaDoc(tf);
697         try {
698             replayis.readFullyTo(fos);
699         } finally {
700             fos.close();
701             replayis.close();
702         }
703         if (!tf.renameTo(dest)) {
704             throw new IOException JavaDoc("Can not rename " + tf.getAbsolutePath()
705                                   + " to " + dest.getAbsolutePath());
706         }
707
708     }
709
710     /**
711        This class represents one segment (component) of a URI path.
712        A segment between '/' characters is a directory segment.
713        The segment after the last '/' is the end segment.
714     */

715     abstract class PathSegment {
716         /**
717            existsMaybeCaseSensitive return code
718            for a file that does not exist.
719         */

720         protected static final int EXISTS_NOT = 1;
721
722         /**
723            existsMaybeCaseSensitive return code
724            for a file that exists.
725            Furthermore, the comparison is case-sensitive.
726         */

727         protected static final int EXISTS_EXACT_MATCH = 2;
728
729         /**
730            existsMaybeCaseSensitive return code
731            for a file that exists, using a case-insensitive comparison.
732            Furthermore, the file would not exist if the comparison
733            were case-sensitive.
734         */

735         protected static final int EXISTS_CASE_INSENSITIVE_MATCH = 3;
736
737         /** The URI, for logging and error reporting.*/
738         protected CrawlURI curi;
739
740         /**
741            The main part of this segment.
742            For a directory segment, that's all there is.
743            For an end segment, it's the part of the URI after the last '/'
744            up to but not including the '.' before the suffix (if any).
745         */

746         protected LumpyString mainPart = null;
747
748         /**
749            The maximum number of characters allowed
750            in one file system path segment.
751            A URI segment can potentially be much longer,
752            but we'll trim it to this.
753         */

754         protected int maxSegLen;
755
756         /** If true, the file system is assumed to be
757             case-sensitive; otherwise the file system is assumed to be
758             case-insensitive.
759         */

760         private boolean caseSensitive;
761
762         /**
763            Creates a new PathSegment.
764            @param maxSegLen the maximum number of characters
765            allowed in one path segment
766            @param caseSensitive if true, the file system is assumed to be
767            case-sensitive; otherwise the file system is assumed to be
768            case-insensitive
769            @param curi the URI
770            @throws IllegalArgumentException if
771            maxSegLen is too small
772         */

773         PathSegment(int maxSegLen, boolean caseSensitive, CrawlURI curi) {
774             if (maxSegLen < 2) {
775                 throw new IllegalArgumentException JavaDoc("maxSegLen: " + maxSegLen);
776             }
777             this.maxSegLen = maxSegLen;
778             this.caseSensitive = caseSensitive;
779             this.curi = curi;
780         }
781
782         /**
783            Adds this segment to a file path.
784            This is the key method of this class.
785            It extends the given path by one segment,
786            named to obey all constraints.
787            A new directory is created if necessary.
788            @param currentPath the current path, to which this segment is added
789            @throws IOException
790            if a needed directory could not be created
791            @throws IOException
792            if a needed directory is not writeable
793         */

794         abstract void addToPath(URIToFileReturn currentPath) throws IOException JavaDoc;
795
796         /**
797            Checks if a file (including directories) exists.
798            @param fsf the directory containing the file to be checked
799            @param segStr the simple file or directory name
800            @param check the file or directory for which to check
801            @return EXISTS_NOT if check does not exist,
802            EXISTS_EXACT_MATCH if check exists with a name that matches
803            (case-sensitive) segStr, and
804            EXISTS_CASE_INSENSITIVE_MATCH if check exists
805            with a name that matches
806            segStr using a case-insensitive match but not using a
807            case-sensitive match
808         */

809         protected int existsMaybeCaseSensitive(File JavaDoc fsf, String JavaDoc segStr,
810                                                File JavaDoc check) {
811             if (caseSensitive) {
812                 return check.exists() ? EXISTS_EXACT_MATCH : EXISTS_NOT;
813             }
814             if (!check.exists()) {
815                 return EXISTS_NOT;
816             }
817
818             /*
819               The JVM says the file exists, but the file system is assumed to be
820               case-insensitive, so do we have an exact match or just a
821               case-insensitive match? We get an array of all the
822               file names that match (case-insensitive) the one we're
823               checking, then we can look for a case-sensitive match.
824             */

825             String JavaDoc[] fna = fsf.list(new CaseInsensitiveFilenameFilter(segStr));
826             for (int i = 0; fna.length != i; ++i) {
827                 if (segStr.equals(fna[i])) {
828                   return EXISTS_EXACT_MATCH;
829                 }
830             }
831             return EXISTS_CASE_INSENSITIVE_MATCH;
832         }
833
834         /**
835            This class implements a FilenameFilter that matches
836            by name, ignoring case.
837         */

838         class CaseInsensitiveFilenameFilter implements FilenameFilter JavaDoc {
839             /** The file name we're looking for. */
840             private String JavaDoc target;
841
842             /**
843                Creates a CaseInsensitiveFilenameFilter.
844                @param target the target file name
845                @throws IllegalArgumentException if
846                target is null or empty.
847             */

848             CaseInsensitiveFilenameFilter(String JavaDoc target) {
849                 if (null == target) {
850                     throw new IllegalArgumentException JavaDoc("target null");
851                 }
852                 if (0 == target.length()) {
853                     throw new IllegalArgumentException JavaDoc("target empty");
854                 }
855                 this.target = target;
856             }
857
858             public boolean accept(File JavaDoc dir, String JavaDoc name) {
859                 return target.equalsIgnoreCase(name);
860             }
861         }
862     }
863
864     /**
865        This class represents one directory segment (component) of a URI path.
866     */

867     class DirSegment extends PathSegment {
868         /** If a segment name is in this set, prepend an underscore.*/
869         private Set JavaDoc underscoreSet;
870
871         /**
872            Creates a DirSegment.
873            @param uriPath the path part of the URI
874            @param beginIndex the beginning index, inclusive, of the substring
875            of uriPath to be used
876            @param endIndex the ending index, exclusive, of the substring
877            of uriPath to be used
878            @param maxSegLen the maximum number of characters allowed in one
879            file system path segment (component)
880            @param caseSensitive if true, the file system is assumed to be
881            case-sensitive; otherwise the file system is assumed to be
882            case-insensitive but case-preserving
883            @param curi the URI
884            @param characterMap a map from characters
885            (as length-1 String values) in
886            the URI path and query to replacement String values
887            @param dotBegin if non-null, this replaces a '.' at
888            the beginning of the directory name
889            @param dotEnd if non-null, this replaces a '.'
890            that appears at the end of a directory name
891            @param underscoreSet if non-null and a segment, after conversion
892            to lower case, is in this set, then prepend an underscore
893            to the segment
894            @throws IllegalArgumentException if
895            beginIndex is negative.
896            @throws IllegalArgumentException if
897            endIndex is less than beginIndex.
898            @throws IllegalArgumentException if
899            maxSegLen is too small.
900         */

901         DirSegment(String JavaDoc uriPath, int beginIndex, int endIndex, int maxSegLen,
902                    boolean caseSensitive, CrawlURI curi, Map JavaDoc characterMap,
903                    String JavaDoc dotBegin, String JavaDoc dotEnd, Set JavaDoc underscoreSet) {
904             super(maxSegLen, caseSensitive, curi);
905             mainPart = new LumpyString(uriPath, beginIndex, endIndex,
906                                        (null == dotEnd) ? 0 : dotEnd.length(),
907                                        this.maxSegLen, characterMap, dotBegin);
908             if (null != dotEnd) {
909
910                 // We might get a segment like /VeryLong............../
911
// so we have to loop to guarantee the segment doesn't
912
// end with a dot.
913
int dl = dotEnd.length();
914                 while (mainPart.endsWith('.')) {
915
916                     // Chop off the dot at the end.
917
mainPart.trimToMax(mainPart.length() - 1);
918                     if ((mainPart.length() + dl) <= this.maxSegLen) {
919                         mainPart.append(dotEnd);
920                     }
921                 }
922             }
923             this.underscoreSet = underscoreSet;
924         }
925
926         void addToPath(URIToFileReturn currentPath) throws IOException JavaDoc {
927             NumberFormat JavaDoc nf = null;
928             int startLen = mainPart.length(); // Starting length.
929
for (int i = 0; ; ++i) {
930                 if (0 != i) {
931
932                     // Try to create a unique file name by appending a
933
// number.
934
if (null == nf) {
935                         nf = NumberFormat.getIntegerInstance();
936                     }
937                     String JavaDoc ending = nf.format(i);
938                     mainPart.trimToMax(Math.min(startLen,
939                                                 maxSegLen - ending.length()));
940                     mainPart.append(ending);
941                 }
942                 String JavaDoc segStr = mainPart.toString();
943                 if ((null != underscoreSet)
944                         && underscoreSet.contains(segStr.toLowerCase())) {
945                     mainPart.prepend('_');
946                     ++startLen;
947                     mainPart.trimToMax(maxSegLen);
948                     segStr = mainPart.toString();
949                 }
950                 File JavaDoc fsf = currentPath.getFile();
951                 File JavaDoc f = new File JavaDoc(fsf, segStr);
952                 int er = existsMaybeCaseSensitive(fsf, segStr, f);
953                 switch (er) {
954                 case EXISTS_NOT:
955                     if (!f.mkdir()) {
956                         throw new IOException JavaDoc("Can not mkdir "
957                                               + f.getAbsolutePath());
958                     }
959                     currentPath.append(f, segStr);
960                     return; // Created new directory.
961

962                 case EXISTS_EXACT_MATCH:
963                     if (f.isDirectory()) {
964                         if (!f.canWrite()) {
965                             throw new IOException JavaDoc("Directory "
966                                                   + f.getAbsolutePath()
967                                                   + " not writeable.");
968                         }
969
970                         /*
971                           A writeable directory already exists.
972                           Assume it's the one we want.
973                           This assumption fails for cases like
974                           http://foo.com/a*256/b.html
975                           followed by
976                           http://foo.com/a*256z/b.html
977                           where a*256 means a sequence of the maximum allowed
978                           number of "a"s.
979                         */

980                         currentPath.append(f, segStr);
981                         return;
982                     }
983
984                     /*
985                       A segment already exists but isn't a directory.
986                       This could arise from, for example,
987                       http://foo.com/a*256
988                       followed by
989                       http://foo.com/a*256b/b.html
990                       We need to find a directory we created before in this
991                       situation, or make a new directory with a unique name.
992                       Going around the loop should eventually do that.
993                     */

994                     break;
995
996                 case EXISTS_CASE_INSENSITIVE_MATCH:
997                     /*
998                       A segment already exists that's a case-insensitive match
999                       but not an exact match. It may or may not be a directory.
1000                      This could arise, on a case-insensitive, case-preserving
1001                      file system (such as Macintosh HFS+). For example,
1002                      http://foo.com/bar/z.html
1003                      followed by
1004                      http://foo.com/BAR/z.html
1005                      would do it. We want bar and BAR to turn into different
1006                      directories.
1007                      Going around the loop should eventually do that.
1008                    */

1009                    break;
1010
1011                default:
1012                    throw new IllegalStateException JavaDoc("Code: " + er);
1013                }
1014            }
1015        }
1016    }
1017
1018    /**
1019       This class represents the last segment (component) of a URI path.
1020    */

1021    class EndSegment extends PathSegment {
1022        /**
1023           The number of characters in the path up to this EndSegment,
1024           including the final File.separatorChar.
1025        */

1026        private int dirPathLen;
1027
1028        /**
1029           The maximum number of characters allowed in a file path, minus 1.
1030           The extra 1 is reserved for temporarily appending
1031           a character so an existing file can be replaced atomically,
1032           for example, by writing
1033           <code>foo.htmlN</code>
1034           and then renaming it to
1035           <code>foo.html</code>.
1036        */

1037        private int maxPathLen;
1038
1039        /** The query part of the URI, or null if none.*/
1040        private LumpyString query = null;
1041
1042        /**
1043           The suffix, or null if none.
1044           This isn't a LumpyString because we'd only trim a suffix
1045           if space were very, very tight.
1046        */

1047        private String JavaDoc suffix = null;
1048
1049        /**
1050           True if the suffix goes at the end, after the query.
1051           False if the suffix goes before the query.
1052        */

1053        private boolean suffixAtEnd;
1054
1055        /** Appended to mainPart if necessary to create a unique file name.*/
1056        private String JavaDoc uniquePart = null;
1057
1058        /**
1059           Creates an EndSegment.
1060           @param uriPath the path part of the URI
1061           @param beginIndex the beginning index, inclusive, of the substring
1062           of uriPath to be used
1063           @param endIndex the ending index, exclusive, of the substring
1064           of uriPath to be used
1065           @param maxSegLen the maximum number of characters allowed in one
1066           file system path segment (component)
1067           @param caseSensitive if true, the file system is assumed to be
1068           case-sensitive; otherwise the file system is assumed to be
1069           case-insensitive but case-preserving
1070           @param curi the URI
1071           @param characterMap maps characters (as length-1 String values) in
1072           the URI path and query to replacement String values
1073           @param dotBegin if non-null, this replaces a '.' at
1074           the beginning of the segment
1075           @param query the query part of the URI, or null if none
1076           @param suffix if non-null, use this as the suffix in preference to
1077           any suffix that uriPath might have
1078           @param maxPathLen the maximum number of characters allowed in a
1079           file system path
1080           @param suffixAtEnd if true, the suffix is placed at the end of the
1081           path, after the query (if any); otherwise, the suffix is placed
1082           before the query
1083           @throws IllegalArgumentException if
1084           beginIndex is negative.
1085           @throws IllegalArgumentException if
1086           endIndex is less than beginIndex.
1087           @throws IllegalArgumentException if
1088           maxSegLen is too small.
1089        */

1090        EndSegment(String JavaDoc uriPath, int beginIndex, int endIndex, int maxSegLen,
1091                   boolean caseSensitive, CrawlURI curi, Map JavaDoc characterMap,
1092                   String JavaDoc dotBegin, String JavaDoc query, String JavaDoc suffix,
1093                   int maxPathLen, boolean suffixAtEnd) {
1094            super(maxSegLen - 1, caseSensitive, curi);
1095            int mpe = endIndex; // endIndex for the main part (no suffix).
1096
int ldi = uriPath.lastIndexOf('.'); // Index of last dot.
1097
if ((ldi > 0) && (ldi < (endIndex - 1)) && (ldi > beginIndex)) {
1098                mpe = ldi; // uriPath has a suffix.
1099
}
1100            this.suffix = suffix;
1101            if ((null == this.suffix) && (mpe < (endIndex - 1))) {
1102
1103                // There's no replacement suffix and uriPath has a suffix.
1104
// Run it through a LumpyString to do the character mapping.
1105
LumpyString ls = new LumpyString(uriPath, mpe + 1, endIndex, 0,
1106                                                 this.maxSegLen, characterMap,
1107                                                 null);
1108                this.suffix = ls.toString();
1109            }
1110            int pad = ((null == this.suffix) ? 0 : (1 + this.suffix.length()))
1111                + ((null == query) ? 0 : query.length());
1112            mainPart = new LumpyString(uriPath, beginIndex, mpe, pad,
1113                                       this.maxSegLen, characterMap, dotBegin);
1114            this.maxPathLen = maxPathLen - 1;
1115            if (null != query) {
1116                this.query = new LumpyString(query, 0, query.length(), 0,
1117                                             this.maxSegLen, characterMap,
1118                                             null);
1119            }
1120            this.suffixAtEnd = suffixAtEnd;
1121        }
1122
1123        void addToPath(URIToFileReturn currentPath) {
1124            File JavaDoc fsf = currentPath.getFile();
1125            NumberFormat JavaDoc nf = null;
1126            dirPathLen = 1 + fsf.getPath().length();
1127            for (int i = 0; ; ++i) {
1128                if (0 != i) {
1129                    if (null == nf) {
1130                        nf = NumberFormat.getIntegerInstance();
1131                    }
1132                    uniquePart = nf.format(i);
1133                }
1134                trimWithPadding((null == uniquePart) ? 0 : uniquePart.length());
1135                String JavaDoc segStr = joinParts(); // This EndSegment as a String.
1136
File JavaDoc f = new File JavaDoc(fsf, segStr);
1137
1138                // Code for whether file exists.
1139
int er = existsMaybeCaseSensitive(fsf, segStr, f);
1140                switch (er) {
1141                case EXISTS_NOT:
1142                    currentPath.append(f, segStr);
1143                    return;
1144
1145                case EXISTS_EXACT_MATCH:
1146                    if (f.isFile()) {
1147                        currentPath.append(f, segStr);
1148                        return;
1149                    }
1150
1151                    /*
1152                      A file already exists but isn't an ordinary file.
1153                      It might be a directory, special file, named pipe,
1154                      whatever.
1155                      We need to find an unused file name,
1156                      or an ordinary file.
1157                      Going around the loop should eventually do that.
1158                    */

1159                    break;
1160
1161                case EXISTS_CASE_INSENSITIVE_MATCH:
1162                    /*
1163                      A file already exists that's a case-insensitive match
1164                      but not an exact match.
1165                      This could arise, on a case-insensitive, case-preserving
1166                      file system (such as Macintosh HFS+). For example,
1167                      http://foo.com/files.zip
1168                      followed by
1169                      http://foo.com/FILES.ZIP
1170                      would do it. We want files.zip and FILES.ZIP to turn into
1171                      different files. Going around the loop should eventually
1172                      do that.
1173                    */

1174                    break;
1175
1176                default:
1177                    throw new IllegalStateException JavaDoc("Code: " + er);
1178                }
1179            }
1180        }
1181
1182        /**
1183           Creates a simple file name from the parts of this EndSegment.
1184           @return a simple file name constructed from the main part,
1185           unique part, query, and suffix
1186        */

1187        private String JavaDoc joinParts() {
1188            StringBuffer JavaDoc sb = new StringBuffer JavaDoc(length());
1189            sb.append(mainPart.asStringBuffer());
1190            if (null != uniquePart) {
1191                sb.append(uniquePart);
1192            }
1193            if (suffixAtEnd) {
1194                if (null != query) {
1195                    sb.append(query);
1196                }
1197                if (null != suffix) {
1198                    sb.append('.');
1199                    sb.append(suffix);
1200                }
1201            } else {
1202                if (null != suffix) {
1203                    sb.append('.');
1204                    sb.append(suffix);
1205                }
1206                if (null != query) {
1207                    sb.append(query);
1208                }
1209            }
1210            return sb.toString();
1211        }
1212
1213        /**
1214           Gets the number of available character positions.
1215           If this EndSegment were converted to a path,
1216           it would have a path length and a segment length.
1217           There are two constraints: maxSegLen and maxPathLen.
1218           The number of character positions available before bumping
1219           into the lower constraint is computed.
1220           @return the number of available positions, which may be negative
1221        */

1222        private int lenAvail() {
1223            int len = length();
1224            return Math.min(maxSegLen - len, maxPathLen - dirPathLen - len);
1225        }
1226
1227        /**
1228           Gets the length of the simple file name that would be
1229           created for this EndSegment.
1230           @return the length
1231        */

1232        private int length() {
1233            int r = mainPart.length(); // Return value.
1234
if (null != uniquePart) {
1235                r += uniquePart.length();
1236            }
1237            if (null != query) {
1238                r += query.length();
1239            }
1240            if (null != suffix) {
1241                r += 1 + suffix.length(); // 1 for the '.'
1242
}
1243            return r;
1244        }
1245
1246        /**
1247           Trims this EndSegment so a given number of characters are available.
1248           After trimming, there will be room for at least
1249           padding more characters before one of the constraints is
1250           encountered.
1251           The choices for trimming, in priority order, are:
1252           <ol>
1253           <li>Shorten the query.</li>
1254           <li>Remove the query.</li>
1255           <li>Shorten the main part.</li>
1256           <li>Shorten the suffix.</li>
1257           </ol>
1258           @param padding the number of character positions that need to be
1259           available
1260           @throws IllegalStateException
1261           if it's impossible to trim enough
1262        */

1263        private void trimWithPadding(int padding) {
1264            assert padding >= 0 : "padding: " + padding;
1265            int la = lenAvail();
1266            if (la >= padding) {
1267                return;
1268            }
1269
1270            // We need space for (padding - la) characters.
1271
// la might be negative.
1272
if (null != query) {
1273                query.trimToMax(Math.max(0, query.length() - (padding - la)));
1274                if (0 == query.length()) {
1275                    query = null;
1276                }
1277                la = lenAvail();
1278                if (la >= padding) {
1279                    return;
1280                }
1281            }
1282            mainPart.trimToMax(Math.max(1, mainPart.length() - (padding - la)));
1283            la = lenAvail();
1284            if (la >= padding) {
1285                return;
1286            }
1287            if (null != suffix) {
1288                suffix = suffix.substring(0, Math.max(1, suffix.length()
1289                                                      - (padding - la)));
1290                la = lenAvail();
1291                if (la >= padding) {
1292                    return;
1293                }
1294            }
1295            throw new IllegalStateException JavaDoc("Can not trim " + curi.toString());
1296        }
1297    }
1298
1299    /**
1300       This class represents a dynamically growable string
1301       consisting of substrings ("lumps") that
1302       are treated atomically. If the string is shortened, then an entire
1303       lump is removed. The intent is to treat each %XX escape as a lump.
1304       This class also allows single characters in a source string to be
1305       re-mapped to a different string, possible containing more than
1306       one character.
1307       Each re-mapped character is also treated as a lump.
1308       <p>
1309       For example, suppose part of a URI, between two slashes, is
1310       <code>/VeryLongString...%3A/</code>.
1311       We want to create a corresponding file system directory, but the string
1312       is a little longer than the allowed maximum.
1313       It's better to trim the entire
1314       <code>%3A</code>
1315       off the end than part of it.
1316       This is especially true if, later, we need to append some digits
1317       to create a unique directory name.
1318       So we treat the entire
1319       <code>%3A</code>
1320       as one lump.
1321    */

1322    class LumpyString {
1323        /**
1324           Lumps are indicated by an auxiliary array aux[],
1325           indexed the same as the string. The LUMP_BEGIN bit is set
1326           for a position in the string at which a lump begins.
1327        */

1328        private static final byte LUMP_BEGIN = 0x1;
1329
1330        /** Bit set for the end of a lump. */
1331        private static final byte LUMP_END = 0x2;
1332
1333        /**
1334           Bit set for all characters in a lump of length greater than 1,
1335           except the beginning and ending characters.
1336        */

1337        private static final byte LUMP_MID = 0x4;
1338
1339        /** The auxiliary array. */
1340        private byte[] aux;
1341
1342        /** Holds the string. */
1343        private StringBuffer JavaDoc string;
1344
1345        /**
1346           Creates a LumpyString.
1347           @param str the source string
1348           @param beginIndex the beginning index, inclusive, of the substring
1349           of str to be used
1350           @param endIndex the ending index, exclusive, of the substring
1351           of str to be used
1352           @param padding reserve this many additional character positions
1353           before dynamic growth is needed
1354           @param maxLen the maximum string length, regardless of the
1355           values of beginIndex, endIndex, and padding
1356           @param characterMap maps from characters in the source string
1357           (represented as length-one String values) to replacement String
1358           values (length at least 1).
1359           Each replacement string is treated as one lump.
1360           This is intended to cope with characters that a file system
1361           does not allow.
1362           @param dotBegin if non-null, this replaces a '.' at
1363           <code>str[beginIndex]</code>
1364           @throws IllegalArgumentException if
1365           beginIndex is negative.
1366           @throws IllegalArgumentException if
1367           endIndex is less than beginIndex.
1368           @throws IllegalArgumentException if
1369           padding is negative.
1370           @throws IllegalArgumentException if
1371           maxLen is less than one.
1372           @throws IllegalArgumentException if
1373           characterMap is null.
1374           @throws IllegalArgumentException if
1375           dotBegin is non-null but empty.
1376        */

1377        LumpyString(String JavaDoc str, int beginIndex, int endIndex, int padding,
1378                    int maxLen, Map JavaDoc characterMap, String JavaDoc dotBegin) {
1379            if (beginIndex < 0) {
1380                throw new IllegalArgumentException JavaDoc("beginIndex < 0: "
1381                                                   + beginIndex);
1382            }
1383            if (endIndex < beginIndex) {
1384                throw new IllegalArgumentException JavaDoc("endIndex < beginIndex "
1385                    + "beginIndex: " + beginIndex + "endIndex: " + endIndex);
1386            }
1387            if (padding < 0) {
1388                throw new IllegalArgumentException JavaDoc("padding < 0: " + padding);
1389            }
1390            if (maxLen < 1) {
1391                throw new IllegalArgumentException JavaDoc("maxLen < 1: " + maxLen);
1392            }
1393            if (null == characterMap) {
1394                throw new IllegalArgumentException JavaDoc("characterMap null");
1395            }
1396            if ((null != dotBegin) && (0 == dotBegin.length())) {
1397                throw new IllegalArgumentException JavaDoc("dotBegin empty");
1398            }
1399
1400            // Initial capacity. Leave some room for %XX lumps.
1401
// Guaranteed positive.
1402
int cap = Math.min(2 * (endIndex - beginIndex) + padding + 1,
1403                               maxLen);
1404            string = new StringBuffer JavaDoc(cap);
1405            aux = new byte[cap];
1406            for (int i = beginIndex; i != endIndex; ++i) {
1407                String JavaDoc s = str.substring(i, i + 1);
1408                String JavaDoc lump; // Next lump.
1409
if (".".equals(s) && (i == beginIndex) && (null != dotBegin)) {
1410                    lump = dotBegin;
1411                } else {
1412                    lump = (String JavaDoc) characterMap.get(s);
1413                }
1414                if (null == lump) {
1415                    if ("%".equals(s) && ((endIndex - i) > 2)
1416                            && (-1 != Character.digit(str.charAt(i + 1), 16))
1417                            && (-1 != Character.digit(str.charAt(i + 2), 16))) {
1418
1419                        // %XX escape; treat as one lump.
1420
lump = str.substring(i, i + 3);
1421                        i += 2;
1422                    } else {
1423                        lump = s;
1424                    }
1425                }
1426                if ((string.length() + lump.length()) > maxLen) {
1427                    assert checkInvariants();
1428                    return;
1429                }
1430                append(lump);
1431            }
1432            assert checkInvariants();
1433        }
1434
1435        /**
1436           Converts this LumpyString to a String.
1437           @return the current string contents
1438        */

1439        public String JavaDoc toString() {
1440            assert checkInvariants();
1441            return string.toString();
1442        }
1443
1444        /**
1445           Appends one lump to the end of this string.
1446           @param lump the lump (substring) to append
1447           @throws IllegalArgumentException if
1448           lump is null or empty.
1449        */

1450        void append(String JavaDoc lump) {
1451            if (null == lump) {
1452                throw new IllegalArgumentException JavaDoc("lump null");
1453            }
1454            int lumpLen = lump.length();
1455            if (0 == lumpLen) {
1456                throw new IllegalArgumentException JavaDoc("lump empty");
1457            }
1458            int pos = string.length(); // Current end of string.
1459
ensureCapacity(pos + lumpLen);
1460            if (1 == lumpLen) {
1461                aux[pos] = LUMP_BEGIN | LUMP_END;
1462            } else {
1463                assert lumpLen > 1;
1464                aux[pos] = LUMP_BEGIN;
1465                ++pos;
1466                for (int i = lumpLen - 2; 0 != i; --i) {
1467                    aux[pos] = LUMP_MID;
1468                    ++pos;
1469                }
1470                aux[pos] = LUMP_END;
1471            }
1472            string.append(lump);
1473            assert checkInvariants();
1474        }
1475
1476        /**
1477           Returns the string as a StringBuffer.
1478           The caller should <em>not</em> modify the return value.
1479           @return the string
1480        */

1481        StringBuffer JavaDoc asStringBuffer() {
1482            return string;
1483        }
1484
1485        /**
1486           Tests if this string ends with a character.
1487           @param ch the character to test for
1488           @return true if and only if this string ends with ch
1489        */

1490        boolean endsWith(char ch) {
1491            assert checkInvariants();
1492            int len = string.length();
1493            return (0 != len) && (string.charAt(len - 1) == ch);
1494        }
1495
1496        /**
1497           Prepends one character, as a lump, to this string.
1498           @param ch the character to prepend
1499        */

1500        void prepend(char ch) {
1501            assert checkInvariants();
1502            int oldLen = string.length();
1503            ensureCapacity(1 + oldLen);
1504            string.insert(0, ch);
1505            System.arraycopy(aux, 0, aux, 1, oldLen);
1506            aux[0] = LUMP_BEGIN | LUMP_END;
1507            assert checkInvariants();
1508        }
1509
1510        /**
1511           Gets the length of this string.
1512           @return the number of characters in this string
1513        */

1514        int length() {
1515            assert checkInvariants();
1516            return string.length();
1517        }
1518
1519        /**
1520           If necessary, trims this string to a maximum length.
1521           Any trimming is done by removing one or more complete
1522           lumps from the end of this string.
1523           @param maxLen the new maximum length.
1524           After trimming, the actual length of this string will be
1525           at most maxLen.
1526           @throws IllegalArgumentException if
1527           maxLen is negative.
1528        */

1529        void trimToMax(int maxLen) {
1530            if (maxLen < 0) {
1531                throw new IllegalArgumentException JavaDoc("maxLen < 0: " + maxLen);
1532            }
1533            assert checkInvariants();
1534            int cl = string.length(); // Current length.
1535
if (cl > maxLen) {
1536                int nl = maxLen; // New length.
1537
while ((0 != nl) && (LUMP_END != (aux[nl - 1] & LUMP_END))) {
1538                    --nl;
1539                }
1540                for (int i = nl; i != cl; ++i) {
1541                    aux[i] = 0;
1542                }
1543                string.setLength(nl);
1544            }
1545            assert checkInvariants();
1546        }
1547
1548        /**
1549           Checks some assertions on the instance variables.
1550           The intended usage is
1551           <code>assert checkInvariants();</code>
1552           so that if assertions are off, no call is made.
1553           @return true
1554        */

1555        private boolean checkInvariants() {
1556
1557            // There's an aux[] element for every character in the StringBuffer.
1558
assert aux.length >= string.length()
1559                : "aux.length: " + aux.length
1560                + " string.length(): " + string.length();
1561
1562            // The first character starts a lump.
1563
assert (0 == string.length())
1564                || (LUMP_BEGIN == (aux[0] & LUMP_BEGIN))
1565                : "aux[0]: " + aux[0];
1566
1567            // The last character ends a lump.
1568
assert (0 == string.length())
1569                || (LUMP_END == (aux[string.length() - 1] & LUMP_END))
1570                : "aux[end]: " + aux[string.length() - 1];
1571            return true;
1572        }
1573
1574        /**
1575           Ensures that the capacity is at least equal to the specified minimum.
1576           @param minCapacity the minimum desired capacity
1577        */

1578        private void ensureCapacity(int minCapacity) {
1579            assert checkInvariants();
1580            if (minCapacity > aux.length) {
1581                int nc = 2 * aux.length; // New capacity.
1582
while (nc < minCapacity) {
1583                    nc *= 2;
1584                }
1585                byte[] oldAux = aux;
1586                aux = new byte[nc];
1587                System.arraycopy(oldAux, 0, aux, 0, string.length());
1588            }
1589            string.ensureCapacity(minCapacity);
1590            assert checkInvariants();
1591        }
1592    }
1593
/**
   This class is returned by uriToFile.
   It represents a file system path, both as a File and as
   a path relative to the base directory.
*/
class URIToFileReturn {
    /** The file system path as a File.*/
    private File filePath;

    /** The relative path from baseDir; it never begins with a separator.*/
    private final StringBuffer relativePath = new StringBuffer(255);

    /**
       Creates a URIToFileReturn.
       @param baseDir the path to the starting directory
       @param host the host part of the URI, or null if the host name
       should not be part of the path
       @param port the port part of the URI, or -1 if the port
       should not be part of the path
    */
    URIToFileReturn(String baseDir, String host, int port) {

        // The initial path.
        StringBuffer startPath = new StringBuffer(baseDir.length() + 32);
        startPath.append(baseDir);
        if (baseDir.endsWith(File.separator)) {
            // Drop the trailing separator; one is added before each segment.
            assert 1 != baseDir.length();
            startPath.deleteCharAt(startPath.length() - 1);
        }
        if (null != host) {
            startPath.append(File.separatorChar);
            startPath.append(host);
            appendRelative(host);
        }
        if (port > 0) {
            startPath.append(File.separatorChar);
            startPath.append(port);

            // With no host the port is the first relative segment,
            // so it must not be preceded by a separator.
            appendRelative(String.valueOf(port));
        }
        filePath = new File(startPath.toString());
    }

    /**
       Appends one more segment to this path.
       @param f a File representing the path with the next segment added
       @param nextSegment the next segment
    */
    void append(File f, String nextSegment) {
        filePath = f;
        appendRelative(nextSegment);
    }

    /**
       Gets this path as a File.
       @return this path
    */
    File getFile() {
        return filePath;
    }

    /**
       Gets this path as a relative path from the base directory.
       @return the relative path
    */
    String getRelativePath() {
        return relativePath.toString();
    }

    /**
       Tests if this path is longer than a given value.
       @param maxLen the value to test
       @return true if and only if this path is longer than maxLen
    */
    boolean longerThan(int maxLen) {
        return filePath.getPath().length() > maxLen;
    }

    /**
       Creates all directories in this path as needed.
       @throws IOException
       if a needed directory could not be created
       @throws IOException
       if a needed directory is not writeable
       @throws IOException
       if a non-directory file exists
       with the same path as a needed directory
    */
    void mkdirs() throws IOException {
        if (filePath.isDirectory()) {
            if (!filePath.canWrite()) {
                throw new IOException("Directory "
                                      + filePath.getAbsolutePath()
                                      + " not writeable.");
            }
            return;
        }
        if (filePath.exists()) {
            // Something other than a directory is in the way.
            throw new IOException("File " + filePath.getAbsolutePath()
                                  + " is not a directory.");
        }

        // File.mkdirs() also returns false when the directory already
        // exists, which happens if another thread created it after the
        // checks above.  Only fail if the directory is really missing.
        if (!filePath.mkdirs() && !filePath.isDirectory()) {
            throw new IOException("Can not mkdir "
                                  + filePath.getAbsolutePath());
        }
    }

    /**
       Adds one segment to the relative path, putting a separator
       in front of it unless it is the first segment.
       @param segment the segment to add
    */
    private void appendRelative(String segment) {
        if (0 != relativePath.length()) {
            relativePath.append(File.separatorChar);
        }
        relativePath.append(segment);
    }
}
1700}
1701
Popular Tags