KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > postprocessor > LowDiskPauseProcessor


1 /*
2  * LowDiskPauseProcessor
3  *
4  * $Id: LowDiskPauseProcessor.java,v 1.5.14.1 2007/01/13 01:31:24 stack-sf Exp $
5  *
6  * Created on Jun 5, 2003
7  *
8  * Copyright (C) 2003 Internet Archive.
9  *
10  * This file is part of the Heritrix web crawler (crawler.archive.org).
11  *
12  * Heritrix is free software; you can redistribute it and/or modify
13  * it under the terms of the GNU Lesser Public License as published by
14  * the Free Software Foundation; either version 2.1 of the License, or
15  * any later version.
16  *
17  * Heritrix is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20  * GNU Lesser Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser Public License
23  * along with Heritrix; if not, write to the Free Software
24  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25  */

26 package org.archive.crawler.postprocessor;
27
28 import java.io.IOException JavaDoc;
29 import java.util.Arrays JavaDoc;
30 import java.util.List JavaDoc;
31 import java.util.logging.Level JavaDoc;
32 import java.util.logging.Logger JavaDoc;
33 import java.util.regex.Matcher JavaDoc;
34 import java.util.regex.Pattern JavaDoc;
35
36 import org.archive.crawler.datamodel.CrawlURI;
37 import org.archive.crawler.framework.Processor;
38 import org.archive.crawler.settings.SimpleType;
39 import org.archive.crawler.settings.Type;
40 import org.archive.util.IoUtils;
41
42 /**
43  * Processor module which uses 'df -k', where available and with
44  * the expected output format (on Linux), to monitor available
45  * disk space and pause the crawl if free space on monitored
46  * filesystems falls below certain thresholds.
47  */

48 public class LowDiskPauseProcessor extends Processor {
49
50     private static final long serialVersionUID = 3338337700768396302L;
51
52     /**
53      * Logger.
54      */

55     private static final Logger JavaDoc logger =
56         Logger.getLogger(LowDiskPauseProcessor.class.getName());
57
58     /**
59      * List of mounts to monitor; should match "Mounted on" column of 'df' output
60      */

61     public static final String JavaDoc ATTR_MONITOR_MOUNTS = "monitor-mounts";
62     public static final String JavaDoc DEFAULT_MONITOR_MOUNTS = "";
63     
64     /**
65      * Space available level below which a crawl-pause should be triggered.
66      */

67     public static final String JavaDoc ATTR_PAUSE_THRESHOLD = "pause-threshold-kb";
68     public static final int DEFAULT_PAUSE_THRESHOLD = 500 * 1024; // 500MB
69

70     /**
71      * Amount of content received between each recheck of free space
72      */

73     public static final String JavaDoc ATTR_RECHECK_THRESHOLD = "recheck-threshold-kb";
74     public static final int DEFAULT_RECHECK_THRESHOLD = 200 * 1024; // 200MB
75

76     protected int contentSinceCheck = 0;
77     
78     public static final Pattern JavaDoc VALID_DF_OUTPUT =
79         Pattern.compile("(?s)^Filesystem\\s+1K-blocks\\s+Used\\s+Available\\s+Use%\\s+Mounted on\\n.*");
80     public static final Pattern JavaDoc AVAILABLE_EXTRACTOR =
81         Pattern.compile("(?m)\\s(\\d+)\\s+\\d+%\\s+(\\S+)$");
82     
83     /**
84      * @param name Name of this writer.
85      */

86     public LowDiskPauseProcessor(String JavaDoc name) {
87         super(name, "LowDiskPause processor");
88         Type e = addElementToDefinition(
89             new SimpleType(ATTR_MONITOR_MOUNTS,
90                     "Space-delimited list of filessystem mounts whose " +
91                     "'available' space should be monitored via 'df' " +
92                     "(if available).",
93                 DEFAULT_MONITOR_MOUNTS));
94         e.setOverrideable(false);
95         e = addElementToDefinition(
96             new SimpleType(ATTR_PAUSE_THRESHOLD,
97                     "When available space on any monitored mounts falls " +
98                     "below this threshold, the crawl will be paused. ",
99                     new Integer JavaDoc(DEFAULT_PAUSE_THRESHOLD)));
100         e = addElementToDefinition(
101             new SimpleType(ATTR_RECHECK_THRESHOLD,
102                     "Available space via 'df' is rechecked after every " +
103                     "increment of this much content (uncompressed) is " +
104                     "observed. ",
105                     new Integer JavaDoc(DEFAULT_RECHECK_THRESHOLD)));
106         e.setOverrideable(false);
107     }
108     
109     /**
110      * Notes a CrawlURI's content size in its running tally. If the
111      * recheck increment of content has passed through since the last
112      * available-space check, checks available space and pauses the
113      * crawl if any monitored mounts are below the configured threshold.
114      *
115      * @param curi CrawlURI to process.
116      */

117     protected void innerProcess(CrawlURI curi) {
118         contentSinceCheck += curi.getContentSize();
119         synchronized (this) {
120             if (contentSinceCheck/1024 > ((Integer JavaDoc) getUncheckedAttribute(null,
121                     ATTR_RECHECK_THRESHOLD)).intValue()) {
122                 checkAvailableSpace(curi);
123                 contentSinceCheck = 0;
124             }
125         }
126     }
127
128
129     /**
130      * Probe via 'df' to see if monitored mounts have fallen
131      * below the pause available threshold. If so, request a
132      * crawl pause.
133      * @param curi Current context.
134      */

135     private void checkAvailableSpace(CrawlURI curi) {
136         try {
137             String JavaDoc df = IoUtils.readFullyAsString(Runtime.getRuntime().exec(
138                     "df -k").getInputStream());
139             Matcher JavaDoc matcher = VALID_DF_OUTPUT.matcher(df);
140             if(!matcher.matches()) {
141                 logger.severe("'df -k' output unacceptable for low-disk checking");
142                 return;
143             }
144             List JavaDoc monitoredMounts = Arrays.asList(((String JavaDoc) getUncheckedAttribute(null,
145                     ATTR_MONITOR_MOUNTS)).split("\\s*"));
146             matcher = AVAILABLE_EXTRACTOR.matcher(df);
147             while (matcher.find()) {
148                 String JavaDoc mount = matcher.group(2);
149                 if (monitoredMounts.contains(mount)) {
150                     long availKilobytes = Long.parseLong(matcher.group(1));
151                     int thresholdKilobytes = ((Integer JavaDoc) getUncheckedAttribute(
152                             null, ATTR_PAUSE_THRESHOLD)).intValue();
153                     if (availKilobytes < thresholdKilobytes ) {
154                         getController().requestCrawlPause();
155                         logger.log(Level.SEVERE, "Low Disk Pause",
156                                 availKilobytes + "K available on " + mount
157                                         + " (below threshold "
158                                         + thresholdKilobytes + "K)");
159                         break;
160                     }
161                 }
162             }
163         } catch (IOException JavaDoc e) {
164             curi.addLocalizedError(this.getName(), e,
165                     "problem checking available space via 'df'");
166         }
167     }
168 }
169
Popular Tags