KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > roller > util > Blacklist


1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. The ASF licenses this file to You
4 * under the Apache License, Version 2.0 (the "License"); you may not
5 * use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License. For additional information regarding
15 * copyright in this work, please see the NOTICE file in the top level
16 * directory of this distribution.
17 */

18 /* Created on Nov 11, 2003 */
19 package org.apache.roller.util;
20
21 import org.apache.commons.logging.Log;
22 import org.apache.commons.logging.LogFactory;
23 import java.io.BufferedReader JavaDoc;
24 import java.io.FileInputStream JavaDoc;
25 import java.io.IOException JavaDoc;
26 import java.io.InputStream JavaDoc;
27 import java.io.InputStreamReader JavaDoc;
28 import java.io.File JavaDoc;
29 import java.io.FileOutputStream JavaDoc;
30 import java.net.HttpURLConnection JavaDoc;
31 import java.net.URL JavaDoc;
32 import java.text.ParseException JavaDoc;
33 import java.text.SimpleDateFormat JavaDoc;
34 import java.util.ArrayList JavaDoc;
35 import java.util.Date JavaDoc;
36 import java.util.Iterator JavaDoc;
37 import java.util.LinkedList JavaDoc;
38 import java.util.List JavaDoc;
39 import java.util.StringTokenizer JavaDoc;
40 import java.util.regex.Matcher JavaDoc;
41 import java.util.regex.Pattern JavaDoc;
42 import org.apache.roller.config.RollerConfig;
43 import org.apache.commons.lang.StringUtils;
44
45 /**
46  * Loads MT-Blacklist style blacklist from disk and allows callers to test
47  * strings against the blacklist and (optionally) addition blacklists.
48  * <br />
49  * First looks for blacklist.txt in uploads directory, than in classpath
50  * as /blacklist.txt. Download from web feature disabed.
51  * <br />
52  * Blacklist is formatted one entry per line.
53  * Any line that begins with # is considered to be a comment.
54  * Any line that begins with ( is considered to be a regex expression.
55  * <br />
56  * For more information on the (discontinued) MT-Blacklist service:
57  * http://www.jayallen.org/projects/mt-blacklist.
58  *
59  * @author Lance Lavandowska
60  * @author Allen Gilliland
61  */

62 public class Blacklist {
63     
64     private static Log mLogger = LogFactory.getLog(Blacklist.class);
65     
66     private static Blacklist blacklist;
67     private static final String JavaDoc blacklistFile = "blacklist.txt";
68     private static final String JavaDoc lastUpdateStr = "Last update:";
69
70     /** We no longer have a blacklist update URL */
71     private static final String JavaDoc blacklistURL = null;
72
73     private Date JavaDoc lastModified = null;
74     private List JavaDoc blacklistStr = new LinkedList JavaDoc();
75     private List JavaDoc blacklistRegex = new LinkedList JavaDoc();
76     
77     // setup our singleton at class loading time
78
static {
79         mLogger.info("Initializing MT Blacklist");
80         blacklist = new Blacklist();
81         blacklist.loadBlacklistFromFile(null);
82     }
83     
84     /** Hide constructor */
85     private Blacklist() {
86     }
87       
88     /** Singleton factory method. */
89     public static Blacklist getBlacklist() {
90         return blacklist;
91     }
92     
93     /** Updated MT blacklist if necessary. */
94     public static void checkForUpdate() {
95         getBlacklist().update();
96     }
97     
98     /** Non-Static update method. */
99     public void update() {
100         if (this.blacklistURL != null) {
101             boolean blacklist_updated = this.downloadBlacklist();
102             if (blacklist_updated) {
103                 this.loadBlacklistFromFile(null);
104             }
105         }
106     }
107         
108     /** Download the MT blacklist from the web to our uploads directory. */
109     private boolean downloadBlacklist() {
110         
111         boolean blacklist_updated = false;
112         try {
113             mLogger.debug("Attempting to download MT blacklist");
114             
115             URL JavaDoc url = new URL JavaDoc(blacklistURL);
116             HttpURLConnection JavaDoc connection =
117                     (HttpURLConnection JavaDoc) url.openConnection();
118             
119             // after spending way too much time debugging i've discovered
120
// that the blacklist server is selective based on the User-Agent
121
// header. without this header set i always get a 403 response :(
122
connection.setRequestProperty("User-Agent", "Mozilla/5.0");
123             
124             if (this.lastModified != null) {
125                 connection.setRequestProperty("If-Modified-Since",
126                         DateUtil.formatRfc822(this.lastModified));
127             }
128             
129             int responseCode = connection.getResponseCode();
130             
131             mLogger.debug("HttpConnection response = "+responseCode);
132             
133             // did the connection return NotModified? If so, no need to parse
134
if (responseCode == HttpURLConnection.HTTP_NOT_MODIFIED) {
135                 mLogger.debug("MT blacklist site says we are current");
136                 return false;
137             }
138             
139             // did the connection return a LastModified header?
140
long lastModifiedLong =
141                     connection.getHeaderFieldDate("Last-Modified", -1);
142             
143             // if the file is newer than our current then we need do update it
144
if (responseCode == HttpURLConnection.HTTP_OK &&
145                     (this.lastModified == null ||
146                     this.lastModified.getTime() < lastModifiedLong)) {
147
148                 mLogger.debug("my last modified = "+this.lastModified.getTime());
149                 mLogger.debug("MT last modified = "+lastModifiedLong);
150                 
151                 // save the new blacklist
152
InputStream JavaDoc instream = connection.getInputStream();
153                 
154                 String JavaDoc uploadDir = RollerConfig.getProperty("uploads.dir");
155                 String JavaDoc path = uploadDir + File.separator + blacklistFile;
156                 FileOutputStream JavaDoc outstream = new FileOutputStream JavaDoc(path);
157                 
158                 mLogger.debug("writing updated MT blacklist to "+path);
159                 
160                 // read from url and write to file
161
byte[] buf = new byte[4096];
162                 int length = 0;
163                 while((length = instream.read(buf)) > 0)
164                     outstream.write(buf, 0, length);
165                 
166                 outstream.close();
167                 instream.close();
168                 
169                 blacklist_updated = true;
170                 
171                 mLogger.debug("MT blacklist download completed.");
172                 
173             } else {
174                 mLogger.debug("blacklist *NOT* saved, assuming we are current");
175             }
176             
177         } catch (Exception JavaDoc e) {
178             mLogger.error("error downloading blacklist", e);
179         }
180         
181         return blacklist_updated;
182     }
183         
184     /**
185      * Load the MT blacklist from the file system.
186      * We look for a previously downloaded version of the blacklist first and
187      * if it's not found then we load the default blacklist packed with Roller.
188      * Only public for purposes of unit testing.
189      */

190     public void loadBlacklistFromFile(String JavaDoc blacklistFilePath) {
191         
192         InputStream JavaDoc txtStream = null;
193         try {
194             String JavaDoc path = blacklistFilePath;
195             if (path == null) {
196                 String JavaDoc uploadDir = RollerConfig.getProperty("uploads.dir");
197                 path = uploadDir + File.separator + blacklistFile;
198             }
199             File JavaDoc blacklistFile = new File JavaDoc(path);
200             
201             // check our lastModified date to see if we need to re-read the file
202
if (this.lastModified != null &&
203                     this.lastModified.getTime() >= blacklistFile.lastModified()) {
204                 mLogger.debug("Blacklist is current, no need to load again");
205                 return;
206             } else {
207                 this.lastModified = new Date JavaDoc(blacklistFile.lastModified());
208             }
209             txtStream = new FileInputStream JavaDoc(blacklistFile);
210             mLogger.info("Loading blacklist from "+path);
211             
212         } catch (Exception JavaDoc e) {
213             // Roller keeps a copy in the webapp just in case
214
txtStream = getClass().getResourceAsStream("/"+blacklistFile);
215             mLogger.warn("Couldn't find downloaded blacklist, "
216                         + "loading from classpath instead");
217         }
218         
219         if (txtStream != null) {
220             readFromStream(txtStream, false);
221         } else {
222             mLogger.error("Couldn't load a blacklist file from anywhere, "
223                         + "this means blacklist checking is disabled for now.");
224         }
225         mLogger.info("Number of blacklist string rules: "+blacklistStr.size());
226         mLogger.info("Number of blacklist regex rules: "+blacklistRegex.size());
227     }
228        
229     /**
230      * Read in the InputStream for rules.
231      * @param txtStream
232      */

233     private String JavaDoc readFromStream(InputStream JavaDoc txtStream, boolean saveStream) {
234         String JavaDoc line;
235         StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
236         BufferedReader JavaDoc in = null;
237         try {
238             in = new BufferedReader JavaDoc(
239                     new InputStreamReader JavaDoc( txtStream, "UTF-8" ) );
240             while ((line = in.readLine()) != null) {
241                 if (line.startsWith("#")) {
242                     readComment(line);
243                 } else {
244                     readRule(line);
245                 }
246                 
247                 if (saveStream) buf.append(line).append("\n");
248             }
249         } catch (Exception JavaDoc e) {
250             mLogger.error(e);
251         } finally {
252             try {
253                 if (in != null) in.close();
254             } catch (IOException JavaDoc e1) {
255                 mLogger.error(e1);
256             }
257         }
258         return buf.toString();
259     }
260     
261     private void readRule(String JavaDoc str) {
262         if (StringUtils.isEmpty(str)) return; // bad condition
263

264         String JavaDoc rule = str.trim();
265         
266         if (str.indexOf("#") > 0) // line has a comment
267
{
268             int commentLoc = str.indexOf("#");
269             rule = str.substring(0, commentLoc-1).trim(); // strip comment
270
}
271         
272         if (rule.indexOf( "(" ) > -1) // regex rule
273
{
274             // pre-compile patterns since they will be frequently used
275
blacklistRegex.add(Pattern.compile(rule));
276         } else if (StringUtils.isNotEmpty(rule)) {
277             blacklistStr.add(rule);
278         }
279     }
280         
281     /** Read comment and try to parse out "Last update" value */
282     private void readComment(String JavaDoc str) {
283         int lastUpdatePos = str.indexOf(lastUpdateStr);
284         if (lastUpdatePos > -1) {
285             str = str.substring(lastUpdatePos + lastUpdateStr.length());
286             str = str.trim();
287             try {
288                 SimpleDateFormat JavaDoc sdf = new SimpleDateFormat JavaDoc("yyyy/MM/dd HH:mm:ss");
289                 lastModified = DateUtil.parse(str, sdf);
290             } catch (ParseException JavaDoc e) {
291                 mLogger.debug("ParseException reading " + str);
292             }
293         }
294     }
295        
296     /**
297      * Does the String argument match any of the rules in the built-in blacklist?
298      */

299     public boolean isBlacklisted(String JavaDoc str) {
300         return isBlacklisted(str, null, null);
301     }
302     
303     /**
304      * Does the String argument match any of the rules in the built-in blacklist
305      * plus additional blacklists provided by caller?
306      * @param str String to be checked against blacklist
307      * @param moreStringRules Additional string rules to consider
308      * @param moreRegexRules Additional regex rules to consider
309      */

310     public boolean isBlacklisted(
311          String JavaDoc str, List JavaDoc moreStringRules, List JavaDoc moreRegexRules) {
312         if (str == null || StringUtils.isEmpty(str)) return false;
313         
314         // First iterate over blacklist, doing indexOf.
315
// Then iterate over blacklistRegex and test.
316
// As soon as there is a hit in either case return true
317

318         // test plain String.indexOf
319
List JavaDoc stringRules = blacklistStr;
320         if (moreStringRules != null && moreStringRules.size() > 0) {
321             stringRules = new ArrayList JavaDoc();
322             stringRules.addAll(moreStringRules);
323             stringRules.addAll(blacklistStr);
324         }
325         if (testStringRules(str, stringRules)) return true;
326         
327         // test regex blacklisted
328
List JavaDoc regexRules = blacklistRegex;
329         if (moreRegexRules != null && moreRegexRules.size() > 0) {
330             regexRules = new ArrayList JavaDoc();
331             regexRules.addAll(moreRegexRules);
332             regexRules.addAll(blacklistRegex);
333         }
334         return testRegExRules(str, regexRules);
335     }
336
337     /**
338      * Test string only against rules provided by caller, NOT against built-in blacklist.
339      * @param str String to be checked against rules
340      * @param moreStringRules String rules to consider
341      * @param moreRegexRules Regex rules to consider
342      */

343     public static boolean matchesRulesOnly(
344         String JavaDoc str, List JavaDoc stringRules, List JavaDoc regexRules) {
345         if (testStringRules(str, stringRules)) return true;
346         return testRegExRules(str, regexRules);
347     }
348         
349     /** Test String against the RegularExpression rules. */
350     private static boolean testRegExRules(String JavaDoc str, List JavaDoc regexRules) {
351         boolean hit = false;
352         Pattern JavaDoc testPattern = null;
353         Iterator JavaDoc iter = regexRules.iterator();
354         while (iter.hasNext()) {
355             testPattern = (Pattern JavaDoc)iter.next();
356             
357             // want to see what it is matching on, but only in debug mode
358
if (mLogger.isDebugEnabled()) {
359                 Matcher JavaDoc matcher = testPattern.matcher(str);
360                 if (matcher.find()) {
361                     mLogger.debug(matcher.group()
362                          + " matched by " + testPattern.pattern());
363                     return true;
364                 }
365             } else {
366                 if (testPattern.matcher(str).find()) {
367                     return true;
368                 }
369             }
370         }
371         return hit;
372     }
373     
374     /** Test the String against the String rules, using simple indexOf. */
375     private static boolean testStringRules(String JavaDoc str, List JavaDoc stringRules) {
376         String JavaDoc test;
377         Iterator JavaDoc iter = stringRules.iterator();
378         boolean hit = false;
379         while (iter.hasNext()) {
380             test = (String JavaDoc)iter.next();
381             if (str.indexOf(test) > -1) {
382                 // want to see what it is matching on, but only in debug mode
383
if (mLogger.isDebugEnabled()) {
384                     mLogger.debug("matched:" + test + ":");
385                 }
386                 return true;
387             }
388         }
389         return hit;
390     }
391     
392     /** Utility method to populate lists based a blacklist in string form */
393     public static void populateSpamRules(
394         String JavaDoc blacklist, List JavaDoc stringRules, List JavaDoc regexRules, String JavaDoc addendum) {
395         String JavaDoc weblogWords = blacklist;
396         weblogWords = (weblogWords == null) ? "" : weblogWords;
397         String JavaDoc siteWords = (addendum != null) ? addendum : "";
398         StringTokenizer JavaDoc toker = new StringTokenizer JavaDoc(siteWords + weblogWords,"\n");
399         while (toker.hasMoreTokens()) {
400             String JavaDoc token = toker.nextToken().trim();
401             if (token.startsWith("#")) continue;
402             if (token.startsWith("(")) {
403                 regexRules.add(Pattern.compile(token));
404             } else {
405                 stringRules.add(token);
406             }
407         }
408     }
409         
410     /** Return pretty list of String and RegEx rules. */
411     public String JavaDoc toString() {
412         StringBuffer JavaDoc buf = new StringBuffer JavaDoc("blacklist ");
413         buf.append(blacklistStr).append("\n");
414         buf.append("Regex blacklist ").append(blacklistRegex);
415         return buf.toString();
416     }
417 }
418
Popular Tags