KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > columba > mail > spam > MacchiatoPlugin


// The contents of this file are subject to the Mozilla Public License Version
// 1.1
//(the "License"); you may not use this file except in compliance with the
//License. You may obtain a copy of the License at http://www.mozilla.org/MPL/
//
//Software distributed under the License is distributed on an "AS IS" basis,
//WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
//for the specific language governing rights and
//limitations under the License.
//
//The Original Code is "The Columba Project"
//
//The Initial Developers of the Original Code are Frederik Dietz and Timo
// Stich.
//Portions created by Frederik Dietz and Timo Stich are Copyright (C) 2003.
//
//All Rights Reserved.
package org.columba.mail.spam;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.swing.JOptionPane;

import org.columba.core.config.DefaultConfigDirectory;
import org.columba.core.gui.frame.FrameManager;
import org.columba.core.io.CloneStreamMaster;
import org.columba.core.logging.Logging;
import org.columba.mail.folder.IMailbox;
import org.columba.mail.spam.command.CommandHelper;
import org.columba.mail.spam.rules.RuleList;
import org.columba.ristretto.message.Header;
import org.macchiato.DBWrapper;
import org.macchiato.Message;
import org.macchiato.SpamFilter;
import org.macchiato.SpamFilterImpl;
import org.macchiato.db.FrequencyDB;
import org.macchiato.db.MD5SumHelper;
import org.macchiato.db.berkleydb.BerkleyFrequencyDBImpl;
import org.macchiato.log.MacchiatoLogger;
import org.macchiato.maps.ProbabilityMap;

49 /**
50  * Built-in spam filter using the Macchiato library.
51  * <p>
52  * Note, that its necessary for this filter to train a few hundred messages,
53  * before its starting to work. I'm usually starting with around 1000 messages
54  * while keeping it up-to-date with messages which are scored wrong.
55  * <p>
56  * If training mode is enabled, the spam filter automatically adds messages to
57  * its frequency database.
58  *
59  * @author fdietz
60  */

61 public class MacchiatoPlugin implements ISpamPlugin {
62
63     /** JDK 1.4+ logging framework logger, used for logging. */
64     private static final Logger JavaDoc LOG = Logger
65             .getLogger("org.columba.core.gui.htmlviewer");
66
67     /**
68      * Delete messages from DB, if DB size > THRESHOLD
69      */

70     public final static int THRESHOLD = 200000;
71
72     /**
73      * Delete messages from DB after 7 days, if they don't affect the scoring
74      * process because of low occurences.
75      */

76     public final static int AGE = 7;
77
78     /**
79      * spam filter in macchiator library doing the actual work
80      */

81     private SpamFilter filter;
82
83     /**
84      * database of tokens, storing occurences of tokens, etc.
85      */

86     private FrequencyDB db;
87
88     /**
89      * file to store the token database
90      */

91     private File JavaDoc file;
92
93     /**
94      * dirty flag for database changes
95      */

96     private boolean hasChanged = false;
97
98     /**
99      * is cache already loaded?
100      */

101     private boolean alreadyLoaded = false;
102
103     /**
104      *
105      */

106     public MacchiatoPlugin() {
107         // create directory <config-folder>/mail/spamdb
108
File JavaDoc configDirectory = DefaultConfigDirectory.getInstance().getCurrentPath();
109         File JavaDoc mailDirectory = new File JavaDoc(configDirectory, "mail");
110         file = new File JavaDoc(mailDirectory, "spamdb");
111         if (!file.exists())
112             file.mkdir();
113         db = new DBWrapper(new BerkleyFrequencyDBImpl(file));
114
115         filter = new SpamFilterImpl(db);
116
117         // make Columba logger parent of macchiato logger
118
MacchiatoLogger.setParentLogger(Logger
119                 .getLogger("org.columba.mail.spam"));
120
121     }
122
123     /**
124      * Score message. Using a threshold of 90% here. Every message with at least
125      * 90% is spam. This value should be increased in the future.
126      *
127      * @see org.columba.mail.spam.ISpamPlugin#scoreMessage(org.columba.mail.folder.IMailbox,
128      * java.lang.Object)
129      */

130     public boolean scoreMessage(IMailbox mailbox, Object JavaDoc uid) throws Exception JavaDoc {
131         // load database from file
132
load();
133
134         // get inputstream of message body
135
InputStream JavaDoc istream = CommandHelper.getBodyPart(mailbox, uid);
136
137         // we are using this inpustream multiple times
138
// --> istream will be closed by CloneStreamMaster
139
CloneStreamMaster master = new CloneStreamMaster(istream);
140
141         // get stream
142
istream = master.getClone();
143
144         // apply additional handcrafted rules
145
ProbabilityMap map = RuleList.getInstance().getProbabilities(mailbox,
146                 uid);
147
148         float score = filter.scoreMessage(new Message(istream), map);
149
150         return score >= 0.9f;
151     }
152
153     /**
154      * @see org.columba.mail.spam.ISpamPlugin#trainMessageAsSpam(org.columba.mail.folder.IMailbox,
155      * java.lang.Object)
156      */

157     public void trainMessageAsSpam(IMailbox mailbox, Object JavaDoc uid)
158             throws Exception JavaDoc {
159         // get inputstream of message body
160
InputStream JavaDoc istream = CommandHelper.getBodyPart(mailbox, uid);
161
162         // get headers
163
Header h = mailbox.getHeaderFields(uid, Message.HEADERFIELDS);
164
165         // put headers in list
166
Enumeration JavaDoc e = h.getKeys();
167         List JavaDoc list = new ArrayList JavaDoc();
168
169         while (e.hasMoreElements()) {
170             String JavaDoc key = (String JavaDoc) e.nextElement();
171             list.add(h.get(key));
172         }
173
174         // load database from file
175
load();
176
177         try {
178             CloneStreamMaster master = new CloneStreamMaster(istream);
179             InputStream JavaDoc inputStream = master.getClone();
180
181             byte[] md5sum = MD5SumHelper.createMD5(inputStream);
182             // close stream
183
inputStream.close();
184
185             // get new inputstream
186
inputStream = master.getClone();
187
188             Message message = new Message(inputStream, list, md5sum);
189             // check if this message was already learned
190
// -> only add if this is not the case
191
if (db.MD5SumExists(md5sum)) {
192                 // message already exists
193
// --> correct token data
194
filter.correctMessageAsSpam(message);
195             } else {
196                 // new message
197
filter.trainMessageAsSpam(message);
198             }
199
200             // close stream
201
inputStream.close();
202
203             // set dirty flag
204
hasChanged = true;
205         } catch (IOException JavaDoc e1) {
206             LOG.severe(e1.getMessage());
207             if (Logging.DEBUG)
208                 e1.printStackTrace();
209         } catch (NoSuchAlgorithmException JavaDoc nsae) {
210         } // does not occur
211

212     }
213
214     /**
215      * @see org.columba.mail.spam.ISpamPlugin#trainMessageAsHam(org.columba.mail.folder.IMailbox,
216      * java.lang.Object)
217      */

218     public void trainMessageAsHam(IMailbox mailbox, Object JavaDoc uid)
219             throws Exception JavaDoc {
220         // get inputstream of message body
221
InputStream JavaDoc istream = CommandHelper.getBodyPart(mailbox, uid);
222
223         // get headers
224
Header h = mailbox.getHeaderFields(uid, Message.HEADERFIELDS);
225
226         // put headers in list
227
Enumeration JavaDoc e = h.getKeys();
228         List JavaDoc list = new ArrayList JavaDoc();
229
230         while (e.hasMoreElements()) {
231             String JavaDoc key = (String JavaDoc) e.nextElement();
232             list.add(h.get(key));
233         }
234
235         // load database from file
236
load();
237
238         try {
239             CloneStreamMaster master = new CloneStreamMaster(istream);
240             InputStream JavaDoc inputStream = master.getClone();
241
242             byte[] md5sum = MD5SumHelper.createMD5(inputStream);
243             // close stream
244
inputStream.close();
245
246             // get new inputstream
247
inputStream = master.getClone();
248             Message message = new Message(inputStream, list, md5sum);
249
250             // check if this message was already learned
251
if (db.MD5SumExists(md5sum)) {
252                 // message already exists
253

254                 // --> correct token data
255
filter.correctMessageAsHam(message);
256             } else {
257                 // new message
258

259                 filter.trainMessageAsHam(message);
260             }
261
262             // close stream
263
inputStream.close();
264
265             // set dirty flag
266
hasChanged = true;
267         } catch (IOException JavaDoc e1) {
268             LOG.severe(e1.getMessage());
269             if (Logging.DEBUG)
270                 e1.printStackTrace();
271         } catch (NoSuchAlgorithmException JavaDoc nsae) {
272         } // does not occur
273

274     }
275
276     /**
277      * @see org.columba.mail.spam.ISpamPlugin#save()
278      */

279     public void save() {
280         try {
281             // only save if changes exist
282
if (alreadyLoaded && hasChanged) {
283                 // cleanup DB -> remove old tokens
284
db.cleanupDB(THRESHOLD);
285
286                 // close DB
287
db.close();
288             }
289         } catch (Exception JavaDoc e) {
290             if (Logging.DEBUG) {
291                 e.printStackTrace();
292             }
293             // TODO (@author fdietz): i18n
294
int value = JOptionPane.showConfirmDialog(FrameManager.getInstance()
295                     .getActiveFrame(),
296                     "An error occured while saving the spam database.\n"
297                             + "Try again?", "Error saving database",
298                     JOptionPane.YES_NO_OPTION, JOptionPane.WARNING_MESSAGE);
299             if (value == JOptionPane.YES_OPTION) {
300                 save();
301             }
302         }
303
304     }
305
306     /**
307      * @see org.columba.mail.spam.ISpamPlugin#load()
308      */

309     public void load() {
310         /*
311          * try { // only load if necessary if (!alreadyLoaded && file.exists()) {
312          * FrequencyIO.load(db, file); }
313          *
314          * alreadyLoaded = true; } catch (IOException e) {
315          * JOptionPane.showMessageDialog(
316          * MainInterface.frameModel.getActiveFrame(), "An error occured while
317          * loading the spam database.\n" + "I will use an empty one.", "Error
318          * loading database", JOptionPane.ERROR_MESSAGE); if
319          * (MainInterface.DEBUG) { e.printStackTrace(); } // fail-case db = new
320          * FrequencyDBImpl();
321          *
322          * alreadyLoaded = true; }
323          */

324     }
325
326 }
327
Popular Tags