package SnowMailClient.SpamFilter;

import SnowMailClient.*;
import SnowMailClient.model.*;
import SnowMailClient.model.multipart.*;
import snow.utils.storage.*;
import SnowMailClient.utils.TextSearch.EditDistance;
import snow.utils.gui.*;
import SnowMailClient.html.HTMLTextExtractor;

import java.util.*;
import java.util.regex.*;
import java.io.*;
import java.text.DecimalFormat;


/** A naive Bayesian word statistic for spam filtering.
    Read Paul Graham's "A Plan for Spam" (2002) and its 2003 follow-up
    article for the background and the formulas used here.
*/
public final class WordStatistic implements Vectorizable
{
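  /* Typical usage (an illustrative sketch, not from the original sources;
     trainingMails, mail and the SpamResult accessor are assumed names):

       WordStatistic stat = new WordStatistic();
       for(MailMessage m : trainingMails) stat.addMessageToStat(m); // each mail flagged SPAM or HAM
       stat.buildStatistics(null);   // computes the per-word probabilities
       SpamResult res = stat.calculateSpamProbability(mail);
       boolean spam = WordStatistic.isSpam(res.getProbability()); // hypothetical accessor
  */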
  // parameters

  private static final int NUMBER_OF_WORD_TO_CONSIDER_FOR_A_MESSAGE = 17;
  private static final int NUMBER_OF_WORD_TO_CONSIDER_FOR_A_HEADER = 13;
  private static final double SPAM_PROBABILITY_LIMIT = 0.95;
  boolean considerOnlyOneOccurenceOfEachWord = true;

  private int maxDistanceApproximateSearch = 0; // edit-distance tolerance, 0 = disabled (the search is slow)

  private int foundWithTolerance1 = 0;


  // stored data
  //

  private int nSpamMails = 0;
  private int nHamMails = 0;
  private int nUnclassedMails = 0;

  // quick access (key = word)
  private Map<String,Word> wordsHashtable = new Hashtable<String,Word>();

  /** quick access based on word lengths;
      very efficient for tolerant search (only the buckets whose length is
      within the allowed distance are scanned)
  */
  private Map<Integer, Hashtable<String,Word>> wordsHashtableForLength = new Hashtable<Integer, Hashtable<String,Word>>();

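  // Example of the dual index: after addWord on "free" (length 4) and "FREE#" (length 5):
  //   wordsHashtable          = { "free" -> ..., "FREE#" -> ... }
  //   wordsHashtableForLength = { 4 -> { "free" -> ... }, 5 -> { "FREE#" -> ... } }
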
  // stats

  public int result_false_positives = 0;
  public int result_detected_spams = 0;

  public int result_false_positives_header = 0;
  public int result_detected_spams_header = 0;

  private StringBuffer falsePositivesBuffer = new StringBuffer();
  public void addFalsePositive(String ref)
  {
    falsePositivesBuffer.append("\r\n "+ref);
  }

  public int getNumberOfWords()
  {
    return wordsHashtable.size();
  }

  /** all words must be added through this method, so that both indexes stay in sync
  */
  private void addWord(Word word)
  {
    wordsHashtable.put(word.word, word);
    if(!this.wordsHashtableForLength.containsKey(word.word.length()))
    {
      wordsHashtableForLength.put(word.word.length(), new Hashtable<String, Word>());
    }

    wordsHashtableForLength.get(word.word.length()).put(word.word, word);
  }

  /** all words must be removed through this method, so that both indexes stay in sync
  */
  private void removeWord(Word word)
  {
    wordsHashtable.remove(word.word);
    wordsHashtableForLength.get(word.word.length()).remove(word.word);
  }

  /** used in the stat viewer
  */
  public Map<String,Word> getAllWords() { return wordsHashtable; }


  /** @return null if not found */
  public Word getWordExact(String word)
  {
    return wordsHashtable.get(word);
  }

  int notFoundWords = 0;

  /** Tries alternatives when the exact word is not found:
      the normalized form, the form with/without a trailing '#',
      the case variants, the same lookups without the prefix, and finally
      (if enabled) a tolerant edit-distance search.
  */
  public Word getWord_using_alternatives(final String prefix, final String word)
  {
    if(word.length()==0)
    {
       System.out.println("Zero length word !");
       return null;
    }
    String nw = normalize(word);
    Word w = wordsHashtable.get(prefix + nw);
    if(w!=null) return w;

    // alternatives
    if(nw.endsWith("#"))
    {
      // without the trailing '#'
      w = wordsHashtable.get(prefix + nw.substring(0,nw.length()-1));
      if(w!=null) return w;

      w = tryAlternativeCases(prefix, nw.substring(0,nw.length()-1));
      if(w!=null) return w;
    }
    else
    {
      w = wordsHashtable.get(prefix + nw+"#");
      if(w!=null) return w;

      w = tryAlternativeCases(prefix, nw+"#");
      if(w!=null) return w;
    }

    w = tryAlternativeCases(prefix, nw);
    if(w!=null) return w;

    // try all without prefix
    if(prefix.length()>0)
    {
      return getWord_using_alternatives("", word);
    }
    // else

    // try with edit distance (SLOW!)
    if(maxDistanceApproximateSearch>0)
    {
      w = searchWord_with_tolerance(word, maxDistanceApproximateSearch);
      if(w!=null) return w;
    }

    // not found...
    return null;
  }
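  // Illustrative lookup trace (hypothetical token, assuming WordTokenizer keeps
  // it as-is): a search for ("subject*", "FREE!!!") first tries "subject*FREE#"
  // (normalize collapses the run of '!' into a trailing '#'), then "subject*FREE",
  // then case variants such as "subject*free#", then the whole chain again
  // without the prefix, and finally, if maxDistanceApproximateSearch > 0, the
  // slow edit-distance scan.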



  /** applies an edit-distance algorithm to find approximate matches
  */
  private Word searchWord_with_tolerance(String word, int maxDist)
  {
    String wordUP = word.toUpperCase();

    int wlen = wordUP.length();
    if(wlen>15) return null; // too long: the scan is costly and a match is unlikely!

    // look only at the potentially matching lengths!
    for(int i=wlen-maxDist; i<=wlen+maxDist; i++)
    {
       if(wordsHashtableForLength.containsKey(i))
       {
          // all the words of length i
          Hashtable<String, Word> wordsi = wordsHashtableForLength.get(i);
          Collection<Word> s = wordsi.values();

          for(Word w: s)
          {
             int d = EditDistance.editDistance(wordUP, w.word.toUpperCase(), maxDist);
             if(d<=maxDist)
             {
                System.out.println(""+w.word+" is similar to "+word);
                foundWithTolerance1 ++;
                return w;
             }
          }
       }
    }

    //System.out.println("No similar word found for "+word);
    return null;
  }
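  // Example: with maxDist = 1, an obfuscated token like "V1AGRA" lies within one
  // edit (a single substitution) of a stored "VIAGRA", so the lookup falls back
  // to that word's statistics; only the length buckets wlen-1..wlen+1 are scanned.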



  /** tries the word in alternative cases: BUG, bug, Bug
  */
  private Word tryAlternativeCases(String prefix, String nw)
  {
    // p*BUG
    Word w = wordsHashtable.get(prefix + nw.toUpperCase());
    if(w!=null) return w;

    // p*bug
    w = wordsHashtable.get(prefix + nw.toLowerCase());
    if(w!=null) return w;

    if(nw.length()>1)
    {
      // p*Bug
      w = wordsHashtable.get(prefix + Character.toUpperCase(nw.charAt(0)) + nw.substring(1).toLowerCase());
      if(w!=null) return w;
    }
    return null;
  }


  public int getNumberOfHAMMails() { return nHamMails; }
  public int getNumberOfSPAMMails() { return nSpamMails; }

  public void deleteStatistic()
  {
    wordsHashtable.clear();
    this.wordsHashtableForLength.clear();

    nSpamMails = 0;
    nHamMails = 0;
    nUnclassedMails = 0;
    result_false_positives = 0;
    result_detected_spams = 0;
    result_false_positives_header = 0;
    result_detected_spams_header = 0;
    foundWithTolerance1 = 0;

    falsePositivesBuffer.setLength(0);
  }


  public String toStringStat()
  {
    StringBuffer sb = new StringBuffer();
    sb.append("\r\nStatistic has "+wordsHashtable.size()+" words");
    sb.append("\r\n\r\n Number of HAM mails = " + nHamMails);
    sb.append("\r\n Number of SPAM mails = " + nSpamMails);
    sb.append("\r\n Number of unclassed mails = "+nUnclassedMails);
    sb.append("\r\n\r\nDetection results");
    sb.append("\r\n Number of detected spams = " + this.result_detected_spams);
    if(nSpamMails>0)
    {
       DecimalFormat df = new DecimalFormat("00.00");
       double x = (double) result_detected_spams/nSpamMails*100.0;
       sb.append( " ("+ df.format(x)+"%)");
    }
    sb.append("\r\n Number of false positives = " + this.result_false_positives);
    if(nHamMails>0)
    {
       DecimalFormat df = new DecimalFormat("0.0000");
       double x = (double) result_false_positives/nHamMails*100.0;
       sb.append( " ("+ df.format(x)+"%)");
    }

    sb.append("\r\n\r\nDetection results using only the header");
    sb.append("\r\n Number of detected spams = " + this.result_detected_spams_header);
    sb.append("\r\n Number of false positives = " + this.result_false_positives_header);

    sb.append("\r\n\r\n Words not found = "+this.notFoundWords);
    sb.append("\r\n Found with tolerance "+maxDistanceApproximateSearch+" = " + foundWithTolerance1);

    /* Debug
    sb.append("\r\n\r\n"+this.wordsHashtableForLength.size()+" different word lengths");
    Set<Integer> keys = wordsHashtableForLength.keySet();
    for(Integer i: keys)
    {
       Hashtable ht = wordsHashtableForLength.get(i);
       sb.append("\r\n length "+i+": "+ht.size()+" elements");
    }*/


    if(falsePositivesBuffer.length()>0)
    {
      sb.append("\r\n\r\nFalse positives:");
      sb.append(falsePositivesBuffer.toString());
    }

    return sb.toString();
  }


  /** Call this after all mails have been added:
      it computes the per-word probabilities and counts the entries with a
      non-discriminating stat (ratio near 0.5) and the rare words.

      This is also called after the stats have been recreated from storage:
      to save space, the words are only stored with their counts, so the
      probabilities must be recomputed at each startup (this is very fast).
  */
  public void buildStatistics(ProgressModalDialog progress)
  {

    int nSingles = 0;
    int nNotSeparators = 0;

    // 1: count the rare words
    for(Word w: wordsHashtable.values())
    {
      if(w.getSpamOccurences() + w.getHamOccurences() < 5)
      {
        nSingles++;
      }
    }

    // System.out.println("Build stat "+this.nHamMails+" / "+this.nSpamMails);

    // 2: calculate the probabilities and collect the non-significant words
    Vector<Word> toRemove = new Vector<Word>();
    for(Word w: wordsHashtable.values())
    {
       w.calculateProbs(this.nHamMails, this.nSpamMails);

       if(w.getSpamProb()>0.47 && w.getSpamProb()<0.53)
       {
          //toRemove.add(w); // ConcurrentModificationException if removed directly!
          nNotSeparators++;
       }
    }

    for(Word w: toRemove)
    {
       removeWord(w); // remove from both indexes
    }

    System.out.println("Number of words not discriminating: "+nNotSeparators+", number appearing less than 5 times: "+nSingles);
  }
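  // Note: Word.calculateProbs is not shown in this file. In Graham's
  // "A Plan for Spam" the per-word spam probability is, approximately,
  //
  //   p(w) = clamp( (spam(w)/nSpamMails) /
  //                 (2*ham(w)/nHamMails + spam(w)/nSpamMails), 0.01, 0.99 )
  //
  // with the ham count doubled to bias against false positives; whether Word
  // uses exactly this formula depends on its implementation.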

  public static boolean isSpam(double messProb)
  {
    return messProb>SPAM_PROBABILITY_LIMIT;
  }

  /** @return a spam probability in [0..1], computed from the header only
  */
  public SpamResult calculateSpamProbability(Header head)
  {
    // search for words
    Vector<Word> foundWords = new Vector<Word>();
    for(int i=0; i<head.getEntriesCount(); i++)
    {
      HeaderEntry he = head.getEntryAt(i);
      scanWords(foundWords, he.getKey().toLowerCase()+"*", he.getValue());
    }

    // keep only the most relevant words
    return calculateSPAMProbability(foundWords, NUMBER_OF_WORD_TO_CONSIDER_FOR_A_HEADER);
  }



  /** @return a spam probability in [0..1]
  */
  public SpamResult calculateSpamProbability(MailMessage mess)
  {
    // 1) search for words

    Vector<Word> foundWords = new Vector<Word>();

    // in the header
    Header head = mess.getHeader();
    for(int i=0; i<head.getEntriesCount(); i++)
    {
      HeaderEntry he = head.getEntryAt(i);
      scanWords(foundWords, he.getKey().toLowerCase()+"*", he.getValue());
    }

    // in the message body
    if(MimeUtils.isMultipart(mess))
    {
       // in the mime parts
       MimeTreeModel mimeTree = mess.getMimeTree();
       MimePart mp = mimeTree.getRootPart();
       scanWordsRecurse(foundWords, mp);
    }
    else
    {
       this.scanWords(foundWords, "", mess.getMessageBody());
    }

    // 2) keep only the most relevant words
    return calculateSPAMProbability(foundWords, NUMBER_OF_WORD_TO_CONSIDER_FOR_A_MESSAGE);
  }



  private SpamResult calculateSPAMProbability(Vector<Word> allWords, int numberOfMostRelevantWordsToConsider)
  {
    // search for the most relevant words
    Vector<Word> mostRelevantWords = new Vector<Word>();
    Collections.sort(allWords, new Word.RelevanceComparator());

    String lastWord = "";
    if(allWords.size()>numberOfMostRelevantWordsToConsider)
    {
      int max = numberOfMostRelevantWordsToConsider*14;
      if(max>allWords.size()) max = allWords.size();

      for(int i=0; i<max; i++)
      {
        Word wi = allWords.get(i);
        if(wi.word.equals(lastWord) && considerOnlyOneOccurenceOfEachWord)
        {
          // skip repeated occurrences of the same word (the vector is sorted, so duplicates are adjacent)
        }
        else
        {
          mostRelevantWords.add( wi );
          if(mostRelevantWords.size()>=numberOfMostRelevantWordsToConsider)
          {
            break;
          }
          lastWord = wi.word;
        }
      }
    }

    double probSPAM_log = 0;
    double probHAM_log = 0;

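    // Combine the individual word probabilities with the naive combination rule
    // from Graham's article, computed in log space to limit floating-point underflow:
    //
    //   p = (p1*...*pn) / ( p1*...*pn + (1-p1)*...*(1-pn) )
    //
    // Worked example with word probabilities {0.99, 0.99, 0.20}:
    //   spam product = 0.196, ham product = 0.00008  =>  p ~ 0.9996 (spam).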
    for(Word wi: mostRelevantWords)
    {
      probSPAM_log += Math.log(wi.getSpamProb());
      probHAM_log += Math.log(1.0 - wi.getSpamProb());
    }

    double p = Math.exp(probSPAM_log) / ( Math.exp(probSPAM_log) + Math.exp(probHAM_log) );

    return new SpamResult(allWords, mostRelevantWords, p);
  }



  /** @param prefix is prepended to each word, e.g. "subject*";
         used to namespace the header & tag words in the stat and
         distinguish them from the body words.
  */
  private void scanWords(Vector<Word> foundWords, String prefix, String text)
  {
    String[] w = WordTokenizer.extractWords(text);

    for(int i=0; i<w.length; i++)
    {
      Word word = this.getWord_using_alternatives(prefix, w[i]);
      if(word!=null)
      {
        foundWords.add(word);
      }
      else
      {
        this.notFoundWords++;
        // word not found: unknown words get a default spam prob of 0.4
        Word wo = new Word(prefix+normalize(w[i]));
        wo.setSpamProb(0.4);
        foundWords.add(wo);

        //System.out.println("Not found: "+wo.word);
      }
    }
  }

  private void scanHTML(Vector<Word> foundWords, String htmlText)
  {
    try
    {
      // 1) html text content
      HTMLTextExtractor he = new HTMLTextExtractor(htmlText, false);
      String[] w = WordTokenizer.extractWords(he.getTextOnly());

      for(int i=0; i<w.length; i++)
      {
        Word word = this.getWord_using_alternatives("", w[i]);
        if(word!=null)
        {
          foundWords.add(word);
        }
        else
        {
          // word not found: default spam prob of 0.4
          this.notFoundWords++;
          Word wo = new Word(normalize(w[i]));
          wo.setSpamProb(0.4);
          foundWords.add(wo);
        }
      }

      // 2) unknown tags
      Vector<String> tags = he.getUnknownTags();
      for(int i=0; i<tags.size(); i++)
      {
        String[] wt = WordTokenizer.extractWords( tags.elementAt(i));
        for(int t=0; t<wt.length; t++)
        {
          String wtt = normalize(wt[t]);
          Word word = this.getWord_using_alternatives("tag*", wtt);
          if(word!=null)
          {
            foundWords.add(word);
          }
          else
          {
            this.notFoundWords++;
            // unknown (invalid!) tags have a predisposition to be spam
            Word wo = new Word("tag*"+wtt);
            wo.setSpamProb(0.95);
            foundWords.add(wo);
          }
        }
      }

      // 3) hrefs
      Vector<String> hrefs = he.getLinksHREFs();
      for(int i=0; i<hrefs.size(); i++)
      {
        this.scanWords(foundWords, "href*", hrefs.elementAt(i));
      }

      // 4) images
      Vector<String> images = he.getImageSrcs();
      for(int i=0; i<images.size(); i++)
      {
        this.scanWords(foundWords, "img*", images.elementAt(i));
      }
    }
    catch(Exception e)
    {
      e.printStackTrace();
    }
  }
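  // Note on the prefixes: the same token becomes a different feature depending on
  // where it occurs. "free" in the body text is the feature "free"; in a link it
  // becomes "href*free", in an image source "img*free", in an unrecognized tag
  // "tag*free", and in a header it is namespaced by the header key, e.g.
  // "subject*free" (see the calculateSpamProbability methods).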


  private void scanWordsRecurse(Vector<Word> foundWords, MimePart part)
  {
    if(part.isLeaf())
    {
      if(part.getContentTYPE() == MimePart.ContentType.TEXT)
      {
        if(part.lookIfContentIsHTML())
        {
          this.scanHTML(foundWords, part.getBodyAsText());
        }
        else
        {
          this.scanWords(foundWords, "", part.getBodyAsText());
        }
      }
      else
      {
        // non-text part: ignored
      }
    }
    else
    {
      for(int i=0; i<part.getChildCount(); i++)
      {
        scanWordsRecurse(foundWords, part.getPartAt(i));
      }
    }
  }



  /** used to train the filter:
      adds the words of the message to the stat.
      Looks in the header, the body and the attachments...
  */
  public void addMessageToStat(MailMessage mess)
  {
    // is it spam or not??
    boolean isSPAM = mess.getIsSPAM();

    // if neither spam nor ham, the unclassified message doesn't contribute to the stat
    if( !isSPAM && !mess.getIsHAM() )
    {
      nUnclassedMails++;
      return;
    }



    // for the global counting
    if(isSPAM)
    {
      nSpamMails++;
    }
    else
    {
      nHamMails++;
    }

    // add the header
    Header head = mess.getHeader();
    for(int i=0; i<head.getEntriesCount(); i++)
    {
      HeaderEntry he = head.getEntryAt(i);
      addTextToStat(he.getKey().toLowerCase()+"*", he.getValue(), isSPAM );
    }

    // add multipart & attachment (###)
    if(MimeUtils.isMultipart(mess))
    {
      MimeTreeModel mimeTree = mess.getMimeTree();
      MimePart mp = mimeTree.getRootPart();
      addMimePartToStatRecurse(mp, isSPAM);
    }
    else
    {
      addTextToStat("", mess.getMessageBody(), isSPAM);
    }
  }


  private int minWordLength = 1;

  /** adds normal unformatted text to the stat.
  */
  private void addTextToStat(String prefix, String text, boolean isSPAM)
  {
    String[] w = WordTokenizer.extractWords(text);

    for(int i=0; i<w.length; i++)
    {
      String wi = prefix+normalize(w[i]);
      Word word = this.getWordExact(wi);
      if(word==null)
      {
        word = new Word(wi);
        if(word.word.length() >= minWordLength)
        {
          addWord(word);
        }
      }

      if(isSPAM)
      {
        word.addSpamOccurence();
      }
      else
      {
        word.addHamOccurence();
      }
    }
  }

  /** adds HTML formatted text to the stat.
  */
  private void addHTMLToStat(String htmlText, boolean isSPAM)
  {
    try
    {
      HTMLTextExtractor he = new HTMLTextExtractor(htmlText, false);
      this.addTextToStat("", he.getTextOnly(), isSPAM);

      // bad tags
      Vector<String> ut = he.getUnknownTags();
      for(int i=0; i<ut.size(); i++)
      {
         this.addTextToStat("tag*", ut.elementAt(i), isSPAM);
      }

      // links
      Vector<String> refs = he.getLinksHREFs();
      for(int i=0; i<refs.size(); i++)
      {
         this.addTextToStat("href*", refs.elementAt(i), isSPAM);
      }

      // images
      Vector<String> images = he.getImageSrcs();
      for(int i=0; i<images.size(); i++)
      {
         this.addTextToStat("img*", images.elementAt(i), isSPAM);
      }

    }
    catch(Exception e)
    {
      e.printStackTrace(); // should not occur...
    }

    // ###
  }

  /** adds the text and HTML parts recursively...
  */
  private void addMimePartToStatRecurse(MimePart part, boolean isSPAM)
  {
    if(part.isLeaf())
    {
      // analyse the part
      if(part.getContentTYPE()== MimePart.ContentType.TEXT)
      {
        if(part.lookIfContentIsHTML())
        {
          this.addHTMLToStat(part.getBodyAsText(), isSPAM);
        }
        else
        {
          this.addTextToStat("", part.getBodyAsText(), isSPAM);
        }
      }
      else
      {
        // not text...
        // ### add attachment name, ...
      }
    }
    else
    {
      // recurse
      for(int i=0; i<part.getChildCount(); i++)
      {
        addMimePartToStatRecurse(part.getPartAt(i), isSPAM);
      }
    }
  }

  // Utils
  //

  public void exportStatToFile(File f) throws Exception
  {
    Vector<Word> words = new Vector<Word>( wordsHashtable.values() );
    Collections.sort(words);


    FileOutputStream fos = null;
    PrintWriter pw = null;
    try
    {
      fos = new FileOutputStream(f);
      pw = new PrintWriter(fos);

      pw.print(""+this.toStringStat()+"\n\n");

      for(int i=0; i<words.size(); i++)
      {
         Word wi = words.elementAt(i);
         pw.println(""+wi.word+"\t"+wi.getSpamProb()); //+"\t"+wi.occurencesInSPAM+"\t"+wi.occurencesInHAM);
      }
    }
    finally
    {
      if(pw!=null) pw.close();
      if(fos!=null) fos.close();
    }
  }

    /** Normalizes a word:
          Hello, hello: unchanged
          mixed case is canonicalized to upper case: hELLo => HELLO
          special characters are dropped and marked with a trailing '#': hello!!!!! => hello#
    */
    public static String normalize(String s)
    {
      if(s.length()<2) return s;

      StringBuffer normalized = new StringBuffer();

      boolean hasLow = false;
      boolean hasUp = false;
      boolean hasDigits = false;
      boolean hasSpecial = false;

      for(int i=1; i<s.length(); i++)
      {
        char ci = s.charAt(i);
        if(Character.isLowerCase(ci))
        {
          hasLow = true;
          normalized.append(ci);
        }
        else if(Character.isUpperCase(ci))
        {
          hasUp = true;
          normalized.append(ci);
        }
        else if(Character.isDigit(ci))
        {
          hasDigits = true;
          normalized.append(ci);
        }
        else if(ci=='.')
        {
          // keep the dots, they are constituents of IPs
          normalized.append(ci);
        }
        else
        {
          hasSpecial = true;
          // drop the special chars
        }
      }

      if(hasSpecial)
      {
        normalized.append('#');
      }

      // look at the first char

      if( Character.isUpperCase(s.charAt(0)) )
      {
        if(hasUp)
        {
          // BuG => BUG
          return s.charAt(0) + normalized.toString().toUpperCase();
        }
        else
        {
          // Bug
          return s.charAt(0) + normalized.toString();
        }
      }
      else if( Character.isLowerCase(s.charAt(0)) )
      {
        if(hasUp)
        {
          // mixed case like bUg => BUG
          return Character.toUpperCase(s.charAt(0)) + normalized.toString().toUpperCase();
        }
        else
        {
          // bug
          return s.charAt(0) + normalized.toString();
        }
      }
      else
      {
         // first char is neither upper nor lower case: fall through
      }


      return s.charAt(0) + normalized.toString();
    }

  // Vectorizable
  //

  public Vector<Object> getVectorRepresentation() throws VectorizeException
  {
     Vector<Object> v = new Vector<Object>();

     v.addElement(2); // 0: version

     String[] wl = new String[wordsHashtable.size()];
     int[] occHam = new int[wordsHashtable.size()];
     int[] occSpam = new int[wordsHashtable.size()];

     int i=0;
     for(Word w : wordsHashtable.values())
     {
        wl[i] = w.word;
        occHam[i] = w.getHamOccurences();
        occSpam[i] = w.getSpamOccurences();
        i++;
     }
     v.addElement(wl);      // 1: the words
     v.addElement(occHam);  // 2: ham occurrences
     v.addElement(occSpam); // 3: spam occurrences
     v.addElement(this.nHamMails);       // 4 (storing raw arrays is much faster than making vectors of their elements!)
     v.addElement(this.nSpamMails);      // 5
     v.addElement(this.nUnclassedMails); // 6

     return v;
  }

  public void createFromVectorRepresentation(Vector<Object> v) throws VectorizeException
  {
    int version = (Integer)v.elementAt(0);
    if(version==2)
    {
      this.deleteStatistic();

      String[] wl = (String[]) v.elementAt(1);
      int[] occHam = (int[]) v.elementAt(2);
      int[] occSpam = (int[]) v.elementAt(3);
      if(v.size()>4)
      {
        nHamMails = (Integer) v.elementAt(4);
        nSpamMails = (Integer) v.elementAt(5);
      }
      if(v.size()>6)
      {
        nUnclassedMails = (Integer) v.elementAt(6);
      }

      for(int i=0; i<wl.length; i++)
      {
        Word w = new Word(wl[i], occHam[i], occSpam[i]);
        addWord(w);
      }

      buildStatistics(null);
    }
    else
    {
      throw new VectorizeException("Bad vectorized Word version "+version);
    }
  }
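  // The two methods above form the persistence round-trip: only the words with
  // their raw counts (plus the mail counters) are stored, and
  // createFromVectorRepresentation ends with buildStatistics(null) so that the
  // per-word probabilities are recomputed after loading.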

/* TODO: visually similar character recognition for approximate matching!
   IDEA 2: keyboard distance?
   IDEA 3: phonetic distance (soundex)?

   Classification of visually similar glyphs ( => automatic?):

   abcdefghijklmnopqrstuvwxyz1234567890
   ABCDEFGHIJKLMNOPQRSTUVWXYZ

   a c e o 0 O C D G Q
   b d h k
   f i l r s t I J L T 1 2 7
   g j p q y Y 9
   m w M W
   n u v N U V
   x z X Z
   4 5 6 7
   A B E F H K P R S 3 8
*/

} // WordStatistic