WordStatistic


1   package SnowMailClient.SpamFilter;
2   
3   import SnowMailClient.*;
4   import SnowMailClient.model.*;
5   import SnowMailClient.model.multipart.*;
6   import snow.utils.storage.*;
7   import SnowMailClient.utils.TextSearch.EditDistance;
8   import snow.utils.gui.*;
9   import SnowMailClient.html.HTMLTextExtractor;
10  
11  import java.util.*;
12  import java.util.regex.*;
13  import java.io.*;
14  import java.text.DecimalFormat  ;
15  
16  
17  /** Read "A Plan for SPAM" 2002 and 2003 articles for explanations
18  */
19  public final class WordStatistic implements Vectorizable
20  {
21    // parameters
22    private static final int NUMBER_OF_WORD_TO_CONSIDER_FOR_A_MESSAGE = 17;
23    private static final int NUMBER_OF_WORD_TO_CONSIDER_FOR_A_HEADER = 13;
24    private static final double SPAM_PROBABILITY_LIMIT = 0.95;
25    boolean considerOnlyOneOccurenceOfEachWord = true;
26  
27    private int maxDistanceApproximateSearch = 0;  // Slow
28    private int foundWithTolerance1 = 0;
29  
30  
31    // stored data
32    //
33  
34    private int nSpamMails = 0;
35    private int nHamMails = 0;
36    private int nUnclassedMails = 0;
37  
38    // quick access {key = word)
39    private Map<String  ,Word> wordsHashtable = new Hashtable<String  ,Word>();
40  
41    /** quick acces based on word lengths,
42        very efficient for tolerant search (search only on categories +-1 length)
43    */
44    private Map<Integer  , Hashtable<String  ,Word>> wordsHashtableForLength = new Hashtable<Integer  , Hashtable<String  ,Word>>();
45  
46    // stats
47    public int result_false_positives = 0;
48    public int result_detected_spams  = 0;
49  
50  
51    public int result_false_positives_header = 0;
52    public int result_detected_spams_header  = 0;
53  
54    private StringBuffer   falsePositivesBuffer = new StringBuffer  ();
55    public void addFalsePositive(String   ref)
56    {
57      falsePositivesBuffer.append("\r\n "+ref);
58    }
59  
60    public int getNumberOfWords()
61    {
62      return wordsHashtable.size();
63    }
64  
65    /** all words should be added here
66    */
67    private void addWord(Word word)
68    {
69      wordsHashtable.put(word.word, word);
70      if(!this.wordsHashtableForLength.containsKey(word.word.length()))
71      {
72        wordsHashtableForLength.put(word.word.length(), new Hashtable<String  , Word>());
73      }
74  
75      wordsHashtableForLength.get(word.word.length()).put(word.word, word);
76    }
77  
78    /** all words should be removed here
79    */
80    private void removeWord(Word word)
81    {
82      wordsHashtable.remove(word.word);
83      wordsHashtableForLength.get(word.word.length()).remove(word.word);
84    }
85  
86    /** used in the stat viewer
87    */
88    public Map<String  ,Word> getAllWords() { return wordsHashtable; }
89  
90  
91    /** null if not found */
92    public Word getWordExact(String   word)
93    {
94      return wordsHashtable.get(word);
95    }
96  
97    int notFoundWords = 0;
98  
99  
100   /** try alternatives if not found.
101    *  This
102   */
103   public Word getWord_using_alternatives(final String   prefix, final String   word)
104   {
105     if(word.length()==0)
106     {
107        System.out.println("Zero length word !");
108        return null;
109     }
110     String   nw = normalize(word);
111     Word w = wordsHashtable.get(prefix + nw);
112     if(w!=null) return w;
113 
114     // alternatives
115     if(nw.endsWith("#"))
116     {
117       w = wordsHashtable.get(prefix + nw.substring(0,nw.length()-2));
118       if(w!=null) return w;
119 
120       w = tryAlternativeCases(prefix, nw.substring(0,nw.length()-2));
121       if(w!=null) return w;
122     }
123     else
124     {
125       w = wordsHashtable.get(prefix + nw+"#");
126       if(w!=null) return w;
127 
128       w = tryAlternativeCases(prefix, nw+"#");
129       if(w!=null) return w;
130     }
131 
132     w = tryAlternativeCases(prefix, nw);
133     if(w!=null) return w;
134 
135 
136 
137     // try all without prefix
138     if(prefix.length()>0)
139     {
140       w = getWord_using_alternatives("", word);
141       if(w!=null) return w;
142       return null;
143     }
144     // else
145 
146     // try with edit distance (SLOW !)
147     //
148     if(maxDistanceApproximateSearch>0)
149     {
150       w = searchWord_with_tolerance(word, maxDistanceApproximateSearch);
151       if(w!=null) return w;
152     }
153 
154     // not found...
155     return null;
156   }
157 
158 
159 
160   /** apply an edit-distance algorithm to find approximate matches
161   */
162   private Word searchWord_with_tolerance(String   word, int maxDist)
163   {
164     String   wordUP = word.toUpperCase();
165 
166     int wlen = wordUP.length();
167     if(wlen>15) return null;   // too long and too small prob to find a match !
168 
169     // look only potential correct lengths !
170     //int i = wlen;
171     for(int i=wlen-maxDist;  i<=wlen+maxDist;  i++)
172     {
173        if(wordsHashtableForLength.containsKey(i))
174        {
175           // all the words of length i
176           Hashtable<String  , Word> wordsi = wordsHashtableForLength.get(i);
177           Collection<Word> s = wordsi.values();
178 
179           for(Word w: s)
180           {
181              int d = EditDistance.editDistance(wordUP, w.word.toUpperCase(), maxDist);
182              if(d<=maxDist)
183              {
184                 System.out.println(""+w.word+" is similar to "+word);
185                 foundWithTolerance1 ++;
186                 return w;
187              }
188           }
189        }
190     }
191 
192     //System.out.println("No similar word found for "+word);
193 
194     return null;
195   }
196 
197 
198   /**  Bug
199        bug
200        BUG
201   */
202   private Word tryAlternativeCases(String   prefix, String   nw)
203   {
204     // p*BUG
205     Word w = wordsHashtable.get(prefix + nw.toUpperCase());
206     if(w!=null) return w;
207 
208     // p*bug
209     w = wordsHashtable.get(prefix + nw.toLowerCase());
210     if(w!=null) return w;
211 
212     if(nw.length()>1)
213     {
214       // p*Bug
215       w = wordsHashtable.get(prefix + Character.toUpperCase(nw.charAt(0))+nw.substring(1,nw.length()-1).toLowerCase());
216       if(w!=null) return w;
217     }
218     return null;
219   }
220 
221 
222   public int getNumberOfHAMMails()   { return nHamMails; }
223   public int getNumberOfSPAMMails()  { return nSpamMails; }
224 
225   public void deleteStatistic()
226   {
227     wordsHashtable.clear();
228     this.wordsHashtableForLength.clear();
229 
230     nSpamMails = 0;
231     nHamMails = 0;
232     nUnclassedMails = 0;
233     result_false_positives = 0;
234     result_detected_spams = 0;
235     result_false_positives_header = 0;
236     result_detected_spams_header = 0;
237     foundWithTolerance1 = 0;
238 
239     falsePositivesBuffer.setLength(0);
240   }
241 
242 
243   public String   toStringStat()
244   {
245     StringBuffer   sb = new StringBuffer  ();
246     sb.append("\r\nStatistic has "+wordsHashtable.size()+" words");
247     sb.append("\r\n\r\n  Number of HAM mails = " + nHamMails);
248     sb.append("\r\n  Number of SPAM mails = " + nSpamMails);
249     sb.append("\r\n  Number of unclassed mails = "+nUnclassedMails);
250     sb.append("\r\n\r\nDetection results");
251     sb.append("\r\n  Number of detected spams = " + this.result_detected_spams);
252     if(nSpamMails>0)
253     {
254        DecimalFormat   df = new DecimalFormat  ("00.00");
255        double x = (double) result_detected_spams/nSpamMails*100.0;
256        sb.append( "  ("+ df.format(x)+"%)");
257     }
258     sb.append("\r\n  Number of false positives = " + this.result_false_positives);
259     if(nHamMails>0)
260     {
261        DecimalFormat   df = new DecimalFormat  ("0.0000");
262        double x = (double) result_false_positives/nHamMails*100.0;
263        sb.append( "  ("+ df.format(x)+"%)");
264     }
265 
266     sb.append("\r\n\r\nDetection results using only the header");
267     sb.append("\r\n  Number of detected spams = " + this.result_detected_spams_header);
268     sb.append("\r\n  Number of false positives = " + this.result_false_positives_header);
269 
270     sb.append("\r\n\r\n Words not found = "+this.notFoundWords);
271     sb.append("\r\n Found with tolerance "+maxDistanceApproximateSearch+" = " + foundWithTolerance1);
272 
273     /* Debug
274     sb.append("\r\n\r\n"+this.wordsHashtableForLength.size()+" different word lengths");
275     Set<Integer> keys = wordsHashtableForLength.keySet();
276     for(Integer i: keys)
277     {
278        Hashtable ht = wordsHashtableForLength.get(i);
279        sb.append("\r\n length "+i+": "+ht.size()+" elements");
280     }*/
281 
282 
283     if(falsePositivesBuffer.length()>0)
284     {
285       sb.append("\r\n\r\nFalse positives:");
286       sb.append(falsePositivesBuffer.toString());
287     }
288 
289     return sb.toString();
290   }
291 
292   /** call this after added all mails...
293     this will delete entries with bad stat (ratio near 0.5)
294     or rare words
295 
296     This is also called after a recreation of the stats, because to save storage,
297     the words are only stored with their counts, the probs need to be recomputed
298     at each startup (this is very fast)
299   */
300   public void buildStatistics(ProgressModalDialog progress)
301   {
302 
303     int nSingles = 0;
304     int nNotSeparators = 0;
305 
306     // 1: calculate totals
307     //
308     for(Word w: wordsHashtable.values())
309     {
310       if(w.getSpamOccurences() + w.getHamOccurences() < 5)
311       {
312         nSingles++;
313       }
314     }
315 
316     // System.out.println("Build stat "+this.nHamMails+" / "+this.nSpamMails);
317 
318     // 2: calc probs and remove the not signifiant ones
319     //
320     Vector<Word> toRemove = new Vector<Word>();
321     for(Word w: wordsHashtable.values())
322     {
323        w.calculateProbs(this.nHamMails, this.nSpamMails);
324 
325        if(w.getSpamProb()>0.47 && w.getSpamProb()<0.53)
326        {
327           //toRemove.add(w);    // ConcurrentAccesException if directly removed !
328           nNotSeparators++;
329        }
330     }
331 
332     for(Word w: toRemove)
333     {
334        wordsHashtable.remove(w);
335     }
336 
337     System.out.println("Number of words not discriminating: "+nNotSeparators+", number appearing less than 5: "+nSingles);
338   }
339 
340   public static boolean isSpam(double messProb)
341   {
342     return messProb>SPAM_PROBABILITY_LIMIT;
343   }
344 
345   /** @return a spam probability [0..1]
346   */
347   public SpamResult calculateSpamProbability(Header head)
348   {
349     // search for words
350     Vector<Word> foundWords = new Vector<Word>();
351     for(int i=0; i<head.getEntriesCount(); i++)
352     {
353       HeaderEntry he = head.getEntryAt(i);
354       scanWords(foundWords, he.getKey().toLowerCase()+"*", he.getValue());
355     }
356 
357     //  search for the 12 more relevant words
358     return calculateSPAMProbability(foundWords, NUMBER_OF_WORD_TO_CONSIDER_FOR_A_HEADER);
359   }
360 
361 
362 
363   /** @return a spam probability [0..1]
364   */
365   public SpamResult calculateSpamProbability(MailMessage mess)
366   {
367     // 1) search for words
368 
369     Vector<Word> foundWords = new Vector<Word>();
370 
371     // in the header
372     Header head = mess.getHeader();
373     for(int i=0; i<head.getEntriesCount(); i++)
374     {
375       HeaderEntry he = head.getEntryAt(i);
376       scanWords(foundWords, he.getKey().toLowerCase()+"*", he.getValue());
377     }
378 
379     // in the message
380     if(MimeUtils.isMultipart(mess))
381     {
382        // in mime
383        MimeTreeModel mimeTree = mess.getMimeTree();
384        MimePart mp = mimeTree.getRootPart();
385        scanWordsRecurse(foundWords, mp);
386     }
387     else
388     {
389        this.scanWords(foundWords, "", mess.getMessageBody());
390     }
391 
392     // 2) search for the 20 more relevant words
393     return calculateSPAMProbability(foundWords, NUMBER_OF_WORD_TO_CONSIDER_FOR_A_MESSAGE);
394   }
395 
396 
397 
398   private SpamResult calculateSPAMProbability(Vector<Word> allWords, int numberOfMostRelevantWordsToConsider)
399   {
400     // search for the more relevant words
401     Vector<Word> mostRelevantWords = new Vector<Word>();
402     Collections.sort(allWords, new Word.RelevanceComparator());
403 
404     String   lastWord = "";
405     if(allWords.size()>numberOfMostRelevantWordsToConsider)
406     {
407       int max =  numberOfMostRelevantWordsToConsider*14;
408       if(max>allWords.size()) max = allWords.size();
409 
410       for(int i=0; i<max; i++)
411       {
412 
413         Word wi = allWords.get(i);
414         if(wi.word.equals(lastWord) && considerOnlyOneOccurenceOfEachWord)
415         {
416           // don't put multiple occurences ( ??? )
417         }
418         else
419         {
420           mostRelevantWords.add( wi );
421           if(mostRelevantWords.size()>=numberOfMostRelevantWordsToConsider)
422           {
423             break;
424           }
425           lastWord = wi.word;
426         }
427       }
428     }
429 
430     double probSPAM_log = 0;
431     double probHAM_log = 0;
432 
433     for(Word wi: mostRelevantWords)
434     {
435       probSPAM_log += Math.log(wi.getSpamProb());
436       probHAM_log += Math.log(1.0 - wi.getSpamProb());
437     }
438 
439     double p = Math.exp(probSPAM_log) / ( Math.exp(probSPAM_log) + Math.exp(probHAM_log)) ;
440 
441     return new SpamResult(allWords, mostRelevantWords, p);
442   }
443 
444 
445   /**
446       @prefix is added to the word,   example subject*
447          this is used to put header & tag infos in the stat and distinguish them.
448   */
449   private void scanWords(Vector<Word> foundWords, String   prefix, String   text)
450   {
451     String  [] w = WordTokenizer.extractWords(text);
452 
453     for(int i=0; i<w.length; i++)
454     {
455 
456       Word word = this.getWord_using_alternatives(prefix, w[i]);
457       if(word!=null)
458       {
459         foundWords.add(word);
460       }
461       else
462       {
463         this.notFoundWords++;
464         // word not found...
465         // has a spam prob of 0.4
466         Word wo = new Word(prefix+normalize(w[i]));
467         wo.setSpamProb(0.4);
468         foundWords.add(wo);
469 
470         //System.out.println("Not found: "+wo.word);
471       }
472     }
473   }
474 
475   private void scanHTML(Vector<Word> foundWords, String   htmlText)
476   {
477     try
478     {
479       // 1) html text content
480       HTMLTextExtractor he = new HTMLTextExtractor(htmlText, false);
481       String  [] w = WordTokenizer.extractWords(he.getTextOnly());
482 
483       for(int i=0; i<w.length; i++)
484       {
485         Word word = this.getWord_using_alternatives("", w[i]);
486         if(word!=null)
487         {
488           foundWords.add(word);
489         }
490         else
491         {
492           // word not found...
493           // has a spam prob of 0.4
494           this.notFoundWords++;
495           Word wo = new Word(normalize(w[i]));
496           wo.setSpamProb(0.4);
497           foundWords.add(wo);
498         }
499       }
500 
501       // 2) unknown tags
502       Vector<String  > tags = he.getUnknownTags();
503       for(int i=0; i<tags.size(); i++)
504       {
505         String  [] wt = WordTokenizer.extractWords( tags.elementAt(i));
506         for(int t=0; t<wt.length; t++)
507         {
508           String   wtt = normalize(wt[t]);
509           Word word = this.getWord_using_alternatives("tag*", wtt);
510           if(word!=null)
511           {
512             foundWords.add(word);
513           }
514           else
515           {
516             this.notFoundWords++;
517             // tag not found...
518             // unknown (invalid!) tags has a predisposition to be spam
519             Word wo = new Word("tag*"+wtt);
520             wo.setSpamProb(0.95);
521             foundWords.add(wo);
522           }
523         }
524       }
525 
526       // 3) hrefs
527       Vector<String  > hrefs = he.getLinksHREFs();
528       for(int i=0; i<hrefs.size(); i++)
529       {
530         this.scanWords(foundWords, "href*", hrefs.elementAt(i));
531       }
532 
533       // 4) images
534       Vector<String  > images = he.getImageSrcs();
535       for(int i=0; i<images.size(); i++)
536       {
537         this.scanWords(foundWords, "img*", images.elementAt(i));
538       }
539     }
540     catch(Exception   e)
541     {
542       e.printStackTrace();
543     }
544   }
545 
546 
547   private void scanWordsRecurse(Vector<Word> foundWords, MimePart part)
548   {
549     if(part.isLeaf())
550     {
551       if(part.getContentTYPE() == MimePart.ContentType.TEXT)
552       {
553         if(part.lookIfContentIsHTML())
554         {
555           this.scanHTML(foundWords, part.getBodyAsText());
556         }
557         else
558         {
559           this.scanWords(foundWords, "", part.getBodyAsText());
560         }
561       }
562       else
563       {
564       }
565     }
566     else
567     {
568       for(int i=0; i<part.getChildCount(); i++)
569       {
570         scanWordsRecurse(foundWords, part.getPartAt(i));
571       }
572     }
573   }
574 
575 
576 
577   /** used to train the filter.
578       adds the words to the stat.
579       Look in the header, body and attachments...
580   */
581   public void addMessageToStat(MailMessage mess)
582   {
583     // is it spam or not ??
584     boolean isSPAM = mess.getIsSPAM();
585 
586     //if neither spam or ham, unclassified message doesn't participate to stat
587     if( !isSPAM && !mess.getIsHAM() )
588     {
589       nUnclassedMails++;
590       return;
591     }
592 
593 
594 
595     // For the global counting
596     if(isSPAM)
597     {
598       nSpamMails++;
599     }
600     else
601     {
602       nHamMails++;
603     }
604 
605     // add header
606     Header head = mess.getHeader();
607     for(int i=0; i<head.getEntriesCount(); i++)
608     {
609       HeaderEntry he = head.getEntryAt(i);
610       addTextToStat(he.getKey().toLowerCase()+"*", he.getValue(), isSPAM );
611     }
612 
613     // add multipart & attachment (###)
614     if(MimeUtils.isMultipart(mess))
615     {
616       MimeTreeModel mimeTree = mess.getMimeTree();
617       MimePart mp = mimeTree.getRootPart();
618       addMimePartToStatRecurse(mp, isSPAM);
619     }
620     else
621     {
622       addTextToStat("", mess.getMessageBody(), isSPAM);
623     }
624   }
625 
626 
627   private int minWordLength = 1;
628 
629   /** adds normal unformatted text to the stat.
630   */
631   private void addTextToStat(String   prefix, String   text, boolean isSPAM)
632   {
633     String  [] w = WordTokenizer.extractWords(text);
634 
635     for(int i=0; i<w.length; i++)
636     {
637       String   wi = prefix+normalize(w[i]);
638       Word word = this.getWordExact(wi);
639       if(word==null)
640       {
641         word = new Word(wi);
642         if(word.word.length() >= minWordLength)
643         {
644           addWord(word);
645         }
646       }
647 
648       if(isSPAM)
649       {
650         word.addSpamOccurence();
651       }
652       else
653       {
654         word.addHamOccurence();
655       }
656     }
657   }
658 
659   /** adds HTML formatted text to the stat.
660   */
661   private void addHTMLToStat(String   htmlText, boolean isSPAM)
662   {
663     try
664     {
665       HTMLTextExtractor he = new HTMLTextExtractor(htmlText, false);
666       this.addTextToStat("", he.getTextOnly(), isSPAM);
667 
668       // bad tags
669       Vector<String  > ut = he.getUnknownTags();
670       for(int i=0; i<ut.size(); i++)
671       {
672          this.addTextToStat("tag*", ut.elementAt(i), isSPAM);
673       }
674 
675       // links
676       Vector<String  > refs = he.getLinksHREFs();
677       for(int i=0; i<refs.size(); i++)
678       {
679          this.addTextToStat("href*", refs.elementAt(i), isSPAM);
680       }
681 
682       // images
683       Vector<String  > images = he.getImageSrcs();
684       for(int i=0; i<images.size(); i++)
685       {
686          this.addTextToStat("img*", images.elementAt(i), isSPAM);
687       }
688 
689     }
690     catch(Exception   e)
691     {
692       e.printStackTrace();  // should not occur...
693     }
694 
695     // ###
696   }
697 
698   /** adds the text and HTML...
699   */
700   private void addMimePartToStatRecurse(MimePart part, boolean isSPAM)
701   {
702     if(part.isLeaf())
703     {
704       // analyse part
705       if(part.getContentTYPE()== MimePart.ContentType.TEXT)
706       {
707         if(part.lookIfContentIsHTML())
708         {
709           this.addHTMLToStat(part.getBodyAsText(), isSPAM);
710         }
711         else
712         {
713           this.addTextToStat("", part.getBodyAsText(), isSPAM);
714         }
715       }
716       else
717       {
718         // not text...
719         // ### add attachment name, ...
720       }
721     }
722     else
723     {
724       // recurse
725       for(int i=0; i<part.getChildCount(); i++)
726       {
727         addMimePartToStatRecurse(part.getPartAt(i), isSPAM);
728       }
729     }
730   }
731 
732   // Utils
733   //
734 
735 
736   public void exportStatToFile(File f) throws Exception  
737   {
738     Vector<Word> words = new Vector<Word>( wordsHashtable.values() );
739     Collections.sort(words);
740 
741 
742     FileOutputStream fos = null;
743     PrintWriter pw = null;
744     try
745     {
746       fos = new FileOutputStream(f);
747       pw = new PrintWriter(fos);
748 
749       pw.print(""+this.toStringStat()+"\n\n");
750 
751       for(int i=0; i<words.size(); i++)
752       {
753          Word wi = words.elementAt(i);
754          pw.println(""+wi.word+"\t"+wi.getSpamProb()); //+"\t"+wi.occurencesInSPAM+"\t"+wi.occurencesInHAM);
755       }
756     }
757     catch(Exception   e)
758     {
759       throw e;
760     }
761     finally
762     {
763       if(pw!=null) pw.close();
764       if(fos!=null) fos.close();
765     }
766   }
767 
768     /**  Either
769           Hello, hello: ok
770           hELLo => hELLO
771           hello!!!!! => hello#
772     */
773     public static String   normalize(String   s)
774     {
775       if(s.length()<2) return s;
776 
777       StringBuffer   normalized = new StringBuffer  ();
778 
779       boolean hasLow = false;
780       boolean hasUp  = false;
781       boolean hasDigits  = false;
782       boolean hasSpecial  = false;
783 
784       for(int i=1; i<s.length(); i++)
785       {
786         char ci = s.charAt(i);
787         if(Character.isLowerCase(ci))
788         {
789           hasLow = true;
790           normalized.append(ci);
791         }
792         else if(Character.isUpperCase(ci))
793         {
794           hasUp = true;
795           normalized.append(ci);
796         }
797         else if(Character.isDigit(ci))
798         {
799           hasDigits = true;
800           normalized.append(ci);
801         }
802         else if(ci=='.')
803         {
804           // let them, they are constituents of IP's
805           normalized.append(ci);
806         }
807         else
808         {
809           hasSpecial = true;
810           // forget the special chars
811         }
812       }
813 
814       if(hasSpecial)
815       {
816         normalized.append('#');
817       }
818 
819       // look at the first char
820 
821       if( Character.isUpperCase(s.charAt(0)) )
822       {
823         if(hasUp)
824         {
825           // BuG => BUG
826           return s.charAt(0) + normalized.toString().toUpperCase();
827         }
828         else
829         {
830           // Bug
831           return s.charAt(0) + normalized.toString();
832         }
833       }
834       else if( Character.isLowerCase(s.charAt(0)) )
835       {
836         if(hasUp)
837         {
838           // mismatch like bUg => bUG
839           return Character.toUpperCase(s.charAt(0)) + normalized.toString().toUpperCase();
840         }
841         else
842         {
843           // bug
844           return s.charAt(0) + normalized.toString();
845         }
846       }
847       else
848       {
849          //
850       }
851 
852 
853       return s.charAt(0) + normalized.toString();
854     }
855 
856   // Vectorizable
857   //
858 
859   public Vector<Object  > getVectorRepresentation() throws VectorizeException
860   {
861      Vector<Object  > v = new Vector<Object  >();
862 
863      v.addElement(2); // 0: version
864      String  [] wl = new String  [wordsHashtable.size()];
865      int[] occHam = new int[wordsHashtable.size()];
866      int[] occSpam = new int[wordsHashtable.size()];
867 
868      int i=0;
869      for(Word w : wordsHashtable.values())
870      {
871         wl[i] = w.word;
872         occHam[i] = w.getHamOccurences();
873         occSpam[i] = w.getSpamOccurences();
874         i++;
875      }
876      v.addElement(wl);     // 1
877      v.addElement(occHam);
878      v.addElement(occSpam);
879      v.addElement(this.nHamMails);     // 4   // storing raw arrays is much more faster than making vectors with their elements !
880      v.addElement(this.nSpamMails);
881      v.addElement(this.nUnclassedMails);  // 6
882 
883      return v;
884   }
885 
886   public void createFromVectorRepresentation(Vector<Object  > v) throws VectorizeException
887   {
888     int version = (Integer  )v.elementAt(0);
889     if(version==2)
890     {
891       this.deleteStatistic();
892 
893       String  [] wl = (String  []) v.elementAt(1);
894       int[] occHam = (int[]) v.elementAt(2);
895       int[] occSpam = (int[]) v.elementAt(3);
896       if(v.size()>4)
897       {
898         nHamMails = (Integer  ) v.elementAt(4);
899         nSpamMails = (Integer  ) v.elementAt(5);
900       }
901       if(v.size()>6)
902       {
903         nUnclassedMails = (Integer  ) v.elementAt(6);
904       }
905 
906       for(int i=0; i<wl.length; i++)
907       {
908         Word w = new Word(wl[i], occHam[i], occSpam[i]);
909         addWord(w);
910       }
911 
912       buildStatistics(null);
913     }
914     else
915     {
916       throw new VectorizeException("Bad vectorized Word version "+version);
917     }
918   }
919 
920 /* TODO: visually similar character recognition for approximate matching !
921    IDEA2: Keyboard distance ?
922    IDEA3: Phonetic distance (soundex) ?
923 
924    Classification: ( => Automatic ?)
925 
926    abcdefghijklmnopqrstuvwxyz1234567890
927    ABCDEFGHIJKLMNOPQRSTUVWXYZ
928 
929    a c e o 0 O C D G Q
930 
931    b d h k
932 
933    f i l r s t I J L T 1 2 7
934 
935    g j p q y Y 9
936 
937    m w M W
938 
939    n u v N U V
940 
941    x z X Z
942 
943    4 5 6 7
944 
945    A B E F H K P R S 3 8
946 
947 
948 */
949 
950 } // Word
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags