1 package SnowMailClient.SpamFilter; 2 3 import SnowMailClient.*; 4 import SnowMailClient.model.*; 5 import SnowMailClient.model.multipart.*; 6 import snow.utils.storage.*; 7 import SnowMailClient.utils.TextSearch.EditDistance; 8 import snow.utils.gui.*; 9 import SnowMailClient.html.HTMLTextExtractor; 10 11 import java.util.*; 12 import java.util.regex.*; 13 import java.io.*; 14 import java.text.DecimalFormat ; 15 16 17 19 public final class WordStatistic implements Vectorizable 20 { 21 private static final int NUMBER_OF_WORD_TO_CONSIDER_FOR_A_MESSAGE = 17; 23 private static final int NUMBER_OF_WORD_TO_CONSIDER_FOR_A_HEADER = 13; 24 private static final double SPAM_PROBABILITY_LIMIT = 0.95; 25 boolean considerOnlyOneOccurenceOfEachWord = true; 26 27 private int maxDistanceApproximateSearch = 0; private int foundWithTolerance1 = 0; 29 30 31 34 private int nSpamMails = 0; 35 private int nHamMails = 0; 36 private int nUnclassedMails = 0; 37 38 private Map<String ,Word> wordsHashtable = new Hashtable<String ,Word>(); 40 41 44 private Map<Integer , Hashtable<String ,Word>> wordsHashtableForLength = new Hashtable<Integer , Hashtable<String ,Word>>(); 45 46 public int result_false_positives = 0; 48 public int result_detected_spams = 0; 49 50 51 public int result_false_positives_header = 0; 52 public int result_detected_spams_header = 0; 53 54 private StringBuffer falsePositivesBuffer = new StringBuffer (); 55 public void addFalsePositive(String ref) 56 { 57 falsePositivesBuffer.append("\r\n "+ref); 58 } 59 60 public int getNumberOfWords() 61 { 62 return wordsHashtable.size(); 63 } 64 65 67 private void addWord(Word word) 68 { 69 wordsHashtable.put(word.word, word); 70 if(!this.wordsHashtableForLength.containsKey(word.word.length())) 71 { 72 wordsHashtableForLength.put(word.word.length(), new Hashtable<String , Word>()); 73 } 74 75 wordsHashtableForLength.get(word.word.length()).put(word.word, word); 76 } 77 78 80 private void removeWord(Word word) 81 { 82 wordsHashtable.remove(word.word); 83 wordsHashtableForLength.get(word.word.length()).remove(word.word); 84 } 85 86 88 public Map<String ,Word> getAllWords() { return wordsHashtable; } 89 90 91 92 public Word getWordExact(String word) 93 { 94 return wordsHashtable.get(word); 95 } 96 97 int notFoundWords = 0; 98 99 100 103 public Word getWord_using_alternatives(final String prefix, final String word) 104 { 105 if(word.length()==0) 106 { 107 System.out.println("Zero length word !"); 108 return null; 109 } 110 String nw = normalize(word); 111 Word w = wordsHashtable.get(prefix + nw); 112 if(w!=null) return w; 113 114 if(nw.endsWith("#")) 116 { 117 w = wordsHashtable.get(prefix + nw.substring(0,nw.length()-2)); 118 if(w!=null) return w; 119 120 w = tryAlternativeCases(prefix, nw.substring(0,nw.length()-2)); 121 if(w!=null) return w; 122 } 123 else 124 { 125 w = wordsHashtable.get(prefix + nw+"#"); 126 if(w!=null) return w; 127 128 w = tryAlternativeCases(prefix, nw+"#"); 129 if(w!=null) return w; 130 } 131 132 w = tryAlternativeCases(prefix, nw); 133 if(w!=null) return w; 134 135 136 137 if(prefix.length()>0) 139 { 140 w = getWord_using_alternatives("", word); 141 if(w!=null) return w; 142 return null; 143 } 144 146 if(maxDistanceApproximateSearch>0) 149 { 150 w = searchWord_with_tolerance(word, maxDistanceApproximateSearch); 151 if(w!=null) return w; 152 } 153 154 return null; 156 } 157 158 159 160 162 private Word searchWord_with_tolerance(String word, int maxDist) 163 { 164 String wordUP = word.toUpperCase(); 165 166 int wlen = wordUP.length(); 167 if(wlen>15) return null; 169 for(int i=wlen-maxDist; i<=wlen+maxDist; i++) 172 { 173 if(wordsHashtableForLength.containsKey(i)) 174 { 175 Hashtable<String , Word> wordsi = wordsHashtableForLength.get(i); 177 Collection<Word> s = wordsi.values(); 178 179 for(Word w: s) 180 { 181 int d = EditDistance.editDistance(wordUP, w.word.toUpperCase(), maxDist); 182 if(d<=maxDist) 183 { 184 System.out.println(""+w.word+" is similar to "+word); 185 foundWithTolerance1 ++; 186 return w; 187 } 188 } 189 } 190 } 191 192 194 return null; 195 } 196 197 198 202 private Word tryAlternativeCases(String prefix, String nw) 203 { 204 Word w = wordsHashtable.get(prefix + nw.toUpperCase()); 206 if(w!=null) return w; 207 208 w = wordsHashtable.get(prefix + nw.toLowerCase()); 210 if(w!=null) return w; 211 212 if(nw.length()>1) 213 { 214 w = wordsHashtable.get(prefix + Character.toUpperCase(nw.charAt(0))+nw.substring(1,nw.length()-1).toLowerCase()); 216 if(w!=null) return w; 217 } 218 return null; 219 } 220 221 222 public int getNumberOfHAMMails() { return nHamMails; } 223 public int getNumberOfSPAMMails() { return nSpamMails; } 224 225 public void deleteStatistic() 226 { 227 wordsHashtable.clear(); 228 this.wordsHashtableForLength.clear(); 229 230 nSpamMails = 0; 231 nHamMails = 0; 232 nUnclassedMails = 0; 233 result_false_positives = 0; 234 result_detected_spams = 0; 235 result_false_positives_header = 0; 236 result_detected_spams_header = 0; 237 foundWithTolerance1 = 0; 238 239 falsePositivesBuffer.setLength(0); 240 } 241 242 243 public String toStringStat() 244 { 245 StringBuffer sb = new StringBuffer (); 246 sb.append("\r\nStatistic has "+wordsHashtable.size()+" words"); 247 sb.append("\r\n\r\n Number of HAM mails = " + nHamMails); 248 sb.append("\r\n Number of SPAM mails = " + nSpamMails); 249 sb.append("\r\n Number of unclassed mails = "+nUnclassedMails); 250 sb.append("\r\n\r\nDetection results"); 251 sb.append("\r\n Number of detected spams = " + this.result_detected_spams); 252 if(nSpamMails>0) 253 { 254 DecimalFormat df = new DecimalFormat ("00.00"); 255 double x = (double) result_detected_spams/nSpamMails*100.0; 256 sb.append( " ("+ df.format(x)+"%)"); 257 } 258 sb.append("\r\n Number of false positives = " + this.result_false_positives); 259 if(nHamMails>0) 260 { 261 DecimalFormat df = new DecimalFormat ("0.0000"); 262 double x = (double) result_false_positives/nHamMails*100.0; 263 sb.append( " ("+ df.format(x)+"%)"); 264 } 265 266 sb.append("\r\n\r\nDetection results using only the header"); 267 sb.append("\r\n Number of detected spams = " + this.result_detected_spams_header); 268 sb.append("\r\n Number of false positives = " + this.result_false_positives_header); 269 270 sb.append("\r\n\r\n Words not found = "+this.notFoundWords); 271 sb.append("\r\n Found with tolerance "+maxDistanceApproximateSearch+" = " + foundWithTolerance1); 272 273 281 282 283 if(falsePositivesBuffer.length()>0) 284 { 285 sb.append("\r\n\r\nFalse positives:"); 286 sb.append(falsePositivesBuffer.toString()); 287 } 288 289 return sb.toString(); 290 } 291 292 300 public void buildStatistics(ProgressModalDialog progress) 301 { 302 303 int nSingles = 0; 304 int nNotSeparators = 0; 305 306 for(Word w: wordsHashtable.values()) 309 { 310 if(w.getSpamOccurences() + w.getHamOccurences() < 5) 311 { 312 nSingles++; 313 } 314 } 315 316 318 Vector<Word> toRemove = new Vector<Word>(); 321 for(Word w: wordsHashtable.values()) 322 { 323 w.calculateProbs(this.nHamMails, this.nSpamMails); 324 325 if(w.getSpamProb()>0.47 && w.getSpamProb()<0.53) 326 { 327 nNotSeparators++; 329 } 330 } 331 332 for(Word w: toRemove) 333 { 334 wordsHashtable.remove(w); 335 } 336 337 System.out.println("Number of words not discriminating: "+nNotSeparators+", number appearing less than 5: "+nSingles); 338 } 339 340 public static boolean isSpam(double messProb) 341 { 342 return messProb>SPAM_PROBABILITY_LIMIT; 343 } 344 345 347 public SpamResult calculateSpamProbability(Header head) 348 { 349 Vector<Word> foundWords = new Vector<Word>(); 351 for(int i=0; i<head.getEntriesCount(); i++) 352 { 353 HeaderEntry he = head.getEntryAt(i); 354 scanWords(foundWords, he.getKey().toLowerCase()+"*", he.getValue()); 355 } 356 357 return calculateSPAMProbability(foundWords, NUMBER_OF_WORD_TO_CONSIDER_FOR_A_HEADER); 359 } 360 361 362 363 365 public SpamResult calculateSpamProbability(MailMessage mess) 366 { 367 369 Vector<Word> foundWords = new Vector<Word>(); 370 371 Header head = mess.getHeader(); 373 for(int i=0; i<head.getEntriesCount(); i++) 374 { 375 HeaderEntry he = head.getEntryAt(i); 376 scanWords(foundWords, he.getKey().toLowerCase()+"*", he.getValue()); 377 } 378 379 if(MimeUtils.isMultipart(mess)) 381 { 382 MimeTreeModel mimeTree = mess.getMimeTree(); 384 MimePart mp = mimeTree.getRootPart(); 385 scanWordsRecurse(foundWords, mp); 386 } 387 else 388 { 389 this.scanWords(foundWords, "", mess.getMessageBody()); 390 } 391 392 return calculateSPAMProbability(foundWords, NUMBER_OF_WORD_TO_CONSIDER_FOR_A_MESSAGE); 394 } 395 396 397 398 private SpamResult calculateSPAMProbability(Vector<Word> allWords, int numberOfMostRelevantWordsToConsider) 399 { 400 Vector<Word> mostRelevantWords = new Vector<Word>(); 402 Collections.sort(allWords, new Word.RelevanceComparator()); 403 404 String lastWord = ""; 405 if(allWords.size()>numberOfMostRelevantWordsToConsider) 406 { 407 int max = numberOfMostRelevantWordsToConsider*14; 408 if(max>allWords.size()) max = allWords.size(); 409 410 for(int i=0; i<max; i++) 411 { 412 413 Word wi = allWords.get(i); 414 if(wi.word.equals(lastWord) && considerOnlyOneOccurenceOfEachWord) 415 { 416 } 418 else 419 { 420 mostRelevantWords.add( wi ); 421 if(mostRelevantWords.size()>=numberOfMostRelevantWordsToConsider) 422 { 423 break; 424 } 425 lastWord = wi.word; 426 } 427 } 428 } 429 430 double probSPAM_log = 0; 431 double probHAM_log = 0; 432 433 for(Word wi: mostRelevantWords) 434 { 435 probSPAM_log += Math.log(wi.getSpamProb()); 436 probHAM_log += Math.log(1.0 - wi.getSpamProb()); 437 } 438 439 double p = Math.exp(probSPAM_log) / ( Math.exp(probSPAM_log) + Math.exp(probHAM_log)) ; 440 441 return new SpamResult(allWords, mostRelevantWords, p); 442 } 443 444 445 449 private void scanWords(Vector<Word> foundWords, String prefix, String text) 450 { 451 String [] w = WordTokenizer.extractWords(text); 452 453 for(int i=0; i<w.length; i++) 454 { 455 456 Word word = this.getWord_using_alternatives(prefix, w[i]); 457 if(word!=null) 458 { 459 foundWords.add(word); 460 } 461 else 462 { 463 this.notFoundWords++; 464 Word wo = new Word(prefix+normalize(w[i])); 467 wo.setSpamProb(0.4); 468 foundWords.add(wo); 469 470 } 472 } 473 } 474 475 private void scanHTML(Vector<Word> foundWords, String htmlText) 476 { 477 try 478 { 479 HTMLTextExtractor he = new HTMLTextExtractor(htmlText, false); 481 String [] w = WordTokenizer.extractWords(he.getTextOnly()); 482 483 for(int i=0; i<w.length; i++) 484 { 485 Word word = this.getWord_using_alternatives("", w[i]); 486 if(word!=null) 487 { 488 foundWords.add(word); 489 } 490 else 491 { 492 this.notFoundWords++; 495 Word wo = new Word(normalize(w[i])); 496 wo.setSpamProb(0.4); 497 foundWords.add(wo); 498 } 499 } 500 501 Vector<String > tags = he.getUnknownTags(); 503 for(int i=0; i<tags.size(); i++) 504 { 505 String [] wt = WordTokenizer.extractWords( tags.elementAt(i)); 506 for(int t=0; t<wt.length; t++) 507 { 508 String wtt = normalize(wt[t]); 509 Word word = this.getWord_using_alternatives("tag*", wtt); 510 if(word!=null) 511 { 512 foundWords.add(word); 513 } 514 else 515 { 516 this.notFoundWords++; 517 Word wo = new Word("tag*"+wtt); 520 wo.setSpamProb(0.95); 521 foundWords.add(wo); 522 } 523 } 524 } 525 526 Vector<String > hrefs = he.getLinksHREFs(); 528 for(int i=0; i<hrefs.size(); i++) 529 { 530 this.scanWords(foundWords, "href*", hrefs.elementAt(i)); 531 } 532 533 Vector<String > images = he.getImageSrcs(); 535 for(int i=0; i<images.size(); i++) 536 { 537 this.scanWords(foundWords, "img*", images.elementAt(i)); 538 } 539 } 540 catch(Exception e) 541 { 542 e.printStackTrace(); 543 } 544 } 545 546 547 private void scanWordsRecurse(Vector<Word> foundWords, MimePart part) 548 { 549 if(part.isLeaf()) 550 { 551 if(part.getContentTYPE() == MimePart.ContentType.TEXT) 552 { 553 if(part.lookIfContentIsHTML()) 554 { 555 this.scanHTML(foundWords, part.getBodyAsText()); 556 } 557 else 558 { 559 this.scanWords(foundWords, "", part.getBodyAsText()); 560 } 561 } 562 else 563 { 564 } 565 } 566 else 567 { 568 for(int i=0; i<part.getChildCount(); i++) 569 { 570 scanWordsRecurse(foundWords, part.getPartAt(i)); 571 } 572 } 573 } 574 575 576 577 581 public void addMessageToStat(MailMessage mess) 582 { 583 boolean isSPAM = mess.getIsSPAM(); 585 586 if( !isSPAM && !mess.getIsHAM() ) 588 { 589 nUnclassedMails++; 590 return; 591 } 592 593 594 595 if(isSPAM) 597 { 598 nSpamMails++; 599 } 600 else 601 { 602 nHamMails++; 603 } 604 605 Header head = mess.getHeader(); 607 for(int i=0; i<head.getEntriesCount(); i++) 608 { 609 HeaderEntry he = head.getEntryAt(i); 610 addTextToStat(he.getKey().toLowerCase()+"*", he.getValue(), isSPAM ); 611 } 612 613 if(MimeUtils.isMultipart(mess)) 615 { 616 MimeTreeModel mimeTree = mess.getMimeTree(); 617 MimePart mp = mimeTree.getRootPart(); 618 addMimePartToStatRecurse(mp, isSPAM); 619 } 620 else 621 { 622 addTextToStat("", mess.getMessageBody(), isSPAM); 623 } 624 } 625 626 627 private int minWordLength = 1; 628 629 631 private void addTextToStat(String prefix, String text, boolean isSPAM) 632 { 633 String [] w = WordTokenizer.extractWords(text); 634 635 for(int i=0; i<w.length; i++) 636 { 637 String wi = prefix+normalize(w[i]); 638 Word word = this.getWordExact(wi); 639 if(word==null) 640 { 641 word = new Word(wi); 642 if(word.word.length() >= minWordLength) 643 { 644 addWord(word); 645 } 646 } 647 648 if(isSPAM) 649 { 650 word.addSpamOccurence(); 651 } 652 else 653 { 654 word.addHamOccurence(); 655 } 656 } 657 } 658 659 661 private void addHTMLToStat(String htmlText, boolean isSPAM) 662 { 663 try 664 { 665 HTMLTextExtractor he = new HTMLTextExtractor(htmlText, false); 666 this.addTextToStat("", he.getTextOnly(), isSPAM); 667 668 Vector<String > ut = he.getUnknownTags(); 670 for(int i=0; i<ut.size(); i++) 671 { 672 this.addTextToStat("tag*", ut.elementAt(i), isSPAM); 673 } 674 675 Vector<String > refs = he.getLinksHREFs(); 677 for(int i=0; i<refs.size(); i++) 678 { 679 this.addTextToStat("href*", refs.elementAt(i), isSPAM); 680 } 681 682 Vector<String > images = he.getImageSrcs(); 684 for(int i=0; i<images.size(); i++) 685 { 686 this.addTextToStat("img*", images.elementAt(i), isSPAM); 687 } 688 689 } 690 catch(Exception e) 691 { 692 e.printStackTrace(); } 694 695 } 697 698 700 private void addMimePartToStatRecurse(MimePart part, boolean isSPAM) 701 { 702 if(part.isLeaf()) 703 { 704 if(part.getContentTYPE()== MimePart.ContentType.TEXT) 706 { 707 if(part.lookIfContentIsHTML()) 708 { 709 this.addHTMLToStat(part.getBodyAsText(), isSPAM); 710 } 711 else 712 { 713 this.addTextToStat("", part.getBodyAsText(), isSPAM); 714 } 715 } 716 else 717 { 718 } 721 } 722 else 723 { 724 for(int i=0; i<part.getChildCount(); i++) 726 { 727 addMimePartToStatRecurse(part.getPartAt(i), isSPAM); 728 } 729 } 730 } 731 732 735 736 public void exportStatToFile(File f) throws Exception 737 { 738 Vector<Word> words = new Vector<Word>( wordsHashtable.values() ); 739 Collections.sort(words); 740 741 742 FileOutputStream fos = null; 743 PrintWriter pw = null; 744 try 745 { 746 fos = new FileOutputStream(f); 747 pw = new PrintWriter(fos); 748 749 pw.print(""+this.toStringStat()+"\n\n"); 750 751 for(int i=0; i<words.size(); i++) 752 { 753 Word wi = words.elementAt(i); 754 pw.println(""+wi.word+"\t"+wi.getSpamProb()); } 756 } 757 catch(Exception e) 758 { 759 throw e; 760 } 761 finally 762 { 763 if(pw!=null) pw.close(); 764 if(fos!=null) fos.close(); 765 } 766 } 767 768 773 public static String normalize(String s) 774 { 775 if(s.length()<2) return s; 776 777 StringBuffer normalized = new StringBuffer (); 778 779 boolean hasLow = false; 780 boolean hasUp = false; 781 boolean hasDigits = false; 782 boolean hasSpecial = false; 783 784 for(int i=1; i<s.length(); i++) 785 { 786 char ci = s.charAt(i); 787 if(Character.isLowerCase(ci)) 788 { 789 hasLow = true; 790 normalized.append(ci); 791 } 792 else if(Character.isUpperCase(ci)) 793 { 794 hasUp = true; 795 normalized.append(ci); 796 } 797 else if(Character.isDigit(ci)) 798 { 799 hasDigits = true; 800 normalized.append(ci); 801 } 802 else if(ci=='.') 803 { 804 normalized.append(ci); 806 } 807 else 808 { 809 hasSpecial = true; 810 } 812 } 813 814 if(hasSpecial) 815 { 816 normalized.append('#'); 817 } 818 819 821 if( Character.isUpperCase(s.charAt(0)) ) 822 { 823 if(hasUp) 824 { 825 return s.charAt(0) + normalized.toString().toUpperCase(); 827 } 828 else 829 { 830 return s.charAt(0) + normalized.toString(); 832 } 833 } 834 else if( Character.isLowerCase(s.charAt(0)) ) 835 { 836 if(hasUp) 837 { 838 return Character.toUpperCase(s.charAt(0)) + normalized.toString().toUpperCase(); 840 } 841 else 842 { 843 return s.charAt(0) + normalized.toString(); 845 } 846 } 847 else 848 { 849 } 851 852 853 return s.charAt(0) + normalized.toString(); 854 } 855 856 859 public Vector<Object > getVectorRepresentation() throws VectorizeException 860 { 861 Vector<Object > v = new Vector<Object >(); 862 863 v.addElement(2); String [] wl = new String [wordsHashtable.size()]; 865 int[] occHam = new int[wordsHashtable.size()]; 866 int[] occSpam = new int[wordsHashtable.size()]; 867 868 int i=0; 869 for(Word w : wordsHashtable.values()) 870 { 871 wl[i] = w.word; 872 occHam[i] = w.getHamOccurences(); 873 occSpam[i] = w.getSpamOccurences(); 874 i++; 875 } 876 v.addElement(wl); v.addElement(occHam); 878 v.addElement(occSpam); 879 v.addElement(this.nHamMails); v.addElement(this.nSpamMails); 881 v.addElement(this.nUnclassedMails); 883 return v; 884 } 885 886 public void createFromVectorRepresentation(Vector<Object > v) throws VectorizeException 887 { 888 int version = (Integer )v.elementAt(0); 889 if(version==2) 890 { 891 this.deleteStatistic(); 892 893 String [] wl = (String []) v.elementAt(1); 894 int[] occHam = (int[]) v.elementAt(2); 895 int[] occSpam = (int[]) v.elementAt(3); 896 if(v.size()>4) 897 { 898 nHamMails = (Integer ) v.elementAt(4); 899 nSpamMails = (Integer ) v.elementAt(5); 900 } 901 if(v.size()>6) 902 { 903 nUnclassedMails = (Integer ) v.elementAt(6); 904 } 905 906 for(int i=0; i<wl.length; i++) 907 { 908 Word w = new Word(wl[i], occHam[i], occSpam[i]); 909 addWord(w); 910 } 911 912 buildStatistics(null); 913 } 914 else 915 { 916 throw new VectorizeException("Bad vectorized Word version "+version); 917 } 918 } 919 920 949 950 } | Popular Tags |