KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > SnowMailClient > SpamFilter > WordTokenizer


1 package SnowMailClient.SpamFilter;
2
3 import java.util.*;
4
/**
 * Splits a text into word tokens in a single forward pass.
 *
 * <p>Tokenizing rules:
 * <ol>
 *   <li>ASCII letters, ASCII digits, {@code '-'}, {@code '_'} and {@code '!'} are word
 *       characters.</li>
 *   <li>{@code '.'} and {@code ','} are part of a word only when they stand directly between
 *       two ASCII digits. This keeps IP addresses and prices such as {@code 62.65.146.182}
 *       or {@code 34,23} in one piece; URLs are still split at their dots.</li>
 *   <li>Every other character separates words.</li>
 * </ol>
 *
 * <p>The tokenizer can be driven either through {@link #hasMoreTokens()} /
 * {@link #nextToken()} or through the {@link Iterator} methods. It keeps a mutable cursor,
 * so one instance walks the text exactly once.
 */
public final class WordTokenizer implements Iterator<String>
{
  /**
   * Index of the next unread character. Between calls it always rests either on the first
   * character of a word or on {@code text.length()}.
   */
  int pos = 0;

  /** The text being tokenized; never {@code null} once the constructor has run. */
  String text;

  /**
   * Creates a tokenizer positioned on the first word of {@code text}.
   *
   * @param text the text to split; {@code null} is treated like an empty text
   */
  public WordTokenizer(String text)
  {
    this.text = (text == null) ? "" : text;
    advanceToNextWordStart();
  } // Constructor

  /**
   * Tells whether another word is available.
   *
   * @return {@code true} while the cursor has not reached the end of the text
   */
  public boolean hasMoreTokens()
  {
    return pos != text.length();
  }

  // Iterator interface

  /**
   * Returns the next word.
   *
   * @return the next word, never empty
   * @throws NoSuchElementException if the text is exhausted
   */
  @Override
  public String next()
  {
    if(!hasMoreTokens())
    {
      throw new NoSuchElementException("No more tokens");
    }
    return nextToken();
  }

  /**
   * Same as {@link #hasMoreTokens()}.
   *
   * @return {@code true} if {@link #next()} would return a word
   */
  @Override
  public boolean hasNext()
  {
    return hasMoreTokens();
  }

  /**
   * Removal is not supported.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public void remove()
  {
    throw new UnsupportedOperationException("Not supported");
  }

  /**
   * Reads the word starting at the cursor and moves the cursor to the start of the
   * following word (or to the end of the text).
   *
   * @return the word at the cursor, or an empty string when the text is exhausted
   */
  public String nextToken()
  {
    final int length = text.length();
    final int start = pos;

    // A word is a contiguous slice of the text: extend it while the current character is a
    // word character or a '.'/',' enclosed by digits (rule 2).
    while(pos < length
          && (isWordChar(text.charAt(pos)) || isNumericSeparator(pos)))
    {
      pos++;
    }

    final String token = text.substring(start, pos);
    advanceToNextWordStart();
    return token;
  }

  /** Moves the cursor forward to the next word character, or to the end of the text. */
  private void advanceToNextWordStart()
  {
    final int length = text.length();
    while(pos < length && !isWordChar(text.charAt(pos)))
    {
      pos++;
    }
  }

  /**
   * Rule 2: tells whether the character at {@code index} is a {@code '.'} or {@code ','}
   * standing directly between two ASCII digits.
   *
   * <p>Only ASCII digits count, in line with {@link #isWordChar(char)}; accepting other
   * Unicode digits here would let a word end in a dangling separator, because the following
   * digit would not be accepted as a word character.
   */
  private boolean isNumericSeparator(int index)
  {
    final char c = text.charAt(index);
    if(c != '.' && c != ',') return false;

    // A separator at either edge of the text has no digit on one side.
    if(index == 0 || index + 1 >= text.length()) return false;

    return isAsciiDigit(text.charAt(index - 1))
        && isAsciiDigit(text.charAt(index + 1));
  }

  /** Tells whether {@code c} is one of {@code '0'..'9'}. */
  private static boolean isAsciiDigit(char c)
  {
    return c >= '0' && c <= '9';
  }

  /** Rule 1: ASCII letters, ASCII digits, {@code '-'}, {@code '_'} and {@code '!'}. */
  private static boolean isWordChar(char c)
  {
    if(c>='a' && c<='z') return true;
    if(c>='A' && c<='Z') return true;
    if(isAsciiDigit(c)) return true;
    return c=='-' || c=='_' || c=='!';
  }

  /**
   * Splits {@code text} into all of its words.
   *
   * @param text the text to split; {@code null} yields an empty array
   * @return the words in order of appearance, never {@code null}
   */
  public static String[] extractWords(String text)
  {
    WordTokenizer wt = new WordTokenizer(text);
    List<String> words = new ArrayList<String>();
    while(wt.hasMoreTokens())
    {
      String w = wt.nextToken();
      // The cursor always rests on a word start, so w should never be empty here;
      // the check is kept as a cheap safeguard.
      if(w.length() != 0)
      {
        words.add(w);
      }
    }
    return words.toArray(new String[words.size()]);
  }

  /**
   * Manual smoke test: prints every token of a sample text, one per line.
   *
   * @param arguments ignored
   */
  public static void main( String[] arguments )
  {
    WordTokenizer wt = new WordTokenizer(" Hello2-t!!! th-is is teXtz 44.and another,a.b <html>Shit<pre></pre>"
    +" http://62.65.146.182.is http://www.aaa.bbb a cool IP 34,23$ is my price</html>");
    while(wt.hasMoreTokens())
    {
      System.out.println(""+wt.nextToken());
    }

    // Leftover experiment unrelated to tokenizing: the callee's write to the array is
    // visible to the caller, so this prints 12.0.
    double[] t = new double[]{1,2,3};
    test(t);
    System.out.println(t[0]);
  } // main

  /** Overwrites the first element of {@code a} with 12; used only by {@link #main}. */
  private static void test(double[] a)
  {
    a[0] = 12;
  }

} // WordTokenizer
Popular Tags