WordTokenizer


1   package SnowMailClient.SpamFilter;
2   
3   import java.util.*;
4   
/**
 * Splits a text into word tokens.
 *
 * <p>Tokenizing rules:
 * <ol>
 *   <li>Word characters are the ASCII letters, the ASCII digits and the
 *       characters {@code '-'}, {@code '_'} and {@code '!'}.</li>
 *   <li>A {@code '.'} or {@code ','} is kept inside a word only when it stands
 *       directly between two ASCII digits. This keeps IP addresses and prices
 *       ("62.65.146.182", "34,23") in one piece; URLs are still split.</li>
 *   <li>Every other character separates words and is discarded.</li>
 * </ol>
 *
 * <p>The tokenizer can be driven either through {@link #hasMoreTokens()} /
 * {@link #nextToken()} or through the {@link Iterator} methods.
 */
public final class WordTokenizer implements Iterator<String>
{
  // Index of the first character of the next token, or text.length() when
  // the text is exhausted. Left package-private as in the original class.
  int pos = 0;
  String text;

  /**
   * Creates a tokenizer positioned on the first word of {@code text}.
   *
   * @param text the text to split, must not be null
   * @throws NullPointerException if {@code text} is null
   */
  public WordTokenizer(String text)
  {
    if (text == null)
    {
      throw new NullPointerException("text must not be null");
    }
    this.text = text;
    advanceToNextWordStart();
  }

  /**
   * @return true while at least one more word can be read
   */
  public boolean hasMoreTokens()
  {
    return pos != text.length();
  }

  /**
   * Iterator view of {@link #nextToken()}.
   *
   * @return the next word
   * @throws NoSuchElementException if no word is left
   */
  @Override
  public String next()
  {
    if (!hasMoreTokens())
    {
      throw new NoSuchElementException("No more tokens");
    }
    return nextToken();
  }

  /**
   * Iterator view of {@link #hasMoreTokens()}.
   */
  @Override
  public boolean hasNext()
  {
    return hasMoreTokens();
  }

  /**
   * Removal is not supported by this tokenizer.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public void remove()
  {
    throw new UnsupportedOperationException("Not supported");
  }

  /**
   * Reads the next word and moves on to the start of the following one.
   *
   * @return the next word, or the empty string when no word is left
   */
  public String nextToken()
  {
    final int start = pos;
    while (pos < text.length())
    {
      char c = text.charAt(pos);
      // The word ends at the first character that is neither a word
      // character nor a '.' / ',' enclosed by two digits (rule 2).
      if (!isWordChar(c) && !isSeparatorBetweenDigits(pos))
      {
        break;
      }
      pos++;
    }

    // A token is always a contiguous slice of the text.
    String token = text.substring(start, pos);
    advanceToNextWordStart();
    return token;
  }

  /**
   * Moves {@code pos} forward to the next word character, or to the end of
   * the text when none is left.
   */
  private void advanceToNextWordStart()
  {
    while (pos < text.length() && !isWordChar(text.charAt(pos)))
    {
      pos++;
    }
  }

  /**
   * Tells whether the character at {@code index} is a '.' or ',' standing
   * directly between two ASCII digits.
   *
   * The digit test is the same ASCII test that isWordChar uses, so a
   * separator is only kept when the character after it is kept as well and
   * a token can never end with a dangling '.' or ','.
   */
  private boolean isSeparatorBetweenDigits(int index)
  {
    char c = text.charAt(index);
    if (c != '.' && c != ',')
    {
      return false;
    }
    if (index == 0 || index + 1 >= text.length())
    {
      return false;
    }
    return isAsciiDigit(text.charAt(index - 1))
        && isAsciiDigit(text.charAt(index + 1));
  }

  private static boolean isAsciiDigit(char c)
  {
    return c >= '0' && c <= '9';
  }

  private static boolean isWordChar(char c)
  {
    if (c >= 'a' && c <= 'z') return true;
    if (c >= 'A' && c <= 'Z') return true;
    if (isAsciiDigit(c)) return true;
    return c == '-' || c == '_' || c == '!';
  }

  /**
   * Splits {@code text} into all of its words.
   *
   * @param text the text to split, must not be null
   * @return the words in order of appearance, empty when the text has none
   */
  public static String[] extractWords(String text)
  {
    WordTokenizer tokenizer = new WordTokenizer(text);
    List<String> words = new ArrayList<String>();
    // While hasMoreTokens() is true the tokenizer stands on a word
    // character, so every token read here has at least one character.
    while (tokenizer.hasMoreTokens())
    {
      words.add(tokenizer.nextToken());
    }
    return words.toArray(new String[words.size()]);
  }

  /**
   * Small demo: prints the tokens of a sample text, one per line.
   */
  public static void main(String[] arguments)
  {
    WordTokenizer wt = new WordTokenizer("   Hello2-t!!!    th-is is teXtz 44.and another,a.b <html>Shit<pre></pre>"
    +" http://62.65.146.182.is  http://www.aaa.bbb a cool IP 34,23$       is my price</html>");
    while (wt.hasMoreTokens())
    {
      System.out.println(""+wt.nextToken());
    }
  }

} // WordTokenizer
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags