KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > daffodilwoods > daffodildb > server > sql99 > fulltext > common > ByteTokenizer


1 package com.daffodilwoods.daffodildb.server.sql99.fulltext.common;
2 import com.daffodilwoods.daffodildb.utils.field.FieldBase;
3 import com.daffodilwoods.daffodildb.utils.FieldObjectLiteral;
4 import java.util.ArrayList JavaDoc;
5 import com.daffodilwoods.daffodildb.utils.BufferRange;
6 import com.daffodilwoods.daffodildb.utils.field.FieldBinary;
7 import com.daffodilwoods.daffodildb.server.sql99.common.Datatypes;
8 import com.daffodilwoods.database.utility.P;
9 import com.daffodilwoods.fulltext.common._Tokenizer;
10 import com.daffodilwoods.fulltext.common._Token;
11
12 /**
13  * <p>Title: ByteTokenizer</p>
14  * The byte tokenizer class allows an application to break a
15  * object into tokens.Objects are break on delimiter.All other character except
16  * alphabet are considered delimiter on which bytetokenizer do tokenize the object.
17  * The following is one example of the use of the tokenizer. The code:
18  * <blockquote><pre>
19  * byte[] temp =new byte[]{100,97,102,102,111,100,105,108,32,105,115,32,97,32,115,111,102,116,119,97,114,46,110,120};
20  * ByteTokenizer bt = new ByteTokenizer(temp);
21  * while (bt.hasMoreTokens()) {
22  * println(st.nextToken());
23  * }
24  */

25
26 public class ByteTokenizer implements _Tokenizer {
27
28   /**
29    * start position represent position from which ByteTokenizer start tokenizing the object
30    */

31   int startPosition = 0;
32   /**
33    * end position represent end postion upto which ByteTokenizer have to tokenize the object
34    * ie Object is byte[] startposition is position from bytetokenizer start and endposition represent upto which tokenizer do tokenize
35    */

36   int endPosition;
37   /**
38    * token count represent position of token.
39    * Eg daffodil is a software company token count for token software is 4
40    */

41   long tokenCount;
42   /**
43    * Represent Object which is being tokenize
44    */

45
46   BufferRange TokenToBeParsed;
47   /**
48    * Represent token after stemming .
49    * Stemming is process in which token converted to its root form.
50    */

51   byte[] tokenAfterStemming;
52   /**
53    * Refrence of stemmer required for stemming.
54    */

55   Stemmer st;
56   /**
57    * Variable that represents the number of tokens in the previous column(s)
58    * in case of multiple index.
59    */

60   long length;
61
62   public ByteTokenizer(Object JavaDoc obj) { //byte[] TokenToBeParsed
63
TokenToBeParsed = ( (FieldBase) obj).getBufferRange();
64     st = new Stemmer();
65     startPosition = 0;//TokenToBeParsed.getOffSet();
66
endPosition = TokenToBeParsed.getLength();
67     length = 0;
68   }
69
70   public ByteTokenizer(Object JavaDoc obj, long length0) { //byte[] TokenToBeParsed
71
TokenToBeParsed = ( (FieldBase) obj).getBufferRange();
72     st = new Stemmer();
73     startPosition = 0;//TokenToBeParsed.getOffSet();
74
endPosition = TokenToBeParsed.getLength();
75     length = length0;
76   }
77   /**
78    * convert passed token into its lowercase.
79    * return token after converting it into lowercase.
80    * @param tokenToBeConverted
81    * @return
82    */

83
84   private byte[] convertToLowerCase(byte[] tokenToBeConverted) {
85     for (int i = 0; i < tokenToBeConverted.length; i++)
86       tokenToBeConverted[i] = (byte)Character.toLowerCase( (char) tokenToBeConverted[i]);
87     return tokenToBeConverted;
88
89   }
90
91   /**
92    * Skips delimiters starting from the start position. If isDelimiter
93    * is false, returns the index of the first non-delimiter character at or
94    * after startPos. If isDelimiter is true, continue the check of isdelimiter
95    * upto nondelimiter charater is found or endposition doesnot reach .
96    */

97
98   private int skipDelimiter() {
99     int i;
100     for (i = startPosition; i < endPosition; i++) {
101       if (!isDelimiter(i)) {
102         break;
103       }
104     }
105     return i;
106   }
107   /**
108    * hasMoreToken Check if there is more token available or not
109    * if hasMoreToken return true nextToken is called to get the token.
110    * hasMoreToken first get token then covert then token to lower case
111    * after converting token to lower case.it check token is stop word or not.
112    * (stop word are list of word that are not consider during parsing like is,
113    * was ,will etc) if token is stopword we ignore this token and call for test
114    * hasMoreToken present or not.else stemming is done.stemming convert the token
115    * into its root form like starting is converted to start.
116    * @return
117    */

118
119   public boolean hasMoreToken() {
120     startPosition = skipDelimiter();
121     boolean stopFlag = true;
122     int i, count = 1;
123     if (startPosition < endPosition) {
124       for (i = startPosition + 1; i < endPosition; i++) {
125         if (!isDelimiter(i))
126           count++;
127         else
128           break;
129       }
130       byte[] token = new byte[count];
131       byte[] tokenInLowerCase = convertToLowerCase(copyInToArray(token,startPosition));
132       startPosition = i;
133       tokenCount++; //this is used for maintaining location of particular token
134
stopFlag = StopWords.checkStopWords(tokenInLowerCase);
135       if (stopFlag)
136         return hasMoreToken();
137       tokenAfterStemming = st.stemmingToken(tokenInLowerCase);
138       return (!stopFlag && startPosition <= endPosition);
139     }
140     return false;
141   }
142
143   /**
144    *It return the next token
145    * @return
146    */

147
148   public _Token nextToken() {
149     BufferRange bf = new BufferRange(tokenAfterStemming);
150     FieldBase fb = new FieldBinary(bf, Datatypes.BINARY);
151     _Token tk = new ByteToken(fb, tokenCount+length);
152     return tk;
153   }
154
155   private byte[] copyInToArray(byte[] token,int position){
156     for(int i=position,j=0;i<position+token.length;i++,j++){
157       token[j] = TokenToBeParsed.getByte(i);
158     }
159     return token;
160   }
161
162   /**
163    * check wheather character at particular position is delimiter or not.
164    * if it is delmiter return true else return false.All other charater except
165    * alphabet are considered delimiter.
166    * @param currentPosition
167    * @return
168    */

169
170   private boolean isDelimiter(int currentPosition) {
171     byte b = TokenToBeParsed.getByte(currentPosition);
172     return b<39 || (b>39 && b < 48) || b > 122 || (b > 57 && b < 65) || (b > 90 && b < 97);
173   }
174
175
176
177
178 }
179
Popular Tags