1 package com.daffodilwoods.daffodildb.server.sql99.fulltext.common; 2 import com.daffodilwoods.daffodildb.utils.field.FieldBase; 3 import com.daffodilwoods.daffodildb.utils.FieldObjectLiteral; 4 import java.util.ArrayList ; 5 import com.daffodilwoods.daffodildb.utils.BufferRange; 6 import com.daffodilwoods.daffodildb.utils.field.FieldBinary; 7 import com.daffodilwoods.daffodildb.server.sql99.common.Datatypes; 8 import com.daffodilwoods.database.utility.P; 9 import com.daffodilwoods.fulltext.common._Tokenizer; 10 import com.daffodilwoods.fulltext.common._Token; 11 12 25 26 public class ByteTokenizer implements _Tokenizer { 27 28 31 int startPosition = 0; 32 36 int endPosition; 37 41 long tokenCount; 42 45 46 BufferRange TokenToBeParsed; 47 51 byte[] tokenAfterStemming; 52 55 Stemmer st; 56 60 long length; 61 62 public ByteTokenizer(Object obj) { TokenToBeParsed = ( (FieldBase) obj).getBufferRange(); 64 st = new Stemmer(); 65 startPosition = 0; endPosition = TokenToBeParsed.getLength(); 67 length = 0; 68 } 69 70 public ByteTokenizer(Object obj, long length0) { TokenToBeParsed = ( (FieldBase) obj).getBufferRange(); 72 st = new Stemmer(); 73 startPosition = 0; endPosition = TokenToBeParsed.getLength(); 75 length = length0; 76 } 77 83 84 private byte[] convertToLowerCase(byte[] tokenToBeConverted) { 85 for (int i = 0; i < tokenToBeConverted.length; i++) 86 tokenToBeConverted[i] = (byte)Character.toLowerCase( (char) tokenToBeConverted[i]); 87 return tokenToBeConverted; 88 89 } 90 91 97 98 private int skipDelimiter() { 99 int i; 100 for (i = startPosition; i < endPosition; i++) { 101 if (!isDelimiter(i)) { 102 break; 103 } 104 } 105 return i; 106 } 107 118 119 public boolean hasMoreToken() { 120 startPosition = skipDelimiter(); 121 boolean stopFlag = true; 122 int i, count = 1; 123 if (startPosition < endPosition) { 124 for (i = startPosition + 1; i < endPosition; i++) { 125 if (!isDelimiter(i)) 126 count++; 127 else 128 break; 129 } 130 byte[] token = new byte[count]; 131 byte[] tokenInLowerCase = convertToLowerCase(copyInToArray(token,startPosition)); 132 startPosition = i; 133 tokenCount++; stopFlag = StopWords.checkStopWords(tokenInLowerCase); 135 if (stopFlag) 136 return hasMoreToken(); 137 tokenAfterStemming = st.stemmingToken(tokenInLowerCase); 138 return (!stopFlag && startPosition <= endPosition); 139 } 140 return false; 141 } 142 143 147 148 public _Token nextToken() { 149 BufferRange bf = new BufferRange(tokenAfterStemming); 150 FieldBase fb = new FieldBinary(bf, Datatypes.BINARY); 151 _Token tk = new ByteToken(fb, tokenCount+length); 152 return tk; 153 } 154 155 private byte[] copyInToArray(byte[] token,int position){ 156 for(int i=position,j=0;i<position+token.length;i++,j++){ 157 token[j] = TokenToBeParsed.getByte(i); 158 } 159 return token; 160 } 161 162 169 170 private boolean isDelimiter(int currentPosition) { 171 byte b = TokenToBeParsed.getByte(currentPosition); 172 return b<39 || (b>39 && b < 48) || b > 122 || (b > 57 && b < 65) || (b > 90 && b < 97); 173 } 174 175 176 177 178 } 179 | Popular Tags |