KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lucene > analysis > CharTokenizer


1 package org.apache.lucene.analysis;
2
3 /**
4  * Copyright 2004 The Apache Software Foundation
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */

18
19 import java.io.IOException JavaDoc;
20 import java.io.Reader JavaDoc;
21
22 /** An abstract base class for simple, character-oriented tokenizers.*/
23 public abstract class CharTokenizer extends Tokenizer {
24   public CharTokenizer(Reader JavaDoc input) {
25     super(input);
26   }
27
28   private int offset = 0, bufferIndex = 0, dataLen = 0;
29   private static final int MAX_WORD_LEN = 255;
30   private static final int IO_BUFFER_SIZE = 1024;
31   private final char[] buffer = new char[MAX_WORD_LEN];
32   private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
33
34   /** Returns true iff a character should be included in a token. This
35    * tokenizer generates as tokens adjacent sequences of characters which
36    * satisfy this predicate. Characters for which this is false are used to
37    * define token boundaries and are not included in tokens. */

38   protected abstract boolean isTokenChar(char c);
39
40   /** Called on each token character to normalize it before it is added to the
41    * token. The default implementation does nothing. Subclasses may use this
42    * to, e.g., lowercase tokens. */

43   protected char normalize(char c) {
44     return c;
45   }
46
47   /** Returns the next token in the stream, or null at EOS. */
48   public final Token next() throws IOException JavaDoc {
49     int length = 0;
50     int start = offset;
51     while (true) {
52       final char c;
53
54       offset++;
55       if (bufferIndex >= dataLen) {
56         dataLen = input.read(ioBuffer);
57         bufferIndex = 0;
58       }
59       ;
60       if (dataLen == -1) {
61         if (length > 0)
62           break;
63         else
64           return null;
65       } else
66         c = ioBuffer[bufferIndex++];
67
68       if (isTokenChar(c)) { // if it's a token char
69

70         if (length == 0) // start of token
71
start = offset - 1;
72
73         buffer[length++] = normalize(c); // buffer it, normalized
74

75         if (length == MAX_WORD_LEN) // buffer overflow!
76
break;
77
78       } else if (length > 0) // at non-Letter w/ chars
79
break; // return 'em
80

81     }
82
83     return new Token(new String JavaDoc(buffer, 0, length), start, start + length);
84   }
85 }
86
Popular Tags