CharTokenizer


1   package org.apache.lucene.analysis;
2   
3   /**
4    * Copyright 2004 The Apache Software Foundation
5    *
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  import java.io.IOException  ;
20  import java.io.Reader  ;
21  
22  /** An abstract base class for simple, character-oriented tokenizers.*/
23  public abstract class CharTokenizer extends Tokenizer {
24    public CharTokenizer(Reader   input) {
25      super(input);
26    }
27  
28    private int offset = 0, bufferIndex = 0, dataLen = 0;
29    private static final int MAX_WORD_LEN = 255;
30    private static final int IO_BUFFER_SIZE = 1024;
31    private final char[] buffer = new char[MAX_WORD_LEN];
32    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
33  
34    /** Returns true iff a character should be included in a token.  This
35     * tokenizer generates as tokens adjacent sequences of characters which
36     * satisfy this predicate.  Characters for which this is false are used to
37     * define token boundaries and are not included in tokens. */
38    protected abstract boolean isTokenChar(char c);
39  
40    /** Called on each token character to normalize it before it is added to the
41     * token.  The default implementation does nothing.  Subclasses may use this
42     * to, e.g., lowercase tokens. */
43    protected char normalize(char c) {
44      return c;
45    }
46  
47    /** Returns the next token in the stream, or null at EOS. */
48    public final Token next() throws IOException   {
49      int length = 0;
50      int start = offset;
51      while (true) {
52        final char c;
53  
54        offset++;
55        if (bufferIndex >= dataLen) {
56          dataLen = input.read(ioBuffer);
57          bufferIndex = 0;
58        }
59        ;
60        if (dataLen == -1) {
61          if (length > 0)
62            break;
63          else
64            return null;
65        } else
66          c = ioBuffer[bufferIndex++];
67  
68        if (isTokenChar(c)) {               // if it's a token char
69  
70          if (length == 0)                       // start of token
71            start = offset - 1;
72  
73          buffer[length++] = normalize(c); // buffer it, normalized
74  
75          if (length == MAX_WORD_LEN)        // buffer overflow!
76            break;
77  
78        } else if (length > 0)             // at non-Letter w/ chars
79          break;                           // return 'em
80  
81      }
82  
83      return new Token(new String  (buffer, 0, length), start, start + length);
84    }
85  }
86
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags