LetterTokenizer


1   package org.apache.lucene.analysis;
2   
3   /**
4    * Copyright 2004 The Apache Software Foundation
5    *
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  import java.io.Reader  ;
20  
21  /** A LetterTokenizer is a tokenizer that divides text at non-letters.  That's
22    to say, it defines tokens as maximal strings of adjacent letters, as defined
23    by java.lang.Character.isLetter() predicate.
24  
25    Note: this does a decent job for most European languages, but does a terrible
26    job for some Asian languages, where words are not separated by spaces. */
27  
28  public class LetterTokenizer extends CharTokenizer {
29    /** Construct a new LetterTokenizer. */
30    public LetterTokenizer(Reader   in) {
31      super(in);
32    }
33  
34    /** Collects only characters which satisfy
35     * {@link Character#isLetter(char)}.*/
36    protected boolean isTokenChar(char c) {
37      return Character.isLetter(c);
38    }
39  }
40

A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame

Popular Tags