package org.apache.lucene.analysis;

/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;

/**
 * A tokenizer that divides text at non-letters. In other words, a token is a
 * maximal run of adjacent characters for which
 * {@link Character#isLetter(char)} returns <code>true</code>.
 *
 * <p>Note: this does a decent job for most European languages, but does a
 * terrible job for some Asian languages, where words are not separated by
 * spaces.
 */
public class LetterTokenizer extends CharTokenizer {

  /**
   * Construct a new LetterTokenizer.
   *
   * @param in the reader handed to the {@link CharTokenizer} constructor
   */
  public LetterTokenizer(Reader in) {
    super(in);
  }

  /**
   * Decides whether a character belongs to a token. Only characters which
   * satisfy {@link Character#isLetter(char)} are collected.
   *
   * @param c the character to test
   * @return <code>true</code> if <code>c</code> is a letter,
   *         <code>false</code> otherwise
   */
  protected boolean isTokenChar(char c) {
    final boolean letter = Character.isLetter(c);
    return letter;
  }
}