1 11 package org.eclipse.help.internal.search; 12 13 import java.io.*; 14 import com.ibm.icu.text.BreakIterator; 15 import java.util.ArrayList ; 16 import java.util.Locale ; 17 18 import org.apache.lucene.analysis.*; 19 20 24 public final class WordTokenStream extends TokenStream { 25 private static final int BUF_LEN = 4096; 26 private static final int TOKENS_LEN = 512; 27 private final Reader reader; 28 private final BreakIterator boundary; 29 private final ArrayList tokens; 30 private int token; 31 private int noTokens; 32 private final char[] cbuf; 33 36 public WordTokenStream(String fieldName, Reader reader, Locale locale) { 37 this.reader = reader; 38 boundary = BreakIterator.getWordInstance(locale); 39 cbuf = new char[BUF_LEN]; 40 tokens = new ArrayList (TOKENS_LEN); 41 42 } 43 46 public final Token next() throws IOException { 47 while (token >= noTokens) { 48 int l; 50 while ((l = reader.read(cbuf)) <= 0) { 51 if (l < 0) { 52 reader.close(); 54 return null; 55 } 56 } 57 StringBuffer strbuf = new StringBuffer (l + 80); 58 strbuf.append(cbuf, 0, l); 59 int c; 61 while (0 <= (c = reader.read())) { 62 strbuf.append((char) c); 63 if (c == ' ' || c == '\r' || c == '\n' || c == '\t') { 64 break; 65 } 66 } 67 68 String str = strbuf.toString(); 69 boundary.setText(str); 70 71 int start = boundary.first(); 72 tokens.clear(); 73 wordsbreak : for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary 74 .next()) { 75 for (int i = start; i < end; i++) { 78 if (Character.isLetterOrDigit(str.charAt(i))) { 79 tokens.add(new Token(str.substring(start, end), start, 81 end)); 82 continue wordsbreak; 83 } 84 } 85 } 86 87 if (c < 0) { 88 reader.close(); 89 tokens.add((Token) null); 90 } 91 noTokens = tokens.size(); 92 token = 0; 93 } 94 95 return (Token) tokens.get(token++); 96 97 } 98 } 99 | Popular Tags |