KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > eclipse > help > internal > search > WordTokenStream


1 /*******************************************************************************
2  * Copyright (c) 2000, 2006 IBM Corporation and others.
3  * All rights reserved. This program and the accompanying materials
4  * are made available under the terms of the Eclipse Public License v1.0
5  * which accompanies this distribution, and is available at
6  * http://www.eclipse.org/legal/epl-v10.html
7  *
8  * Contributors:
9  * IBM Corporation - initial API and implementation
10  *******************************************************************************/

11 package org.eclipse.help.internal.search;
12
13 import java.io.*;
14 import com.ibm.icu.text.BreakIterator;
15 import java.util.ArrayList JavaDoc;
16 import java.util.Locale JavaDoc;
17
18 import org.apache.lucene.analysis.*;
19
20 /**
21  * WordTokenStream obtains tokens containing words appropriate for use with
22  * Lucene search engine.
23  */

24 public final class WordTokenStream extends TokenStream {
25     private static final int BUF_LEN = 4096;
26     private static final int TOKENS_LEN = 512;
27     private final Reader reader;
28     private final BreakIterator boundary;
29     private final ArrayList JavaDoc tokens;
30     private int token;
31     private int noTokens;
32     private final char[] cbuf;
33     /**
34      * Constructor
35      */

36     public WordTokenStream(String JavaDoc fieldName, Reader reader, Locale JavaDoc locale) {
37         this.reader = reader;
38         boundary = BreakIterator.getWordInstance(locale);
39         cbuf = new char[BUF_LEN];
40         tokens = new ArrayList JavaDoc(TOKENS_LEN);
41
42     }
43     /**
44      * @see TokenStream#next()
45      */

46     public final Token next() throws IOException {
47         while (token >= noTokens) {
48             // read BUF_LEN of chars
49
int l;
50             while ((l = reader.read(cbuf)) <= 0) {
51                 if (l < 0) {
52                     // EOF
53
reader.close();
54                     return null;
55                 }
56             }
57             StringBuffer JavaDoc strbuf = new StringBuffer JavaDoc(l + 80);
58             strbuf.append(cbuf, 0, l);
59             // read more until white space (or EOF)
60
int c;
61             while (0 <= (c = reader.read())) {
62                 strbuf.append((char) c);
63                 if (c == ' ' || c == '\r' || c == '\n' || c == '\t') {
64                     break;
65                 }
66             }
67
68             String JavaDoc str = strbuf.toString();
69             boundary.setText(str);
70
71             int start = boundary.first();
72             tokens.clear();
73             wordsbreak : for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary
74                     .next()) {
75                 // determine if it is a word
76
// any letter or digit between boundaries means it is a word
77
for (int i = start; i < end; i++) {
78                     if (Character.isLetterOrDigit(str.charAt(i))) {
79                         // it is a word
80
tokens.add(new Token(str.substring(start, end), start,
81                                 end));
82                         continue wordsbreak;
83                     }
84                 }
85             }
86
87             if (c < 0) {
88                 reader.close();
89                 tokens.add((Token) null);
90             }
91             noTokens = tokens.size();
92             token = 0;
93         }
94
95         return (Token) tokens.get(token++);
96
97     }
98 }
99
Popular Tags