KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > lucene > analysis > Token


1 package org.apache.lucene.analysis;
2
3 /**
4  * Copyright 2004 The Apache Software Foundation
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */

18
19 /** A Token is an occurence of a term from the text of a field. It consists of
20   a term's text, the start and end offset of the term in the text of the field,
21   and a type string.
22
23   The start and end offsets permit applications to re-associate a token with
24   its source text, e.g., to display highlighted query terms in a document
25   browser, or to show matching text fragments in a KWIC (KeyWord In Context)
26   display, etc.
27
28   The type is an interned string, assigned by a lexical analyzer
29   (a.k.a. tokenizer), naming the lexical or syntactic class that the token
30   belongs to. For example an end of sentence marker token might be implemented
31   with type "eos". The default token type is "word". */

32
33 public final class Token {
34   String JavaDoc termText; // the text of the term
35
int startOffset; // start in source text
36
int endOffset; // end in source text
37
String JavaDoc type = "word"; // lexical type
38

39   private int positionIncrement = 1;
40
41   /** Constructs a Token with the given term text, and start & end offsets.
42       The type defaults to "word." */

43   public Token(String JavaDoc text, int start, int end) {
44     termText = text;
45     startOffset = start;
46     endOffset = end;
47   }
48
49   /** Constructs a Token with the given text, start and end offsets, & type. */
50   public Token(String JavaDoc text, int start, int end, String JavaDoc typ) {
51     termText = text;
52     startOffset = start;
53     endOffset = end;
54     type = typ;
55   }
56
57   /** Set the position increment. This determines the position of this token
58    * relative to the previous Token in a {@link TokenStream}, used in phrase
59    * searching.
60    *
61    * <p>The default value is one.
62    *
63    * <p>Some common uses for this are:<ul>
64    *
65    * <li>Set it to zero to put multiple terms in the same position. This is
66    * useful if, e.g., a word has multiple stems. Searches for phrases
67    * including either stem will match. In this case, all but the first stem's
68    * increment should be set to zero: the increment of the first instance
69    * should be one. Repeating a token with an increment of zero can also be
70    * used to boost the scores of matches on that token.
71    *
72    * <li>Set it to values greater than one to inhibit exact phrase matches.
73    * If, for example, one does not want phrases to match across removed stop
74    * words, then one could build a stop word filter that removes stop words and
75    * also sets the increment to the number of stop words removed before each
76    * non-stop word. Then exact phrase queries will only match when the terms
77    * occur with no intervening stop words.
78    *
79    * </ul>
80    * @see org.apache.lucene.index.TermPositions
81    */

82   public void setPositionIncrement(int positionIncrement) {
83     if (positionIncrement < 0)
84       throw new IllegalArgumentException JavaDoc
85         ("Increment must be zero or greater: " + positionIncrement);
86     this.positionIncrement = positionIncrement;
87   }
88
89   /** Returns the position increment of this Token.
90    * @see #setPositionIncrement
91    */

92   public int getPositionIncrement() { return positionIncrement; }
93
94   /** Returns the Token's term text. */
95   public final String JavaDoc termText() { return termText; }
96
97   /** Returns this Token's starting offset, the position of the first character
98     corresponding to this token in the source text.
99
100     Note that the difference between endOffset() and startOffset() may not be
101     equal to termText.length(), as the term text may have been altered by a
102     stemmer or some other filter. */

103   public final int startOffset() { return startOffset; }
104
105   /** Returns this Token's ending offset, one greater than the position of the
106     last character corresponding to this token in the source text. */

107   public final int endOffset() { return endOffset; }
108
109   /** Returns this Token's lexical type. Defaults to "word". */
110   public final String JavaDoc type() { return type; }
111
112   public final String JavaDoc toString() {
113     StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
114     sb.append("(" + termText + "," + startOffset + "," + endOffset);
115     if (!type.equals("word"))
116       sb.append(",type="+type);
117     if (positionIncrement != 1)
118       sb.append(",posIncr="+positionIncrement);
119     sb.append(")");
120     return sb.toString();
121   }
122 }
123
Popular Tags