KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > alfresco > repo > search > impl > lucene > analysis > PathTokenFilter


1 /*
2  * Copyright (C) 2005 Alfresco, Inc.
3  *
4  * Licensed under the Mozilla Public License version 1.1
5  * with a permitted attribution clause. You may obtain a
6  * copy of the License at
7  *
8  * http://www.alfresco.org/legal/license.txt
9  *
10  * Unless required by applicable law or agreed to in writing,
11  * software distributed under the License is distributed on an
12  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
13  * either express or implied. See the License for the specific
14  * language governing permissions and limitations under the
15  * License.
16  */

17 package org.alfresco.repo.search.impl.lucene.analysis;
18
19 import java.io.IOException JavaDoc;
20 import java.io.Reader JavaDoc;
21 import java.text.DecimalFormat JavaDoc;
22 import java.text.NumberFormat JavaDoc;
23 import java.util.Iterator JavaDoc;
24 import java.util.LinkedList JavaDoc;
25
26 import org.apache.lucene.analysis.Token;
27 import org.apache.lucene.analysis.Tokenizer;
28
29 /**
30  * @author andyh
31  *
32  * TODO To change the template for this generated type comment go to Window - Preferences - Java - Code Style - Code Templates
33  */

34 public class PathTokenFilter extends Tokenizer
35 {
36     public final static String JavaDoc INTEGER_FORMAT = "0000000000";
37
38     public final static char PATH_SEPARATOR = ';';
39
40     public final static char NAMESPACE_START_DELIMITER = '{';
41
42     public final static char NAMESPACE_END_DELIMITER = '}';
43
44     public final static String JavaDoc SEPARATOR_TOKEN_TEXT = ";";
45
46     public final static String JavaDoc NO_NS_TOKEN_TEXT = "<No Namespace>";
47
48     public final static String JavaDoc TOKEN_TYPE_PATH_SEP = "PATH_SEPARATOR";
49
50     public final static String JavaDoc TOKEN_TYPE_PATH_LENGTH = "PATH_LENGTH";
51
52     public final static String JavaDoc TOKEN_TYPE_PATH_ELEMENT_NAME = "PATH_ELEMENT_NAME";
53
54     public final static String JavaDoc TOKEN_TYPE_PATH_ELEMENT_NAMESPACE = "PATH_ELEMENT_NAMESPACE";
55     
56     public final static String JavaDoc TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX = "PATH_ELEMENT_NAMESPACE_PREFIX";
57
58     char pathSeparator;
59
60     String JavaDoc separatorTokenText;
61
62     String JavaDoc noNsTokenText;
63
64     char nsStartDelimiter;
65
66     int nsStartDelimiterLength;
67
68     char nsEndDelimiter;
69
70     int nsEndDelimiterLength;
71
72     char nsPrefixDelimiter = ':';
73
74     LinkedList JavaDoc<Token> tokens = new LinkedList JavaDoc<Token>();
75
76     Iterator JavaDoc<Token> it = null;
77
78     private boolean includeNamespace;
79
80     public PathTokenFilter(Reader JavaDoc in, char pathSeparator, String JavaDoc separatorTokenText, String JavaDoc noNsTokenText,
81             char nsStartDelimiter, char nsEndDelimiter, boolean includeNameSpace)
82     {
83         super(in);
84         this.pathSeparator = pathSeparator;
85         this.separatorTokenText = separatorTokenText;
86         this.noNsTokenText = noNsTokenText;
87         this.nsStartDelimiter = nsStartDelimiter;
88         this.nsEndDelimiter = nsEndDelimiter;
89         this.includeNamespace = includeNameSpace;
90
91         this.nsStartDelimiterLength = 1;
92         this.nsEndDelimiterLength = 1;
93
94     }
95
96     /*
97      * (non-Javadoc)
98      *
99      * @see org.apache.lucene.analysis.TokenStream#next()
100      */

101
102     public Token next() throws IOException JavaDoc
103     {
104         Token nextToken;
105         if (it == null)
106         {
107             buildTokenListAndIterator();
108         }
109         if (it.hasNext())
110         {
111             nextToken = it.next();
112         }
113         else
114         {
115             nextToken = null;
116         }
117         return nextToken;
118     }
119
120     private void buildTokenListAndIterator() throws IOException JavaDoc
121     {
122         NumberFormat JavaDoc nf = new DecimalFormat JavaDoc(INTEGER_FORMAT);
123
124         // Could optimise to read each path ata time - not just all paths
125
int insertCountAt = 0;
126         int lengthCounter = 0;
127         Token t;
128         Token pathSplitToken = null;
129         Token nameToken = null;
130         Token countToken = null;
131         Token namespaceToken = null;
132         while ((t = nextToken()) != null)
133         {
134             String JavaDoc text = t.termText();
135
136             if (text.length() == 0)
137             {
138                 continue; // Skip if we find // or /; or ;; etc
139
}
140
141             
142             if (text.charAt(text.length() - 1) == pathSeparator)
143             {
144                 text = text.substring(0, text.length() - 1);
145                 pathSplitToken = new Token(separatorTokenText, t.startOffset(), t.endOffset(), TOKEN_TYPE_PATH_SEP);
146                 pathSplitToken.setPositionIncrement(1);
147
148             }
149
150             int split = -1;
151             boolean isPrefix = false;
152
153             if ((text.length() > 0) && (text.charAt(0) == nsStartDelimiter))
154             {
155                 split = text.indexOf(nsEndDelimiter);
156             }
157
158             if (split == -1)
159             {
160                 split = text.indexOf(nsPrefixDelimiter);
161                 isPrefix = true;
162             }
163
164             if (split == -1)
165             {
166                 namespaceToken = new Token(noNsTokenText, t.startOffset(), t.startOffset(),
167                         TOKEN_TYPE_PATH_ELEMENT_NAMESPACE);
168                 nameToken = new Token(text, t.startOffset(), t.endOffset(), TOKEN_TYPE_PATH_ELEMENT_NAME);
169
170             }
171             else
172             {
173                 if (isPrefix)
174                 {
175                     namespaceToken = new Token(text.substring(0, split), t.startOffset(), t.startOffset() + split,
176                             TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX);
177                     nameToken = new Token(text.substring(split + 1), t.startOffset()
178                             + split + 1, t.endOffset(), TOKEN_TYPE_PATH_ELEMENT_NAME);
179                 }
180                 else
181                 {
182                     namespaceToken = new Token(text.substring(nsStartDelimiterLength,
183                             (split + nsEndDelimiterLength - 1)), t.startOffset(), t.startOffset() + split,
184                             TOKEN_TYPE_PATH_ELEMENT_NAMESPACE);
185                     nameToken = new Token(text.substring(split + nsEndDelimiterLength), t.startOffset()
186                             + split + nsEndDelimiterLength, t.endOffset(), TOKEN_TYPE_PATH_ELEMENT_NAME);
187                 }
188             }
189
190             namespaceToken.setPositionIncrement(1);
191             nameToken.setPositionIncrement(1);
192
193             if (includeNamespace)
194             {
195                 tokens.add(namespaceToken);
196             }
197             tokens.add(nameToken);
198
199             lengthCounter++;
200
201             if (pathSplitToken != null)
202             {
203
204                 String JavaDoc countString = nf.format(lengthCounter);
205                 countToken = new Token(countString, t.startOffset(), t.endOffset(), TOKEN_TYPE_PATH_SEP);
206                 countToken.setPositionIncrement(1);
207
208                 tokens.add(insertCountAt, countToken);
209                 tokens.add(pathSplitToken);
210
211                 lengthCounter = 0;
212                 insertCountAt = tokens.size();
213
214                 pathSplitToken = null;
215             }
216         }
217
218         String JavaDoc countString = nf.format(lengthCounter);
219         countToken = new Token(countString, 0, 0, TOKEN_TYPE_PATH_SEP);
220         countToken.setPositionIncrement(1);
221
222         tokens.add(insertCountAt, countToken);
223
224         if ((tokens.size() == 0) || !(tokens.get(tokens.size() - 1).termText().equals(TOKEN_TYPE_PATH_SEP)))
225         {
226             pathSplitToken = new Token(separatorTokenText, 0, 0, TOKEN_TYPE_PATH_SEP);
227             pathSplitToken.setPositionIncrement(1);
228             tokens.add(pathSplitToken);
229         }
230
231         it = tokens.iterator();
232     }
233
234     int readerPosition = 0;
235
236     private Token nextToken() throws IOException JavaDoc
237     {
238         if (readerPosition == -1)
239         {
240             return null;
241         }
242         StringBuilder JavaDoc buffer = new StringBuilder JavaDoc(64);
243         boolean inNameSpace = false;
244         int start = readerPosition;
245         int current;
246         char c;
247         while ((current = input.read()) != -1)
248         {
249             c = (char) current;
250             readerPosition++;
251             if (c == nsStartDelimiter)
252             {
253                 inNameSpace = true;
254             }
255             else if (c == nsEndDelimiter)
256             {
257                 inNameSpace = false;
258             }
259             else if (!inNameSpace && (c == '/'))
260             {
261                 return new Token(buffer.toString(), start, readerPosition - 1, "QNAME");
262             }
263             else if (!inNameSpace && (c == ';'))
264             {
265                 buffer.append(c);
266                 return new Token(buffer.toString(), start, readerPosition , "LASTQNAME");
267             }
268             
269             buffer.append(c);
270         }
271         readerPosition = -1;
272         if (!inNameSpace)
273         {
274             return new Token(buffer.toString(), start, readerPosition - 1, "QNAME");
275         }
276         else
277         {
278             throw new IllegalStateException JavaDoc("QName terminated incorrectly: " + buffer.toString());
279         }
280
281     }
282 }
Popular Tags