PathTokenFilter


1   /*
2    * Copyright (C) 2005 Alfresco, Inc.
3    *
4    * Licensed under the Mozilla Public License version 1.1 
5    * with a permitted attribution clause. You may obtain a
6    * copy of the License at
7    *
8    *   http://www.alfresco.org/legal/license.txt
9    *
10   * Unless required by applicable law or agreed to in writing,
11   * software distributed under the License is distributed on an
12   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
13   * either express or implied. See the License for the specific
14   * language governing permissions and limitations under the
15   * License.
16   */
17  package org.alfresco.repo.search.impl.lucene.analysis;
18  
19  import java.io.IOException  ;
20  import java.io.Reader  ;
21  import java.text.DecimalFormat  ;
22  import java.text.NumberFormat  ;
23  import java.util.Iterator  ;
24  import java.util.LinkedList  ;
25  
26  import org.apache.lucene.analysis.Token;
27  import org.apache.lucene.analysis.Tokenizer;
28  
29  /**
30   * @author andyh
31   * 
32   * TODO To change the template for this generated type comment go to Window - Preferences - Java - Code Style - Code Templates
33   */
34  public class PathTokenFilter extends Tokenizer
35  {
36      public final static String   INTEGER_FORMAT = "0000000000";
37  
38      public final static char PATH_SEPARATOR = ';';
39  
40      public final static char NAMESPACE_START_DELIMITER = '{';
41  
42      public final static char NAMESPACE_END_DELIMITER = '}';
43  
44      public final static String   SEPARATOR_TOKEN_TEXT = ";";
45  
46      public final static String   NO_NS_TOKEN_TEXT = "<No Namespace>";
47  
48      public final static String   TOKEN_TYPE_PATH_SEP = "PATH_SEPARATOR";
49  
50      public final static String   TOKEN_TYPE_PATH_LENGTH = "PATH_LENGTH";
51  
52      public final static String   TOKEN_TYPE_PATH_ELEMENT_NAME = "PATH_ELEMENT_NAME";
53  
54      public final static String   TOKEN_TYPE_PATH_ELEMENT_NAMESPACE = "PATH_ELEMENT_NAMESPACE";
55      
56      public final static String   TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX = "PATH_ELEMENT_NAMESPACE_PREFIX";
57  
58      char pathSeparator;
59  
60      String   separatorTokenText;
61  
62      String   noNsTokenText;
63  
64      char nsStartDelimiter;
65  
66      int nsStartDelimiterLength;
67  
68      char nsEndDelimiter;
69  
70      int nsEndDelimiterLength;
71  
72      char nsPrefixDelimiter = ':';
73  
74      LinkedList  <Token> tokens = new LinkedList  <Token>();
75  
76      Iterator  <Token> it = null;
77  
78      private boolean includeNamespace;
79  
80      public PathTokenFilter(Reader   in, char pathSeparator, String   separatorTokenText, String   noNsTokenText,
81              char nsStartDelimiter, char nsEndDelimiter, boolean includeNameSpace)
82      {
83          super(in);
84          this.pathSeparator = pathSeparator;
85          this.separatorTokenText = separatorTokenText;
86          this.noNsTokenText = noNsTokenText;
87          this.nsStartDelimiter = nsStartDelimiter;
88          this.nsEndDelimiter = nsEndDelimiter;
89          this.includeNamespace = includeNameSpace;
90  
91          this.nsStartDelimiterLength = 1;
92          this.nsEndDelimiterLength = 1;
93  
94      }
95  
96      /*
97       * (non-Javadoc)
98       * 
99       * @see org.apache.lucene.analysis.TokenStream#next()
100      */
101 
102     public Token next() throws IOException  
103     {
104         Token nextToken;
105         if (it == null)
106         {
107             buildTokenListAndIterator();
108         }
109         if (it.hasNext())
110         {
111             nextToken = it.next();
112         }
113         else
114         {
115             nextToken = null;
116         }
117         return nextToken;
118     }
119 
120     private void buildTokenListAndIterator() throws IOException  
121     {
122         NumberFormat   nf = new DecimalFormat  (INTEGER_FORMAT);
123 
124         // Could optimise to read each path ata time - not just all paths
125         int insertCountAt = 0;
126         int lengthCounter = 0;
127         Token t;
128         Token pathSplitToken = null;
129         Token nameToken = null;
130         Token countToken = null;
131         Token namespaceToken = null;
132         while ((t = nextToken()) != null)
133         {
134             String   text = t.termText();
135 
136             if (text.length() == 0)
137             {
138                 continue; //  Skip  if we find // or /; or ;; etc 
139             }
140 
141             
142             if (text.charAt(text.length() - 1) == pathSeparator)
143             {
144                 text = text.substring(0, text.length() - 1);
145                 pathSplitToken = new Token(separatorTokenText, t.startOffset(), t.endOffset(), TOKEN_TYPE_PATH_SEP);
146                 pathSplitToken.setPositionIncrement(1);
147 
148             }
149 
150             int split = -1;
151             boolean isPrefix = false;
152 
153             if ((text.length() > 0) && (text.charAt(0) == nsStartDelimiter))
154             {
155                 split = text.indexOf(nsEndDelimiter);
156             }
157 
158             if (split == -1)
159             {
160                 split = text.indexOf(nsPrefixDelimiter);
161                 isPrefix = true;
162             }
163 
164             if (split == -1)
165             {
166                 namespaceToken = new Token(noNsTokenText, t.startOffset(), t.startOffset(),
167                         TOKEN_TYPE_PATH_ELEMENT_NAMESPACE);
168                 nameToken = new Token(text, t.startOffset(), t.endOffset(), TOKEN_TYPE_PATH_ELEMENT_NAME);
169 
170             }
171             else
172             {
173                 if (isPrefix)
174                 {
175                     namespaceToken = new Token(text.substring(0, split), t.startOffset(), t.startOffset() + split,
176                             TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX);
177                     nameToken = new Token(text.substring(split + 1), t.startOffset()
178                             + split + 1, t.endOffset(), TOKEN_TYPE_PATH_ELEMENT_NAME);
179                 }
180                 else
181                 {
182                     namespaceToken = new Token(text.substring(nsStartDelimiterLength,
183                             (split + nsEndDelimiterLength - 1)), t.startOffset(), t.startOffset() + split,
184                             TOKEN_TYPE_PATH_ELEMENT_NAMESPACE);
185                     nameToken = new Token(text.substring(split + nsEndDelimiterLength), t.startOffset()
186                             + split + nsEndDelimiterLength, t.endOffset(), TOKEN_TYPE_PATH_ELEMENT_NAME);
187                 }
188             }
189 
190             namespaceToken.setPositionIncrement(1);
191             nameToken.setPositionIncrement(1);
192 
193             if (includeNamespace)
194             {
195                 tokens.add(namespaceToken);
196             }
197             tokens.add(nameToken);
198 
199             lengthCounter++;
200 
201             if (pathSplitToken != null)
202             {
203 
204                 String   countString = nf.format(lengthCounter);
205                 countToken = new Token(countString, t.startOffset(), t.endOffset(), TOKEN_TYPE_PATH_SEP);
206                 countToken.setPositionIncrement(1);
207 
208                 tokens.add(insertCountAt, countToken);
209                 tokens.add(pathSplitToken);
210 
211                 lengthCounter = 0;
212                 insertCountAt = tokens.size();
213 
214                 pathSplitToken = null;
215             }
216         }
217 
218         String   countString = nf.format(lengthCounter);
219         countToken = new Token(countString, 0, 0, TOKEN_TYPE_PATH_SEP);
220         countToken.setPositionIncrement(1);
221 
222         tokens.add(insertCountAt, countToken);
223 
224         if ((tokens.size() == 0) || !(tokens.get(tokens.size() - 1).termText().equals(TOKEN_TYPE_PATH_SEP)))
225         {
226             pathSplitToken = new Token(separatorTokenText, 0, 0, TOKEN_TYPE_PATH_SEP);
227             pathSplitToken.setPositionIncrement(1);
228             tokens.add(pathSplitToken);
229         }
230 
231         it = tokens.iterator();
232     }
233 
234     int readerPosition = 0;
235 
236     private Token nextToken() throws IOException  
237     {
238         if (readerPosition == -1)
239         {
240             return null;
241         }
242         StringBuilder   buffer = new StringBuilder  (64);
243         boolean inNameSpace = false;
244         int start = readerPosition;
245         int current;
246         char c;
247         while ((current = input.read()) != -1)
248         {
249             c = (char) current;
250             readerPosition++;
251             if (c == nsStartDelimiter)
252             {
253                 inNameSpace = true;
254             }
255             else if (c == nsEndDelimiter)
256             {
257                 inNameSpace = false;
258             }
259             else if (!inNameSpace && (c == '/'))
260             {
261                 return new Token(buffer.toString(), start, readerPosition - 1, "QNAME");
262             }
263             else if (!inNameSpace && (c == ';'))
264             {
265                 buffer.append(c);
266                 return new Token(buffer.toString(), start, readerPosition , "LASTQNAME");
267             }
268             
269             buffer.append(c);
270         }
271         readerPosition = -1;
272         if (!inNameSpace)
273         {
274             return new Token(buffer.toString(), start, readerPosition - 1, "QNAME");
275         }
276         else
277         {
278             throw new IllegalStateException  ("QName terminated incorrectly: " + buffer.toString());
279         }
280 
281     }
282 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags