package org.alfresco.repo.search.impl.lucene.analysis;

import java.io.IOException;
import java.io.Reader;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.Iterator;
import java.util.LinkedList;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;

/**
 * Tokenizer for path strings.
 * <p>
 * The input is read as a sequence of path elements separated by <code>/</code>, with <code>;</code>
 * terminating one path and starting the next. Each element is split into an optional namespace
 * (either <code>{uri}name</code> or <code>prefix:name</code>) and a name. For every path the emitted
 * stream contains, in order: a zero-padded element count, the element tokens, and a separator token.
 * <p>
 * The complete token list is built on the first call to {@link #next()} and then replayed.
 */
public class PathTokenFilter extends Tokenizer
{
    /** {@link DecimalFormat} pattern used to render the per-path element count token. */
    public final static String INTEGER_FORMAT = "0000000000";

    public final static char PATH_SEPARATOR = ';';

    public final static char NAMESPACE_START_DELIMITER = '{';

    public final static char NAMESPACE_END_DELIMITER = '}';

    public final static String SEPARATOR_TOKEN_TEXT = ";";

    public final static String NO_NS_TOKEN_TEXT = "<No Namespace>";

    /** Token type used for both the separator token and the element count token. */
    public final static String TOKEN_TYPE_PATH_SEP = "PATH_SEPARATOR";

    public final static String TOKEN_TYPE_PATH_LENGTH = "PATH_LENGTH";

    public final static String TOKEN_TYPE_PATH_ELEMENT_NAME = "PATH_ELEMENT_NAME";

    public final static String TOKEN_TYPE_PATH_ELEMENT_NAMESPACE = "PATH_ELEMENT_NAMESPACE";

    public final static String TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX = "PATH_ELEMENT_NAMESPACE_PREFIX";

    /** Trailing character that marks the last element of a path. */
    char pathSeparator;

    /** Term text of the separator token emitted after each path. */
    String separatorTokenText;

    /** Term text of the namespace token emitted for an element that has no namespace. */
    String noNsTokenText;

    char nsStartDelimiter;

    /** Always 1: the start delimiter is a single character. */
    int nsStartDelimiterLength;

    char nsEndDelimiter;

    /** Always 1: the end delimiter is a single character. */
    int nsEndDelimiterLength;

    /** Separates a namespace prefix from the local name, as in <code>prefix:name</code>. */
    char nsPrefixDelimiter = ':';

    /** All tokens for the input, filled by {@link #buildTokenListAndIterator()}. */
    LinkedList<Token> tokens = new LinkedList<Token>();

    /** Iterator over {@link #tokens}; <code>null</code> until the list has been built. */
    Iterator<Token> it = null;

    /** When <code>false</code>, namespace tokens are created but not added to the output. */
    private boolean includeNamespace;

    /**
     * Number of characters consumed from the reader so far, or -1 once the reader is exhausted.
     */
    int readerPosition = 0;

    /**
     * @param in                 the reader supplying the path text
     * @param pathSeparator      trailing character that marks the last element of a path
     * @param separatorTokenText term text for the separator token emitted after each path
     * @param noNsTokenText      term text for the namespace token of an element without a namespace
     * @param nsStartDelimiter   character opening a <code>{uri}</code> style namespace
     * @param nsEndDelimiter     character closing a <code>{uri}</code> style namespace
     * @param includeNameSpace   whether namespace tokens are included in the output
     */
    public PathTokenFilter(Reader in, char pathSeparator, String separatorTokenText, String noNsTokenText,
            char nsStartDelimiter, char nsEndDelimiter, boolean includeNameSpace)
    {
        super(in);
        this.pathSeparator = pathSeparator;
        this.separatorTokenText = separatorTokenText;
        this.noNsTokenText = noNsTokenText;
        this.nsStartDelimiter = nsStartDelimiter;
        this.nsEndDelimiter = nsEndDelimiter;
        this.includeNamespace = includeNameSpace;

        this.nsStartDelimiterLength = 1;
        this.nsEndDelimiterLength = 1;
    }

    /**
     * Returns the next token of the stream, building the whole token list on the first call.
     *
     * @return the next token, or <code>null</code> when there are no more tokens
     * @throws IOException if reading the underlying reader fails
     * @throws IllegalStateException if the input ends inside an unterminated namespace
     */
    public Token next() throws IOException
    {
        if (it == null)
        {
            buildTokenListAndIterator();
        }
        return it.hasNext() ? it.next() : null;
    }

    /**
     * Reads every raw element from the reader, converts it to namespace/name tokens and wraps each
     * path with a leading count token and a trailing separator token. Leaves {@link #it} positioned
     * at the start of the resulting list.
     */
    private void buildTokenListAndIterator() throws IOException
    {
        NumberFormat nf = new DecimalFormat(INTEGER_FORMAT);

        // Index in 'tokens' at which the count token of the path being read must be inserted.
        int insertCountAt = 0;
        // Number of elements seen so far in the path being read.
        int lengthCounter = 0;

        Token t;
        while ((t = nextToken()) != null)
        {
            String text = t.termText();

            // Empty elements (for example from a leading or doubled '/') produce no tokens.
            if (text.length() == 0)
            {
                continue;
            }

            // A trailing path separator marks the last element of a path: strip it and remember
            // to close the path once the element's own tokens have been added.
            Token pathSplitToken = null;
            if (text.charAt(text.length() - 1) == pathSeparator)
            {
                text = text.substring(0, text.length() - 1);
                pathSplitToken = new Token(separatorTokenText, t.startOffset(), t.endOffset(), TOKEN_TYPE_PATH_SEP);
                pathSplitToken.setPositionIncrement(1);
            }

            // Look for "{uri}name" first, then fall back to "prefix:name".
            int split = -1;
            boolean isPrefix = false;

            if ((text.length() > 0) && (text.charAt(0) == nsStartDelimiter))
            {
                split = text.indexOf(nsEndDelimiter);
            }

            if (split == -1)
            {
                split = text.indexOf(nsPrefixDelimiter);
                isPrefix = true;
            }

            Token namespaceToken;
            Token nameToken;
            if (split == -1)
            {
                // No namespace at all: emit the placeholder namespace with a zero-width offset.
                namespaceToken = new Token(noNsTokenText, t.startOffset(), t.startOffset(),
                        TOKEN_TYPE_PATH_ELEMENT_NAMESPACE);
                nameToken = new Token(text, t.startOffset(), t.endOffset(), TOKEN_TYPE_PATH_ELEMENT_NAME);
            }
            else if (isPrefix)
            {
                namespaceToken = new Token(text.substring(0, split), t.startOffset(), t.startOffset() + split,
                        TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX);
                nameToken = new Token(text.substring(split + 1), t.startOffset() + split + 1, t.endOffset(),
                        TOKEN_TYPE_PATH_ELEMENT_NAME);
            }
            else
            {
                // Namespace text is what lies between the start and end delimiters.
                namespaceToken = new Token(
                        text.substring(nsStartDelimiterLength, (split + nsEndDelimiterLength - 1)),
                        t.startOffset(), t.startOffset() + split, TOKEN_TYPE_PATH_ELEMENT_NAMESPACE);
                nameToken = new Token(text.substring(split + nsEndDelimiterLength),
                        t.startOffset() + split + nsEndDelimiterLength, t.endOffset(),
                        TOKEN_TYPE_PATH_ELEMENT_NAME);
            }

            namespaceToken.setPositionIncrement(1);
            nameToken.setPositionIncrement(1);

            if (includeNamespace)
            {
                tokens.add(namespaceToken);
            }
            tokens.add(nameToken);

            lengthCounter++;

            if (pathSplitToken != null)
            {
                // Close the current path: its element count goes in front of its elements and
                // the separator goes after them.
                Token countToken = new Token(nf.format(lengthCounter), t.startOffset(), t.endOffset(),
                        TOKEN_TYPE_PATH_SEP);
                countToken.setPositionIncrement(1);

                tokens.add(insertCountAt, countToken);
                tokens.add(pathSplitToken);

                lengthCounter = 0;
                insertCountAt = tokens.size();
            }
        }

        // Close whatever path is still open at end of input (possibly one with zero elements).
        Token finalCountToken = new Token(nf.format(lengthCounter), 0, 0, TOKEN_TYPE_PATH_SEP);
        finalCountToken.setPositionIncrement(1);

        tokens.add(insertCountAt, finalCountToken);

        // NOTE(review): this compares the last token's *text* with the token *type* constant, so
        // the guard only fails when the last token's text is literally "PATH_SEPARATOR"; the list
        // is also never empty here because the count token was just added. The condition is kept
        // unchanged because altering it would change the emitted token stream - confirm intent.
        if ((tokens.size() == 0) || !(tokens.get(tokens.size() - 1).termText().equals(TOKEN_TYPE_PATH_SEP)))
        {
            Token finalSplitToken = new Token(separatorTokenText, 0, 0, TOKEN_TYPE_PATH_SEP);
            finalSplitToken.setPositionIncrement(1);
            tokens.add(finalSplitToken);
        }

        it = tokens.iterator();
    }

    /**
     * Reads the next raw path element from the reader.
     * <p>
     * An element ends at a <code>/</code> (not included in the text, type "QNAME"), at a
     * <code>;</code> (included in the text, type "LASTQNAME") or at end of input (type "QNAME").
     * Delimiters found between the namespace start and end delimiters are treated as ordinary text.
     * <p>
     * NOTE(review): <code>/</code> and <code>;</code> are hard-coded here, while the caller tests
     * the trailing character against the configurable <code>pathSeparator</code> - confirm that the
     * two are always meant to agree.
     *
     * @return the next raw element, or <code>null</code> once the reader has been exhausted
     * @throws IOException if reading the underlying reader fails
     * @throws IllegalStateException if the input ends inside an unterminated namespace
     */
    private Token nextToken() throws IOException
    {
        if (readerPosition == -1)
        {
            return null;
        }

        StringBuilder buffer = new StringBuilder(64);
        boolean inNameSpace = false;
        int start = readerPosition;
        int current;
        while ((current = input.read()) != -1)
        {
            char c = (char) current;
            readerPosition++;
            if (c == nsStartDelimiter)
            {
                inNameSpace = true;
            }
            else if (c == nsEndDelimiter)
            {
                inNameSpace = false;
            }
            else if (!inNameSpace && (c == '/'))
            {
                // The '/' has been consumed but is not part of the element text.
                return new Token(buffer.toString(), start, readerPosition - 1, "QNAME");
            }
            else if (!inNameSpace && (c == ';'))
            {
                buffer.append(c);
                return new Token(buffer.toString(), start, readerPosition, "LASTQNAME");
            }

            buffer.append(c);
        }

        // Capture the end offset BEFORE writing the exhausted sentinel; computing it afterwards
        // yielded an end offset of -2 for the final token. No delimiter was consumed, so the
        // exclusive end offset is the number of characters read.
        int end = readerPosition;
        readerPosition = -1;

        if (inNameSpace)
        {
            throw new IllegalStateException("QName terminated incorrectly: " + buffer.toString());
        }
        return new Token(buffer.toString(), start, end, "QNAME");
    }
}