1 6 package com.hp.hpl.jena.graph.query.regexptrees; 7 8 import java.util.*; 9 10 16 public class PerlPatternParser 17 { 18 21 protected final String toParse; 22 23 26 protected int pointer; 27 28 31 protected final int limit; 32 33 36 protected RegexpTreeGenerator generator; 37 38 41 protected int matchPointsSeen; 42 43 46 public static final String digits = "0123456789"; 47 48 51 public static final String wordChars = 52 digits 53 + "abcdefghijklmnopqrstuvwxyz" 54 + "_" 55 + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" 56 ; 57 58 62 public PerlPatternParser( String toParse ) 63 { this( toParse, new SimpleGenerator() ); } 64 65 69 public PerlPatternParser( String toParse, RegexpTreeGenerator gen ) 70 { this.toParse = toParse; 71 this.limit = toParse.length(); 72 this.generator = gen; } 73 74 77 public static RegexpTree parse( String string ) 78 { return new PerlPatternParser( string ) .parseAlts(); } 79 80 84 public static RegexpTree parse( String string, RegexpTreeGenerator gen ) 85 { return new PerlPatternParser( string, gen ) .parseAlts(); } 86 87 93 public static class SyntaxException extends RuntimeException 94 { 95 public SyntaxException( String message ) 96 { super( message ); } 97 } 98 99 102 public String getString() 103 { return toParse; } 104 105 108 public int getPointer() 109 { return pointer; } 110 111 114 protected char nextChar() 115 { 116 return toParse.charAt( pointer++ ); 117 } 118 119 126 public RegexpTree parseAtom() 127 { 128 if (pointer < limit) 129 { 130 char ch = nextChar(); 131 switch (ch) 132 { 133 case '.': return generator.getAnySingle(); 134 case '^': return generator.getStartOfLine(); 135 case '$': return generator.getEndOfLine(); 136 case '|': pointer -= 1; return generator.getNothing(); 137 case '[': return parseClass(); 138 case ')': pointer -= 1; return generator.getNothing(); 139 case '(': return parseParens(); 140 case '\\': return parseBackslash(); 141 case '*': 142 case '+': 143 case '?': 144 case '{': throw new PerlPatternParser.SyntaxException( "unbound quantifier " + ch ); 145 case ']': 146 case '}': 147 default: return generator.getText( ch ); 148 } 149 } 150 return generator.getNothing(); 151 } 152 153 156 protected RegexpTree parseClass() 157 { 158 StringBuffer b = new StringBuffer (); 159 boolean negated = parseClassNegation(); 160 while (true) 161 { 162 int ch = nextClassChar(); 163 if (ch == ']') break; 164 if (ch == '-' && b.length() > 0) 165 { 166 char begin = (char) (b.charAt( b.length() - 1 ) + 1); 167 char end = (char) Math.abs( nextClassChar() ); 168 for (char i = begin; i <= end; i += 1) b.append( i ); 169 } 170 else 171 b.append( (char) Math.abs( ch ) ); 172 } 173 pointer += 1; 174 return generator.getClass( b.toString(), negated ); 175 } 176 177 181 private int nextClassChar() 182 { 183 char ch = nextChar(); 184 if (ch == '\\') 185 { 186 RegexpTree t = parseAtom(); 187 if (t instanceof Text) return -((Text) t).getString().charAt( 0 ); 188 throw new SyntaxException( "not allowed in class" ); 189 } 190 else 191 return ch; 192 } 193 194 protected boolean parseClassNegation() 195 { 196 if (toParse.charAt( pointer ) == '^') 197 { pointer += 1; return true; } 198 else 199 return false; 200 } 201 202 207 protected RegexpTree parseParens() 208 { 209 RegexpTree operand = parseAlts(); 210 if (pointer < limit && toParse.charAt( pointer ) == ')') pointer += 1; 211 else throw new SyntaxException( "missing closing bracket" ); 212 matchPointsSeen += 1; 213 return generator.getParen( operand, matchPointsSeen ); 214 } 215 216 220 private RegexpTree parseBackslash() 221 { 222 char ch = nextChar(); 223 if ("bBAZnrtfdDwWSsxc0123456789".indexOf( ch ) < 0) 224 return generator.getText( ch ); 225 else if (ch == 'n') 226 return generator.getText( '\n' ); 227 else if (ch == 'r') 228 return generator.getText( '\r' ); 229 else if (ch == 'f') 230 return generator.getText( '\f' ); 231 else if (ch == 't') 232 return generator.getText( '\t' ); 233 else if (ch == 's') 234 return generator.getClass( " \r\n\t\f", false ); 235 else if (ch == 'S') 236 return generator.getClass( " \r\n\t\f", true ); 237 else if (ch == 'd') 238 return generator.getClass( digits, false ); 239 else if (ch == 'D') 240 return generator.getClass( digits, true ); 241 else if (ch == 'w') 242 return generator.getClass( wordChars, false ); 243 else if (ch == 'W') 244 return generator.getClass( wordChars, true ); 245 else if ('0' <= ch && ch <= '9') 246 return backReferenceOrOctalChar( ch ); 247 else if (ch == 'x') 248 return hexEscape(); 249 else if (ch == 'c') 250 return control( nextChar() ); 251 else 252 throw new PerlPatternParser.SyntaxException( "can't do \\" + ch + " yet" ); 253 } 254 255 258 protected RegexpTree control( char ch ) 259 { return Text.create( (char) (ch - 'A' + 1) ); } 260 261 265 protected RegexpTree hexEscape() 266 { 267 char hi = nextChar(), lo = nextChar(); 268 return Text.create( (char) (deHex( hi ) * 16 + deHex( lo )) ); 269 } 270 271 274 private int deHex( char ch ) 275 { 276 if (Character.isDigit( ch )) return ch - '0'; 277 if ('a' <= ch && ch <= 'f') return 10 + ch - 'a'; 278 if ('A' <= ch && ch <= 'F') return 10 + ch - 'A'; 279 throw new SyntaxException( "'" + ch + "' is not a hex digit" ); 280 } 281 282 285 protected RegexpTree backReferenceOrOctalChar( char ch ) 286 { 287 char [] chars = new char[20]; 288 int index = 0; 289 chars[index++] = ch; 290 while (pointer < limit) 291 { 292 ch = nextChar(); 293 if (!Character.isDigit( ch )) break; 294 chars[index++] = ch; 295 } 296 int n = numeric( chars, 10, index ); 297 return 0 < n && n <= matchPointsSeen 298 ? generator.getBackReference( n ) 299 : generator.getText( numeric( chars, 8, index ) ); 300 } 301 302 305 protected char numeric( char [] chars, int base, int limit ) 306 { 307 int result = 0; 308 for (int i = 0; i < limit; i += 1) result = result * base + (chars[i] - '0'); 309 return (char) result; 310 } 311 312 318 public RegexpTree parseQuantifier( RegexpTree d ) 319 { 320 if (pointer < limit) 321 { 322 char ch = toParse.charAt( pointer ); 323 switch (ch) 324 { 325 case '*': 326 pointer += 1; 327 return generator.getZeroOrMore( d ); 328 329 case '+': 330 pointer += 1; 331 return generator.getOneOrMore( d ); 332 333 case '?': 334 pointer += 1; 335 return generator.getOptional( d ); 336 337 case '{': 338 throw new SyntaxException( "numeric quantifiers not done yet" ); 339 } 340 } 341 return d; 342 } 343 344 348 public RegexpTree parseElement() 349 { return parseQuantifier( parseAtom() ); } 350 351 355 public RegexpTree parseSeq() 356 { 357 List operands = new ArrayList(); 358 while (true) 359 { 360 RegexpTree next = parseElement(); 361 if (next.equals( generator.getNothing() ) ) break; 362 operands.add( next ); 363 } 364 return generator.getSequence( operands ); 365 } 366 367 371 public RegexpTree parseAlts() 372 { 373 List operands = new ArrayList(); 374 while (true) 375 { 376 operands.add( parseSeq() ); 377 if (pointer < limit && toParse.charAt( pointer ) == '|') pointer += 1; 378 else break; 379 } 380 return generator.getAlternatives( operands ); 381 } 382 } 383 384 | Popular Tags |