KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > hp > hpl > jena > graph > query > regexptrees > PerlPatternParser


1 /*
2   (c) Copyright 2004, 2005 Hewlett-Packard Development Company, LP, all rights reserved.
3   [See end of file]
4   $Id: PerlPatternParser.java,v 1.13 2005/02/21 11:52:29 andy_seaborne Exp $
5 */

6 package com.hp.hpl.jena.graph.query.regexptrees;
7
8 import java.util.*;
9
10 /**
11      Parse Perl5 patterns into RegexpTree structures, or throw an exception for
12      cases that haven't been implemented.
13      
14     @author hedgehog
15 */

16 public class PerlPatternParser
17     {
18     /**
19          The string being parsed, as supplied to the constructor(s).
20     */

21     protected final String JavaDoc toParse;
22     
23     /**
24          The index into the string of the next undealt-with character, ie, it starts at 0.
25     */

26     protected int pointer;
27     
28     /**
29          The length of the string to parse, used as a limit.
30     */

31     protected final int limit;
32     
33     /**
34          The generator for the RegexpTree nodes to be used in the parse.
35     */

36     protected RegexpTreeGenerator generator;
37     
38     /**
39         Count of how many back-references match-points seen so far.
40     */

41     protected int matchPointsSeen;
42     
43     /**
44          The digits, in order.
45     */

46     public static final String JavaDoc digits = "0123456789";
47     
48     /**
49          The characters that are (non-)matchable by \w[W].
50     */

51     public static final String JavaDoc wordChars =
52         digits
53         + "abcdefghijklmnopqrstuvwxyz"
54         + "_"
55         + "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
56         ;
57     
58     /**
59          Initialise this parser with the string to parse and with the default
60          generator (SimpleGenerator).
61     */

62     public PerlPatternParser( String JavaDoc toParse )
63         { this( toParse, new SimpleGenerator() ); }
64     
65     /**
66          Initialise this parser with the string to parse and with the generator to
67          use for node construction.
68     */

69     public PerlPatternParser( String JavaDoc toParse, RegexpTreeGenerator gen )
70         { this.toParse = toParse;
71         this.limit = toParse.length();
72         this.generator = gen; }
73     
74     /**
75         Answer the result of parsing the given string as a sequence of alternatives.
76     */

77     public static RegexpTree parse( String JavaDoc string )
78         { return new PerlPatternParser( string ) .parseAlts(); }
79     
80     /**
81         Answer the result of parsing the given string as a sequence of alternatives,
82         using the supplied generator for the pattern nodes.
83     */

84     public static RegexpTree parse( String JavaDoc string, RegexpTreeGenerator gen )
85         { return new PerlPatternParser( string, gen ) .parseAlts(); }
86     
87     /**
88          Exception thrown if a syntax error is detected. Further details are in the
89          error message - it doesn't seem worth worrying about having different
90          classes for different errors. Possibly this should be a non-static class so
91          that it can get at the current context?
92     */

93     public static class SyntaxException extends RuntimeException JavaDoc
94         {
95         public SyntaxException( String JavaDoc message )
96             { super( message ); }
97         }
98     
99     /**
100          Answer the string that this parser is parsing.
101     */

102     public String JavaDoc getString()
103         { return toParse; }
104     
105     /**
106          Answer the current index into the parse string.
107     */

108     public int getPointer()
109         { return pointer; }
110     
111     /**
112         Answer the character under the pointer, and advance the pointer.
113     */

114     protected char nextChar()
115         {
116         return toParse.charAt( pointer++ );
117         }
118
119     /**
120          Parse a single atom and return the tree for it, advancing the pointer. This
121          does not deal with quantifiers, for which see parseQuantifier. Unmatched
122          right parentheses, unexpected (hence unbound) quantifiers, and those things
123          that aren't implemented, throw exceptions. An empty atom is permitted
124          (at the end of a string or before a |).
125     */

126     public RegexpTree parseAtom()
127         {
128         if (pointer < limit)
129             {
130             char ch = nextChar();
131             switch (ch)
132                 {
133                 case '.': return generator.getAnySingle();
134                 case '^': return generator.getStartOfLine();
135                 case '$': return generator.getEndOfLine();
136                 case '|': pointer -= 1; return generator.getNothing();
137                 case '[': return parseClass();
138                 case ')': pointer -= 1; return generator.getNothing();
139                 case '(': return parseParens();
140                 case '\\': return parseBackslash();
141                 case '*':
142                 case '+':
143                 case '?':
144                 case '{': throw new PerlPatternParser.SyntaxException( "unbound quantifier " + ch );
145                 case ']':
146                 case '}':
147                 default: return generator.getText( ch );
148                 }
149             }
150         return generator.getNothing();
151         }
152     
153     /**
154          Parse a class expression and answer an appropriate tree.
155     */

156     protected RegexpTree parseClass()
157         {
158         StringBuffer JavaDoc b = new StringBuffer JavaDoc();
159         boolean negated = parseClassNegation();
160         while (true)
161             {
162             int ch = nextClassChar();
163             if (ch == ']') break;
164             if (ch == '-' && b.length() > 0)
165                 {
166                 char begin = (char) (b.charAt( b.length() - 1 ) + 1);
167                 char end = (char) Math.abs( nextClassChar() );
168                 for (char i = begin; i <= end; i += 1) b.append( i );
169                 }
170             else
171                 b.append( (char) Math.abs( ch ) );
172             }
173         pointer += 1;
174         return generator.getClass( b.toString(), negated );
175         }
176
177     /**
178          Answer the next character, if it's suitable for part of a class expression,
179          negated if it's been escaped. Iffy.
180     */

181     private int nextClassChar()
182         {
183         char ch = nextChar();
184         if (ch == '\\')
185             {
186             RegexpTree t = parseAtom();
187             if (t instanceof Text) return -((Text) t).getString().charAt( 0 );
188             throw new SyntaxException( "not allowed in class" );
189             }
190         else
191             return ch;
192         }
193
194     protected boolean parseClassNegation()
195         {
196         if (toParse.charAt( pointer ) == '^')
197             { pointer += 1; return true; }
198         else
199             return false;
200         }
201
202     /**
203         Parse a parenthesised expression. Throw a SyntaxException if the closing
204         bracket is missing. Answer the wrapped sub-expression. Does not cater
205         for the (? ...) stuff.
206     */

207     protected RegexpTree parseParens()
208         {
209         RegexpTree operand = parseAlts();
210         if (pointer < limit && toParse.charAt( pointer ) == ')') pointer += 1;
211         else throw new SyntaxException( "missing closing bracket" );
212         matchPointsSeen += 1;
213         return generator.getParen( operand, matchPointsSeen );
214         }
215
216     /**
217          Parse a backslash-escape and answer the appropriate regexp tree.
218          Unhandled escapes throw an exception.
219     */

220     private RegexpTree parseBackslash()
221         {
222         char ch = nextChar();
223         if ("bBAZnrtfdDwWSsxc0123456789".indexOf( ch ) < 0)
224             return generator.getText( ch );
225         else if (ch == 'n')
226             return generator.getText( '\n' );
227         else if (ch == 'r')
228             return generator.getText( '\r' );
229         else if (ch == 'f')
230             return generator.getText( '\f' );
231         else if (ch == 't')
232             return generator.getText( '\t' );
233         else if (ch == 's')
234             return generator.getClass( " \r\n\t\f", false );
235         else if (ch == 'S')
236             return generator.getClass( " \r\n\t\f", true );
237         else if (ch == 'd')
238             return generator.getClass( digits, false );
239         else if (ch == 'D')
240             return generator.getClass( digits, true );
241         else if (ch == 'w')
242             return generator.getClass( wordChars, false );
243         else if (ch == 'W')
244             return generator.getClass( wordChars, true );
245         else if ('0' <= ch && ch <= '9')
246             return backReferenceOrOctalChar( ch );
247         else if (ch == 'x')
248             return hexEscape();
249         else if (ch == 'c')
250             return control( nextChar() );
251         else
252             throw new PerlPatternParser.SyntaxException( "can't do \\" + ch + " yet" );
253         }
254     
255     /**
256          Answer a RegexpTree representing the single character which is CTRL-ch.
257     */

258     protected RegexpTree control( char ch )
259         { return Text.create( (char) (ch - 'A' + 1) ); }
260
261     /**
262         Answer a RegexpTree representing the single character whose value is
263         given by the next two hexadecimal digits.
264     */

265     protected RegexpTree hexEscape()
266         {
267         char hi = nextChar(), lo = nextChar();
268         return Text.create( (char) (deHex( hi ) * 16 + deHex( lo )) );
269         }
270
271     /**
272          Answer the integer value corresponding to the hex digit <code>ch</code>.
273     */

274     private int deHex( char ch )
275         {
276         if (Character.isDigit( ch )) return ch - '0';
277         if ('a' <= ch && ch <= 'f') return 10 + ch - 'a';
278         if ('A' <= ch && ch <= 'F') return 10 + ch - 'A';
279         throw new SyntaxException( "'" + ch + "' is not a hex digit" );
280         }
281
282     /**
283         Answer the backreference or octal character described by \nnnn sequences.
284     */

285     protected RegexpTree backReferenceOrOctalChar( char ch )
286         {
287         char [] chars = new char[20];
288         int index = 0;
289         chars[index++] = ch;
290         while (pointer < limit)
291             {
292             ch = nextChar();
293             if (!Character.isDigit( ch )) break;
294             chars[index++] = ch;
295             }
296         int n = numeric( chars, 10, index );
297         return 0 < n && n <= matchPointsSeen
298             ? generator.getBackReference( n )
299             : generator.getText( numeric( chars, 8, index ) );
300         }
301
302     /**
303          Answer the numeric value represented by chars[0..limit-1] in the given base.
304     */

305     protected char numeric( char [] chars, int base, int limit )
306         {
307         int result = 0;
308         for (int i = 0; i < limit; i += 1) result = result * base + (chars[i] - '0');
309         return (char) result;
310         }
311
312     /**
313          Parse any quantifier and answer the quantified version of the argument
314          tree <code>d</code>. TODO: handle non-greedy quantifiers. (These will
315          currently generate syntax errors when their flagging ? is encountered by
316          parseAtom.)
317     */

318     public RegexpTree parseQuantifier( RegexpTree d )
319         {
320         if (pointer < limit)
321             {
322             char ch = toParse.charAt( pointer );
323             switch (ch)
324                 {
325                 case '*':
326                     pointer += 1;
327                     return generator.getZeroOrMore( d );
328                     
329                 case '+':
330                     pointer += 1;
331                     return generator.getOneOrMore( d );
332                     
333                 case '?':
334                     pointer += 1;
335                     return generator.getOptional( d );
336                     
337                 case '{':
338                     throw new SyntaxException( "numeric quantifiers not done yet" );
339                 }
340             }
341         return d;
342         }
343     
344     /**
345          Parse an element (an atom and any following quantifier) and answer the
346          possibly-quantified tree.
347     */

348     public RegexpTree parseElement()
349         { return parseQuantifier( parseAtom() ); }
350
351     /**
352         Parse a sequence of elements [possibly-quantified atoms] and answer the
353         sequence (singular sequences may be reduced to its single element).
354     */

355     public RegexpTree parseSeq()
356         {
357         List operands = new ArrayList();
358         while (true)
359             {
360             RegexpTree next = parseElement();
361             if (next.equals( generator.getNothing() ) ) break;
362             operands.add( next );
363             }
364         return generator.getSequence( operands );
365         }
366
367     /**
368          Parse an alternation of sequences and answer an alternative tree (or the
369          single component if there is just one alternative).
370     */

371     public RegexpTree parseAlts()
372         {
373         List operands = new ArrayList();
374         while (true)
375             {
376             operands.add( parseSeq() );
377             if (pointer < limit && toParse.charAt( pointer ) == '|') pointer += 1;
378             else break;
379             }
380         return generator.getAlternatives( operands );
381         }
382     }
383
384 /*
385     (c) Copyright 2004, 2005 Hewlett-Packard Development Company, LP
386     All rights reserved.
387     
388     Redistribution and use in source and binary forms, with or without
389     modification, are permitted provided that the following conditions
390     are met:
391     
392     1. Redistributions of source code must retain the above copyright
393        notice, this list of conditions and the following disclaimer.
394     
395     2. Redistributions in binary form must reproduce the above copyright
396        notice, this list of conditions and the following disclaimer in the
397        documentation and/or other materials provided with the distribution.
398     
399     3. The name of the author may not be used to endorse or promote products
400        derived from this software without specific prior written permission.
401     
402     THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
403     IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
404     OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
405     IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
406     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
407     NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
408     DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
409     THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
410     (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
411     THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
412 */
Popular Tags