KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > python > modules > RegexObject


// Copyright (c) Corporation for National Research Initiatives
2
package org.python.modules;
3
4 import org.python.core.*;
5 import org.apache.oro.text.regex.*;
6
7
8 public class RegexObject extends PyObject
9 {
10     private static Perl5Compiler compiler = new Perl5Compiler();
11
12     private static synchronized Pattern compile(String JavaDoc pattern, int flags) {
13         try {
14             return compiler.compile(pattern, flags);
15         }
16         catch (MalformedPatternException e) {
17             throw re.ReError(e.getMessage());
18         }
19     }
20
21     private static synchronized Perl5Matcher getMatcher() {
22         Perl5Matcher matcher = new Perl5Matcher();
23         //matcher.setMultiline(false);
24
return matcher;
25     }
26
27     public String JavaDoc pattern;
28     public int flags;
29     public PyDictionary groupindex;
30     private Pattern code;
31
32     public RegexObject(String JavaDoc pattern, int flags) {
33         this.pattern = pattern;
34         this.flags = flags;
35         groupindex = new PyDictionary();
36         code = compile(fixPattern(pattern), flags);
37     }
38
39     public MatchObject match(String JavaDoc string) {
40         MatchResult result = doMatch(string);
41         if (result == null)
42             return null;
43         return new MatchObject(this, string, 0, string.length(), result);
44     }
45
46     public MatchObject match(String JavaDoc s, int pos) {
47         return match(s, pos, s.length());
48     }
49
50     public MatchObject match(String JavaDoc string, int pos, int endpos) {
51         if (endpos > string.length())
52             endpos = string.length();
53         if (endpos < pos)
54             endpos = pos;
55
56         MatchResult result =
57             doMatch(new PatternMatcherInput(string, pos, endpos-pos));
58         if (result == null)
59             return null;
60         return new MatchObject(this, string, pos, endpos, result);
61     }
62
63     private MatchResult doMatch(Object JavaDoc input) {
64         Perl5Matcher matcher = getMatcher();
65         if (input instanceof String JavaDoc) {
66             if (!matcher.matchesPrefix((String JavaDoc)input, code))
67                 return null;
68         }
69         else {
70             if (!matcher.matchesPrefix((PatternMatcherInput)input, code))
71                 return null;
72         }
73         return matcher.getMatch();
74     }
75
76     public MatchObject search(String JavaDoc string) {
77         MatchResult result = doSearch(string);
78         if (result == null)
79             return null;
80         return new MatchObject(this, string, 0, string.length(), result);
81     }
82
83     public MatchObject search(String JavaDoc s, int pos) {
84         return search(s, pos, s.length());
85     }
86
87     public MatchObject search(String JavaDoc string, int pos, int endpos) {
88         if (endpos > string.length())
89             endpos = string.length();
90         if (endpos < pos)
91             endpos = pos;
92
93         MatchResult result =
94             doSearch(new PatternMatcherInput(string, pos, endpos-pos));
95         if (result == null)
96             return null;
97         return new MatchObject(this, string, pos, endpos, result);
98     }
99
100     private MatchResult doSearch(Object JavaDoc input) {
101         Perl5Matcher matcher = getMatcher();
102
103         if (input instanceof String JavaDoc) {
104             if (!matcher.contains((String JavaDoc)input, code))
105                 return null;
106         }
107         else {
108             if (!matcher.contains((PatternMatcherInput)input, code))
109                 return null;
110         }
111         return matcher.getMatch();
112     }
113
114     public PyString sub(PyObject repl, String JavaDoc string) {
115         return sub(repl, string, 0);
116     }
117
118     public PyString sub(PyObject repl, String JavaDoc string, int count) {
119         return (PyString)subn(repl, string, count).__getitem__(0);
120     }
121
122     public PyTuple subn(PyObject repl, String JavaDoc string) {
123         return subn(repl, string, 0);
124     }
125
126     public PyTuple subn(PyObject repl, String JavaDoc string, int count) {
127         // Real work is done here
128
String JavaDoc srepl = null;
129         boolean expand = false;
130         if (repl instanceof PyString) {
131             srepl = repl.toString();
132             expand = (srepl.indexOf('\\') != -1);
133         }
134         if (count < 0) {
135             throw re.ReError("negative substitution count");
136         }
137         if (count == 0) {
138             count = Integer.MAX_VALUE;
139         }
140
141         // How to handle repl as String vs. callable?
142
int n=0;
143         StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
144         Perl5Matcher matcher = getMatcher();
145         PatternMatcherInput match = new PatternMatcherInput(string);
146         int lastmatch = 0;
147
148         while (n < count && !match.endOfInput()) {
149             if (!matcher.contains(match, code))
150                 break;
151             n++;
152             int offset = match.getMatchBeginOffset();
153             //System.err.println("off: "+offset+", "+lastmatch);
154
if (offset > lastmatch) {
155                 buf.append(match.substring(lastmatch, offset));
156             }
157             if (srepl == null) {
158                 MatchObject m = new MatchObject(this, string, lastmatch,
159                                                 string.length(),
160                                                 matcher.getMatch());
161                 PyObject ret = repl.__call__(m);
162                 buf.append(ret.toString());
163             }
164             else {
165                 if (expand)
166                     buf.append(expandMatch(matcher.getMatch(), srepl));
167                 else
168                     buf.append(srepl);
169             }
170             lastmatch = match.getMatchEndOffset();
171         }
172         if (lastmatch < match.getEndOffset()) {
173             buf.append(match.substring(lastmatch, match.getEndOffset()));
174         }
175         return new PyTuple(
176             new PyObject[] {
177                 new PyString(buf.toString()),
178                 new PyInteger(n)
179             });
180     }
181
182     public PyList split(String JavaDoc string) {
183         return split(string, 0);
184     }
185
186     public PyList split(String JavaDoc string, int maxsplit) {
187         if (maxsplit < 0) {
188             throw re.ReError("maxsplit < 0");
189         }
190         if (maxsplit == 0) {
191             maxsplit = Integer.MAX_VALUE;
192         }
193
194         int n=0;
195         Perl5Matcher matcher = getMatcher();
196         PatternMatcherInput match = new PatternMatcherInput(string);
197         int lastmatch = 0;
198         PyList results = new PyList();
199
200         while (n < maxsplit && !match.endOfInput()) {
201             if (!matcher.contains(match, code))
202                 break;
203             n++;
204
205             int begin = match.getMatchBeginOffset();
206             int end = match.getMatchEndOffset();
207
208             if (begin == end) {
209                 // More needed?
210
continue;
211             }
212
213             results.append(new PyString(match.substring(lastmatch, begin)));
214
215             MatchResult m = matcher.getMatch();
216             int ngroups = m.groups();
217             if (ngroups > 1) {
218                 for (int j=1; j<ngroups; j++) {
219                     String JavaDoc tmp = m.group(j);
220                     if (tmp == null) {
221                         results.append(Py.None);
222                     }
223                     else {
224                         results.append(new PyString(tmp));
225                     }
226                 }
227             }
228             lastmatch = end;
229         }
230         results.append(
231             new PyString(match.substring(lastmatch, match.getEndOffset())));
232         return results;
233     }
234
235     private int getindex(PyString s) {
236         PyInteger v = (PyInteger)groupindex.__finditem__(s);
237         if (v == null) {
238             try {
239                 v = (PyInteger)s.__int__();
240             }
241             catch (PyException exc) {
242                 if (!isname(s.toString()))
243                     throw re.ReError("illegal character in group name");
244                 else
245                     throw Py.IndexError("group "+s.__repr__() +
246                                         " is undefined");
247             }
248         }
249         return v.getValue();
250     }
251
252     private boolean isdigit(char c) {
253         return '0' <= c && c <= '9';
254     }
255
256     private boolean isident(char c) {
257         return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || (c == '_');
258     }
259
260     private boolean isname(String JavaDoc name) {
261         int n = name.length();
262         if (n <= 0 || !isident(name.charAt(0)))
263             return false;
264         for (int i = 1; i < n; i++) {
265             char c = name.charAt(i);
266             if (!isident(c) && !isdigit(c))
267                 return false;
268         }
269         return true;
270     }
271
272     private String JavaDoc fixPattern(String JavaDoc pattern) {
273         char[] chars = pattern.toCharArray();
274
275         int index=0;
276         int group=1;
277         int lasti=0;
278         int n = chars.length;
279
280         StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
281
282         while (index < n) {
283             if (chars[index++] == '(') {
284                 // Ignore \( because these are literal parens
285
if (index > 2 && chars[index-2] == '\\')
286                     continue;
287
288                 if (index < n && chars[index] == '?') {
289                     index++;
290                     if (index < n && chars[index] == 'P') {
291                         index++;
292                         if (index == n)
293                             break;
294                         char c = chars[index++];
295                         int start = index;
296                         if (c == '<') {
297                             while (index < n && chars[index] != '>')
298                                 index++;
299                             if (index == n)
300                                 throw re.ReError("unmatched <");
301                             String JavaDoc name =
302                                 new String JavaDoc(chars, start, index-start);
303                             // name must be a valid Python identifier
304
if (!isname(name))
305                                 throw re.ReError("illegal character in " +
306                                                  "group name");
307                             groupindex.__setitem__(new PyString(name),
308                                                    new PyInteger(group));
309                             buf.append(chars, lasti, start-3-lasti);
310                             index++;
311                             lasti = index;
312                             group++;
313                             continue;
314                         }
315                         else {
316                             if (c == '=') {
317                                 while (index < n && chars[index] != ')') {
318                                     c = chars[index];
319                                     if (Character.isJavaIdentifierPart(c) &&
320                                         c != '$')
321                                     {
322                                         index++;
323                                     }
324                                     else {
325                                         throw re.ReError(
326                                             "illegal character in symbol");
327                                     }
328                                 }
329                                 if (index == n)
330                                     throw re.ReError("?P= not closed");
331                                 if (!(Character.isJavaIdentifierStart(
332                                     chars[start])))
333                                 {
334                                     throw re.ReError(
335                                         "illegal character starting symbol");
336                                 }
337                                 String JavaDoc name = new String JavaDoc(chars, start,
338                                                          index-start);
339                                 PyString pname = new PyString(name);
340                                 buf.append(chars, lasti, start-4-lasti);
341                                 buf.append('\\');
342                                 buf.append(getindex(pname));
343                                 index++;
344                                 lasti=index;
345                             }
346                             else {
347                                 throw re.ReError("invalid ?P grouping");
348                             }
349                         }
350                     }
351                     else {
352                         if (chars[index] == ':')
353                             continue;
354                         while (index < n && chars[index] != ')')
355                             index++;
356                     }
357                 }
358                 else {
359                     group++;
360                 }
361             }
362         }
363         if (lasti > 0) {
364             buf.append(chars, lasti, n-lasti);
365             //System.err.println("pat: "+buf.toString());
366
return buf.toString();
367         }
368         else {
369             //System.err.println("untouched: "+pattern);
370
return pattern;
371         }
372     }
373
374     public String JavaDoc expandMatch(MatchResult match, String JavaDoc repl) {
375         char[] chars = repl.toCharArray();
376
377         int index=0;
378         int lasti=0;
379         int n = chars.length;
380
381         StringBuffer JavaDoc buf = new StringBuffer JavaDoc();
382         try {
383             while (index<n) {
384                 //System.out.println("index: "+index+", "+n+", "+repl);
385

386                 if (chars[index++] == '\\') {
387                     char ch = 0;
388                     switch (chars[index++]) {
389                     case '\\':
390                         ch = '\\'; break;
391                     case 'E':
392                     case 'G':
393                     case 'L':
394                     case 'Q':
395                     case 'U':
396                     case 'l':
397                     case 'u':
398                         throw re.ReError("\\"+chars[index-1]+
399                                          " is not allowed");
400                     case 'n':
401                         ch = '\n'; break;
402                     case 't':
403                         ch = '\t'; break;
404                     case 'r':
405                         ch = '\r'; break;
406                     case 'v':
407                         ch = '\013'; break;
408                     case 'f':
409                         ch = '\f'; break;
410                     case 'a':
411                         ch = '\007'; break;
412                     case 'b':
413                         ch = '\b'; break;
414
415                     case 'g':
416                         if (chars[index++] != '<') {
417                             throw re.ReError(
418                                 "missing < in symbolic reference");
419                         }
420                         int start = index;
421                         while (index < n && chars[index] != '>')
422                             index++;
423                         if (index == n) {
424                             throw re.ReError("unfinished symbolic reference");
425                         }
426                         index++;
427                         buf.append(chars, lasti, start-3-lasti);
428                         PyString str = new PyString(new String JavaDoc(chars, start,
429                                                               index-1-start));
430                         String JavaDoc tmp = match.group(getindex(str));
431                         if (tmp == null) {
432                             throw re.ReError("group not in match: "+str);
433                         }
434                         buf.append(tmp);
435                         lasti=index;
436                         continue;
437
438                     case '1':
439                     case '2':
440                     case '3':
441                     case '4':
442                     case '5':
443                     case '6':
444                     case '7':
445                     case '8':
446                     case '9':
447                         start = index-2;
448                         int v = chars[index-1]-'0';
449                         char ch1;
450                         if (index<n) {
451                             ch = chars[index];
452                             if (ch >= '0' && ch <= '9') {
453                                 index++;
454                                 if (index < n && ch <= '7') {
455                                     ch1 = chars[index];
456                                     if (ch1 >= '0' && ch1 <= '7') {
457                                         v = v*64 +
458                                             (ch - '0')*8 +
459                                             (ch1 - '0');
460                                         buf.append(chars, lasti,
461                                                    index-2-lasti);
462                                         buf.append((char)v);
463                                         index++;
464                                         lasti=index;
465                                     }
466                                 }
467                                 v = v*10 + (ch - '0');
468                             }
469                         }
470                         buf.append(chars, lasti, start-lasti);
471                         tmp = match.group(v);
472                         if (tmp == null) {
473                             throw re.ReError("group not in match: "+v);
474                         }
475                         buf.append(tmp);
476                         lasti=index;
477                         continue;
478                     default:
479                         continue;
480                     }
481                     buf.append(chars, lasti, index-2-lasti);
482                     buf.append(ch);
483                     lasti=index;
484                 }
485             }
486         }
487         catch (ArrayIndexOutOfBoundsException JavaDoc exc) {
488             throw re.ReError("invalid expression");
489         }
490         if (lasti > 0) {
491             buf.append(chars, lasti, n-lasti);
492             return buf.toString();
493         }
494         else {
495             return repl;
496         }
497     }
498
499     public PyList findall(String JavaDoc string) {
500         Perl5Matcher matcher = getMatcher();
501         PatternMatcherInput match = new PatternMatcherInput(string);
502         PyList ret = new PyList();
503
504         while (matcher.contains(match, code)) {
505             MatchResult result = matcher.getMatch();
506             int groups = result.groups();
507
508             if (groups == 1)
509                 // no parenthetical subgroups
510
ret.append(new PyString(result.group(0)));
511             else if (groups == 2)
512                 // one parenthetical subgroup, but we only return a list of
513
// the first matching subgroup.
514
ret.append(new PyString(result.group(1)));
515             else {
516                 // two or more subgroups, so ignore the whole match
517
PyString[] submatches = new PyString[groups-1];
518                 for (int g = 1; g < groups; g++)
519                     submatches[g-1] = new PyString(result.group(g));
520                 PyTuple tup = new PyTuple(submatches);
521                 ret.append(tup);
522             }
523         }
524         return ret;
525     }
526 }
527
Popular Tags