KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > webharvest > runtime > processors > RegexpProcessor


1 /* Copyright (c) 2006-2007, Vladimir Nikic
2     All rights reserved.
3
4     Redistribution and use of this software in source and binary forms,
5     with or without modification, are permitted provided that the following
6     conditions are met:
7
8     * Redistributions of source code must retain the above
9       copyright notice, this list of conditions and the
10       following disclaimer.
11
12     * Redistributions in binary form must reproduce the above
13       copyright notice, this list of conditions and the
14       following disclaimer in the documentation and/or other
15       materials provided with the distribution.
16
17     * The name of Web-Harvest may not be used to endorse or promote
18       products derived from this software without specific prior
19       written permission.
20
21     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
25     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31     POSSIBILITY OF SUCH DAMAGE.
32
33     You can contact Vladimir Nikic by sending e-mail to
34     nikic_vladimir@yahoo.com. Please include the word "Web-Harvest" in the
35     subject line.
36 */

37 package org.webharvest.runtime.processors;
38
39 import java.util.*;
40 import java.util.regex.Matcher JavaDoc;
41 import java.util.regex.Pattern JavaDoc;
42
43 import org.webharvest.definition.RegexpDef;
44 import org.webharvest.definition.BaseElementDef;
45 import org.webharvest.runtime.Scraper;
46 import org.webharvest.runtime.ScraperContext;
47 import org.webharvest.runtime.scripting.ScriptEngine;
48 import org.webharvest.runtime.templaters.BaseTemplater;
49 import org.webharvest.runtime.variables.*;
50 import org.webharvest.utils.CommonUtil;
51 import org.webharvest.utils.Constants;
52
53 /**
54  * Regular expression replace processor.
55  */

56 public class RegexpProcessor extends BaseProcessor {
57
58     private RegexpDef regexpDef;
59
60     public RegexpProcessor(RegexpDef regexpDef) {
61         super(regexpDef);
62         this.regexpDef = regexpDef;
63     }
64
65     public IVariable execute(Scraper scraper, ScraperContext context) {
66         ScriptEngine scriptEngine = scraper.getScriptEngine();
67
68         BaseElementDef patternDef = regexpDef.getRegexpPatternDef();
69         IVariable patternVar = getBodyTextContent(patternDef, scraper, context);
70         debug(patternDef, scraper, patternVar);
71
72         BaseElementDef sourceDef = regexpDef.getRegexpSourceDef();
73         IVariable source = getBodyListContent(sourceDef, scraper, context);
74         debug(sourceDef, scraper, source);
75         
76         String JavaDoc replace = BaseTemplater.execute( regexpDef.getReplace(), scriptEngine);
77         boolean isReplace = CommonUtil.isBooleanTrue(replace);
78         
79         String JavaDoc maxLoopsString = BaseTemplater.execute( regexpDef.getMax(), scriptEngine);
80         double maxLoops = Constants.DEFAULT_MAX_LOOPS;
81         if (maxLoopsString != null && !"".equals(maxLoopsString.trim())) {
82             maxLoops = Double.parseDouble(maxLoopsString);
83         }
84         
85         Pattern JavaDoc pattern = Pattern.compile(patternVar.toString(), Pattern.DOTALL|Pattern.UNICODE_CASE);
86         
87         List resultList = new ArrayList();
88         
89         List bodyList = source.toList();
90         Iterator it = bodyList.iterator();
91         while (it.hasNext()) {
92             IVariable currVar = (IVariable) it.next();
93             String JavaDoc text = currVar.toString();
94             
95             Matcher JavaDoc matcher = pattern.matcher(text);
96             int groupCount = matcher.groupCount();
97             
98             StringBuffer JavaDoc buffer = new StringBuffer JavaDoc();
99             
100             int index = 0;
101             while ( matcher.find() ) {
102                 index++;
103
104                 // if index exceeds maximum number of matching sequences exists the loop
105
if (maxLoops < index) {
106                     break;
107                 }
108
109                 for (int i = 0; i <= groupCount; i++) {
110                     context.put("_"+i, new NodeVariable(matcher.group(i)));
111                 }
112
113                 BaseElementDef resultDef = regexpDef.getRegexpResultDef();
114                 IVariable result = getBodyTextContent(resultDef, scraper, context);
115                 debug(resultDef, scraper, result);
116                 
117                 String JavaDoc currResult = (result == null) ? matcher.group(0) : result.toString();
118                 if (isReplace) {
119                     matcher.appendReplacement(buffer, currResult);
120                 } else {
121                     resultList.add(new NodeVariable(currResult));
122                 }
123                 
124                 for (int i = 0; i <= groupCount; i++) {
125                     context.remove("_"+i);
126                 }
127             }
128             
129             if (isReplace) {
130                 matcher.appendTail(buffer);
131                 resultList.add(new NodeVariable(buffer.toString()));
132             }
133         }
134         
135
136         return new ListVariable(resultList);
137     }
138
139 }
Popular Tags