1 37 package org.webharvest.runtime.processors; 38 39 import java.util.*; 40 import java.util.regex.Matcher ; 41 import java.util.regex.Pattern ; 42 43 import org.webharvest.definition.RegexpDef; 44 import org.webharvest.definition.BaseElementDef; 45 import org.webharvest.runtime.Scraper; 46 import org.webharvest.runtime.ScraperContext; 47 import org.webharvest.runtime.scripting.ScriptEngine; 48 import org.webharvest.runtime.templaters.BaseTemplater; 49 import org.webharvest.runtime.variables.*; 50 import org.webharvest.utils.CommonUtil; 51 import org.webharvest.utils.Constants; 52 53 56 public class RegexpProcessor extends BaseProcessor { 57 58 private RegexpDef regexpDef; 59 60 public RegexpProcessor(RegexpDef regexpDef) { 61 super(regexpDef); 62 this.regexpDef = regexpDef; 63 } 64 65 public IVariable execute(Scraper scraper, ScraperContext context) { 66 ScriptEngine scriptEngine = scraper.getScriptEngine(); 67 68 BaseElementDef patternDef = regexpDef.getRegexpPatternDef(); 69 IVariable patternVar = getBodyTextContent(patternDef, scraper, context); 70 debug(patternDef, scraper, patternVar); 71 72 BaseElementDef sourceDef = regexpDef.getRegexpSourceDef(); 73 IVariable source = getBodyListContent(sourceDef, scraper, context); 74 debug(sourceDef, scraper, source); 75 76 String replace = BaseTemplater.execute( regexpDef.getReplace(), scriptEngine); 77 boolean isReplace = CommonUtil.isBooleanTrue(replace); 78 79 String maxLoopsString = BaseTemplater.execute( regexpDef.getMax(), scriptEngine); 80 double maxLoops = Constants.DEFAULT_MAX_LOOPS; 81 if (maxLoopsString != null && !"".equals(maxLoopsString.trim())) { 82 maxLoops = Double.parseDouble(maxLoopsString); 83 } 84 85 Pattern pattern = Pattern.compile(patternVar.toString(), Pattern.DOTALL|Pattern.UNICODE_CASE); 86 87 List resultList = new ArrayList(); 88 89 List bodyList = source.toList(); 90 Iterator it = bodyList.iterator(); 91 while (it.hasNext()) { 92 IVariable currVar = (IVariable) it.next(); 93 String text = currVar.toString(); 94 95 Matcher matcher = pattern.matcher(text); 96 int groupCount = matcher.groupCount(); 97 98 StringBuffer buffer = new StringBuffer (); 99 100 int index = 0; 101 while ( matcher.find() ) { 102 index++; 103 104 if (maxLoops < index) { 106 break; 107 } 108 109 for (int i = 0; i <= groupCount; i++) { 110 context.put("_"+i, new NodeVariable(matcher.group(i))); 111 } 112 113 BaseElementDef resultDef = regexpDef.getRegexpResultDef(); 114 IVariable result = getBodyTextContent(resultDef, scraper, context); 115 debug(resultDef, scraper, result); 116 117 String currResult = (result == null) ? matcher.group(0) : result.toString(); 118 if (isReplace) { 119 matcher.appendReplacement(buffer, currResult); 120 } else { 121 resultList.add(new NodeVariable(currResult)); 122 } 123 124 for (int i = 0; i <= groupCount; i++) { 125 context.remove("_"+i); 126 } 127 } 128 129 if (isReplace) { 130 matcher.appendTail(buffer); 131 resultList.add(new NodeVariable(buffer.toString())); 132 } 133 } 134 135 136 return new ListVariable(resultList); 137 } 138 139 } | Popular Tags |