1 16 17 package org.apache.taglibs.scrape; 18 19 import java.util.*; 20 import java.io.*; 21 import java.net.*; 22 import javax.servlet.jsp.*; 23 import sun.misc.BASE64Encoder; 24 import org.apache.oro.text.regex.*; 25 26 35 public class PageData { 36 37 40 public static HashMap pageurls = new HashMap(); 41 44 private static Object O = new Object (); 45 48 private HashMap scrapes = new HashMap(); 49 52 private boolean newflag; 53 57 private boolean changeflag; 58 62 private Boolean scraping = new Boolean (false); 63 67 private long lastscrape = 0; 68 72 private PageContext pagecontext; 73 78 private boolean exception; 79 82 private String exceptiontext; 83 86 private Page page; 87 90 private int pport =-1; 91 94 private String pserver = null; 95 98 private String auth = null; 99 103 private boolean ssl = false; 104 107 private String sslclientpass = null; 108 111 private ArrayList name = new ArrayList(10); 112 115 private ArrayList value = new ArrayList(10); 116 117 121 public PageData() {} 122 123 138 public static PageData getPage(String url, int port, String server, 139 String name, String pass) { 140 PageData pagedata; 143 144 if((pagedata = (PageData)pageurls.get(url)) == null) { 145 synchronized(O) { 146 if(pagedata == null) { 147 pagedata = new PageData(); pagedata.setLastScrape(new Date().getTime()); 150 pagedata.setProxyPort(port); 152 pagedata.setProxyServer(server); 153 pagedata.setAuth(name, pass); 154 165 pageurls.put(url, pagedata); 167 } 168 } 169 } 170 return pagedata; 171 } 172 173 188 public final synchronized void setScrape(String id, String begin, 189 String end, String anchors, String strip) throws JspException { 190 191 ScrapeData scrape; Boolean checkvalstrip; 194 Boolean checkvalanchors; 195 196 if((scrape = (ScrapeData)scrapes.get(id)) == null) { 198 scrape = new ScrapeData(); scrape.setBegin(begin); 202 scrape.setEnd(end); 203 if (anchors != null) 204 scrape.setanchorsFlag(anchors); 205 if (strip != null) 206 scrape.setstripFlag(strip); 207 208 scrapes.put(id, scrape); newflag = true; } else if (((scrape.getBegin().compareTo(begin)) != 0) || 211 ((scrape.getEnd().compareTo(end)) != 0)) { 212 throw new JspException ("scrape id " + id + " is already in use"); 214 } else if (((scrape.getBegin().compareTo(begin)) == 0) || 215 ((scrape.getEnd().compareTo(end)) == 0)) { 216 checkvalstrip = new Boolean (strip); 218 checkvalanchors = new Boolean (anchors); 219 220 if (scrape.getanchorsFlag() != checkvalanchors.booleanValue()) { 222 if (anchors != null) { 224 scrape.setanchorsFlag(anchors); 225 changeflag = true; 227 } 228 } 229 if (scrape.getstripFlag() != checkvalstrip.booleanValue()) { 230 if (strip != null) { 232 scrape.setstripFlag(anchors); 233 changeflag = true; 235 } 236 } 237 } 238 } 239 240 241 249 public String getResults(String id) throws JspException { 250 ScrapeData scrape = (ScrapeData)scrapes.get(id); 251 252 try { 254 return scrape.getResult(); 255 } catch (NullPointerException ne) { 256 throw new JspException 257 ("page or scrapeid in result tag do not exist"); 258 } 259 } 260 261 267 protected final void setHeader(String name, String value) { 268 if (name == null) { 269 this.name = new ArrayList(5); 270 this.value = new ArrayList(5); 271 } 272 this.name.add(name); 273 this.value.add(value); 274 } 275 276 283 public ArrayList getHeaders() { 284 if (name == null) { 285 return null; 286 } else { 287 ArrayList list = new ArrayList(2); 288 list.add(name); 289 list.add(value); 290 return list; 291 } 292 } 293 294 298 public void setNewflag() { 299 newflag = false; 300 } 301 302 308 public boolean getNewFlag() { 309 return newflag; 310 } 311 312 318 public final void setProxyPort(int value) { 319 pport = value; 320 } 321 322 328 public final int getProxyPort() { 329 return pport; 330 } 331 332 338 public final void setProxyServer(String value) { 339 pserver = value; 340 } 341 342 348 public final String getProxyServer() { 349 return pserver; 350 } 351 357 public final void setClientPass(String value) { 358 sslclientpass = value; 359 } 360 361 369 public final void setAuth(String name, String pass) { 370 if (name != null && pass != null) 371 auth = "Basic " + 372 new BASE64Encoder().encode((name + ":" + pass).getBytes()); 373 } 374 375 381 public final String getAuth() { 382 return auth; 383 } 384 385 392 public final void setSSL(boolean value) { 393 ssl = value; 394 } 395 396 401 public final boolean getSSL() { 402 return ssl; 403 } 404 405 411 public final Set getKeySet() { 412 return scrapes.keySet(); 413 } 414 415 421 public void setLastScrape(long time) { 422 lastscrape = time; 423 } 424 425 431 public long getLastScrape() { 432 return lastscrape; 433 } 434 435 443 public void setExceptionText(String begin, String end) { 444 exceptiontext = new String 445 ("there is a syntax error in " + begin + " or " + end + 446 " for the scrape. A character probably needs to be escaped for perl\n" 447 + " See docs for help if you don't know perl"); 448 } 449 450 455 public void setException() { 456 exception = true; 457 } 458 459 466 public boolean getChangeFlag() { 467 return changeflag; 468 } 469 470 478 public ScrapeData getScrape(String key) { 479 return (ScrapeData)scrapes.get(key); 480 } 481 482 488 public void setPageContext(PageContext page) { 489 pagecontext = page; 490 } 491 492 498 public PageContext getPageContext() { 499 return pagecontext; 500 } 501 502 515 public void scrapePage(String url, long time, PageContext pc, String cs) 516 throws JspException { 517 long currenttime = new Date().getTime(); 519 if (((currenttime - lastscrape) > time) || newflag || changeflag) { 521 if (scraping.booleanValue() && !page.isAlive()) { 523 scraping = new Boolean (false); 524 } 525 if (!scraping.booleanValue()) { 526 synchronized (scraping) { 528 if ((page == null) || !page.isAlive()) { 529 try { 532 page = new Page(url, this, pc, cs); 533 540 } catch (MalformedURLException mue) { 541 pc.getServletContext().log("PageData.scrapePage(): " 542 + mue.getMessage()); 543 } 544 } 545 if ((((currenttime - lastscrape) > time) || newflag || 546 changeflag) && page != null) { 547 scraping = new Boolean (true); 549 page.start(); 550 } 551 } 552 } 553 } 554 555 if (scraping.booleanValue() && !page.isAlive()) { 558 scraping = new Boolean (false); 559 } 560 561 if (scraping.booleanValue() && (newflag || changeflag) && (page != null)) { 562 try { 563 page.join(); changeflag = false; scraping = new Boolean (false); } catch (InterruptedException ie) { 567 pc.getServletContext(). 569 log("PageData.scrapePage(): Page thread interrupted " 570 + ie.toString()); 571 } 572 } 573 if (exception) { 576 exception = false; throw new JspException(exceptiontext); 578 } 579 } 580 } 581 582 588 class Page extends Thread { 589 590 private HttpConnection connection; private long lastmodified; private long expires; private URL url; private PageData pagedata; private char source[]; 597 private final long MAX_BUFFER_SIZE = 50000; 599 private PageContext pageContext; 601 private boolean proxy = false; 603 private int pport = -1; 605 private String pserver = null; 607 private String authstring = null; 609 private boolean ssl = false; 612 private String charset = null; 614 615 627 Page(String url, PageData page, PageContext pc, String cs) 628 throws MalformedURLException { 630 this.url = new URL(url); 631 if(this.url.getFile().length() == 0) 634 this.url = new URL(url + "/"); 635 pagedata = page; 636 pageContext = pc; 637 charset = cs; 638 } 640 641 656 Page(String url, PageData page, PageContext pc, int port, String server, 657 String proxyauth) throws MalformedURLException { 658 this.url = new URL(url); 660 if(this.url.getFile().length() == 0) 663 this.url = new URL(url + "/"); 664 pagedata = page; 665 pageContext = pc; 666 pport = port; 667 pserver = server; 668 authstring = proxyauth; 669 proxy = true; 670 } 672 673 public void run() { 674 long current = new Date().getTime(); 676 try { 678 connection = new HttpConnection(url, pagedata, pageContext); 680 687 connection.setRequestMethod("HEAD"); 688 connection.connect(); 689 connection.sendRequest(); 690 691 if (connection.getResponseCode() >= 300) { 694 pageContext.getServletContext(). 695 log("Page.run(): Error Occured: " 696 + connection.getResponseMessage()); 697 } else { 698 if ((expires =(long)connection.getExpiration()) == 0) 700 expires = current - 1; 702 703 if((expires < current) || pagedata.getNewFlag() || 706 pagedata.getChangeFlag()) { 707 708 if ((lastmodified = (long)connection.getLastModified()) == 0) 711 lastmodified = pagedata.getLastScrape() + 1; 713 714 if ((pagedata.getLastScrape() < lastmodified) || 717 pagedata.getNewFlag() || pagedata.getChangeFlag()) { 718 719 connection.disconnect(); 721 722 pagedata.setLastScrape(current); 724 725 connection.setRequestMethod("GET"); 727 connection.connect(); 729 connection.sendRequest(); 730 731 if (connection.getResponseCode() >= 300) { 733 pageContext.getServletContext(). 734 log("Page.run(): Error Occured: " + 735 connection.getResponseMessage()); 736 return; 738 } 739 740 if (streamtochararray(connection.getInputStream(),charset)) { 743 scrape(); 745 } 746 connection.disconnect(); 748 } 749 } 750 } 751 } catch (IOException ee) { 752 pageContext.getServletContext(). 753 log("Page.run(): " + ee.toString()); 754 } 755 } 756 757 766 private boolean streamtochararray(InputStream in, String charset) { 767 long sourcelength = 50000; StringBuffer temp; boolean returnvalue = true; InputStreamReader input = null; 772 if ( charset == null ) { 773 input = new InputStreamReader(in); 774 } else { 775 try { 776 input = new InputStreamReader(in, charset); 777 } catch( UnsupportedEncodingException exc ) { 778 System.err.println( "WARNING: unsupported charset " + charset + ". Using default." ); 779 input = new InputStreamReader(in); 780 } 781 } 782 boolean chop = false; int offset = 0; int num; 786 787 sourcelength = (long)connection.getHeaderFieldInt("Content-Length", 788 (int)MAX_BUFFER_SIZE); 789 790 if ((sourcelength > MAX_BUFFER_SIZE)) { 792 sourcelength = MAX_BUFFER_SIZE; 793 } 794 source = new char[(int)sourcelength]; 795 796 boolean check = false; 799 try { while((num = input.read(source, offset, 801 (int)(sourcelength - offset))) > 0) { 802 offset += num; 803 check = true; 804 } 805 } catch (IOException e) { 807 if (!check) 808 returnvalue = false; 809 pageContext.getServletContext().log("Page.streamtochararray(): Error " + 810 "ocured while reading the " + 811 "inputstream " + e.toString()); 812 } 813 814 if (chop) { 815 temp = new StringBuffer ().append(source); 817 source = new char[temp.length() + 1]; temp.getChars(0, temp.length(), source,0); } 820 return returnvalue; 821 } 822 823 829 public void scrape() { 830 831 Perl5Compiler compiler = new Perl5Compiler(); 833 Perl5Matcher matcher = new Perl5Matcher(); 835 Perl5Pattern pattern = null; MatchResult result; PatternMatcherInput input; 840 String match; ScrapeData sd; Set scrapedatakeys = pagedata.getKeySet(); 844 Iterator scrapesit = scrapedatakeys.iterator(); 846 Iterator scrapesit1 = scrapedatakeys.iterator(); 847 String regex = new String (); 849 850 while(scrapesit.hasNext()) { 852 853 sd = pagedata.getScrape((String )scrapesit.next()); 855 856 regex = regex.concat(sd.getBegin().concat(".*?").concat(sd.getEnd())); 858 859 try { 861 pattern = (Perl5Pattern)compiler.compile(regex, 863 Perl5Compiler.SINGLELINE_MASK); 864 } catch (MalformedPatternException e) { 865 pagedata.setException(); 868 pagedata.setExceptionText(sd.getBegin(), sd.getEnd()); 869 } 870 871 matcher.contains(source, pattern); 873 result = matcher.getMatch(); 875 if(result != null) { 876 StringBuffer matchbuffer = new StringBuffer (); 878 matchbuffer.append(result.toString()); 879 match = new String (matchbuffer); 880 881 if(!sd.getanchorsFlag()) { 884 match = match.substring(sd.getBegin().length(), 886 match.lastIndexOf(sd.getEnd())); 887 } 888 889 if(sd.getstripFlag()) { 891 regex = ">.*?<"; 892 String finalresult = new String (); 894 try { 897 pattern = (Perl5Pattern)compiler.compile(regex, 899 Perl5Compiler.SINGLELINE_MASK); 900 } catch (MalformedPatternException e) { 901 } 904 905 if ((match.indexOf('<') < match.indexOf('>')) && 908 (match.indexOf('<') != 0)) 909 finalresult = match.substring(0, match.indexOf('<')). 910 concat(" "); 911 912 input = new PatternMatcherInput(match); 914 915 while(matcher.contains(input, pattern)) { 917 918 matchbuffer.setLength(0); 920 result = matcher.getMatch(); matchbuffer.append(result.toString()); 923 match = new String (matchbuffer); 924 925 if (match.length() > 2) { 927 finalresult = finalresult.concat(match.substring(1, 928 match.indexOf('<'))).concat(" "); 929 } 930 } 931 932 match = input.toString(); 935 936 if (match.lastIndexOf('>') > match.lastIndexOf('<')) 938 finalresult = finalresult.concat(match.substring( 939 match.lastIndexOf('>') + 1, match.length())); 940 941 match = finalresult; 943 } 944 sd.setResult(match); pagedata.setNewflag(); } else { 947 sd.setResult(""); 948 } 949 regex = ""; match = null; } 952 } 953 } 954 | Popular Tags |