KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > taglibs > scrape > PageData


1 /*
2  * Copyright 1999,2004 The Apache Software Foundation.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */

16
17 package org.apache.taglibs.scrape;
18
19 import java.util.*;
20 import java.io.*;
21 import java.net.*;
22 import javax.servlet.jsp.*;
23 import sun.misc.BASE64Encoder;
24 import org.apache.oro.text.regex.*;
25
26 /**
27  * PageData - An object used to store information about a scrape done by the
28  * tags in the scrape package.
29  *
30  * @author Rich Catlett
31  *
32  * @version 1.0
33  *
34  */

35 public class PageData {
36
37     /**
38      * static HashMap holds all of the pagedata objects keyed to the url
39      */

40     public static HashMap pageurls = new HashMap();
41     /**
42      * static object used for synchronization
43      */

44     private static Object JavaDoc O = new Object JavaDoc();
45     /**
46      *stores data on each scrape, the data is stored in the order of the scrape
47      */

48     private HashMap scrapes = new HashMap();
49     /**
50      * boolean flag tells object that new scrapes for this page have been added
51      */

52     private boolean newflag;
53     /**
54      * boolean flag that marks if a change has been made to the flags in
55      * a scrape data object
56      */

57     private boolean changeflag;
58     /**
59      * Boolean object used to synchronize scrapes and it keeps track of when a
60      * scrape is occuring
61      */

62     private Boolean JavaDoc scraping = new Boolean JavaDoc(false);
63     /**
64      * time the last scrape occured determines if page needs to be requested
65      * again
66      */

67     private long lastscrape = 0;
68     /**
69      * PageContext object for the calling JSP page used to access the
70      * ServletContext for logging error to the server
71      */

72     private PageContext pagecontext;
73     // the following two variables are needed because the start method of
74
// Thread cannot throw a JspException
75
/**
76      * flag marks if a MalformedPatternException was thrown in scrape()
77      */

78     private boolean exception;
79     /**
80      * if an exception was thrown holds info on the error
81      */

82     private String JavaDoc exceptiontext;
83     /**
84      * thread that gets the requested page and then runs the scrape on the page
85      */

86     private Page page;
87     /**
88      * the port to use for the proxy connection
89      */

90     private int pport =-1;
91     /**
92      * the proxy server to use for the connection
93      */

94     private String JavaDoc pserver = null;
95     /**
96      * auth string for basic authentication to the proxy server username:password
97      */

98     private String JavaDoc auth = null;
99     /**
100      * boolean value determines if the connection to to travel via a secure
101      * connection
102      */

103     private boolean ssl = false;
104     /**
105      * password to the client keystore for ssl client side authentication
106      */

107     private String JavaDoc sslclientpass = null;
108     /**
109      * list of names extra headers to add
110      */

111     private ArrayList name = new ArrayList(10);
112     /**
113      * list of values of extra headers to add
114      */

115     private ArrayList value = new ArrayList(10);
116
117     /**
118      * constructor for the class simply creates an instance of the PageData
119      * object
120      */

121     public PageData() {}
122
123     /**
124      * method checks the static HashMap pageurls for the given url if it exists
125      * it is returned, otherwise a new pagedata object is created and added
126      * to the HashMap pageurls
127      *
128      * @param url the url of the page given in the calling PageTag
129      * @param port the port to use for the connection to the proxy server
130      * @param server the proxy server
131      * @param secure is the link going over https
132      * @param name user name for authentication
133      * @param pass password for authentication
134      *
135      * @return a PageData object either newly created or one already in
136      * existence
137      */

138     public static PageData getPage(String JavaDoc url, int port, String JavaDoc server,
139                    String JavaDoc name, String JavaDoc pass) {
140          //public static PageData getPage(String url, int port, String server,
141
// boolean ssl, String name, String pass, String sslpass) {
142
PageData pagedata;
143
144     if((pagedata = (PageData)pageurls.get(url)) == null) {
145         synchronized(O) {
146             if(pagedata == null) {
147                 pagedata = new PageData(); // create new pagedata object
148
// set time of last scrape to current system time
149
pagedata.setLastScrape(new Date().getTime());
150             // set the proxyport and proxy server for this connection
151
pagedata.setProxyPort(port);
152             pagedata.setProxyServer(server);
153             pagedata.setAuth(name, pass);
154             /*
155             // set secure for the connection
156             pagedata.setSSL(ssl);
157             // set the keystore password for client side authentication
158             pagedata.setClientPass(sslpass);
159             // if secure == true set system property so that https will be
160             // understood
161             if (ssl)
162             System.setProperty("java.protocol.handler.pkgs",
163                        "com.sun.net.ssl.internal.www.protocol");
164             */

165                     // add pagedata object to static Hashmap
166
pageurls.put(url, pagedata);
167         }
168         }
169     }
170     return pagedata;
171     }
172
173     /**
174      * method to add an object to the HashMap scrapes, it first checks to see
175      * if the object already exists
176      *
177      * @param id unique identifier of the scrape the following attributes
178      * define
179      * @param begin beginning anchor for the scrape refered to by id
180      * @param end ending anchor for the scrape refered to by id
181      * @param anchors boolean flag that determines if begin and end anchors are
182      * part of the result
183      * @param strip boolean flag that determines if tags are to be striped from
184      * the result
185      *
186      * @return a boolean value, true if a new scrape has been added else false
187      */

188     public final synchronized void setScrape(String JavaDoc id, String JavaDoc begin,
189          String JavaDoc end, String JavaDoc anchors, String JavaDoc strip) throws JspException {
190
191     ScrapeData scrape; // used to test for equality of ScrapeData objects
192
// boolean objects used to check if a flag value has been changed
193
Boolean JavaDoc checkvalstrip;
194     Boolean JavaDoc checkvalanchors;
195
196     // check to see if this scrape already exists
197
if((scrape = (ScrapeData)scrapes.get(id)) == null) {
198         // scrape does not already exist
199
scrape = new ScrapeData(); // create new scrapedata object
200
// set the attributes of the new scrapedata object
201
scrape.setBegin(begin);
202         scrape.setEnd(end);
203         if (anchors != null)
204         scrape.setanchorsFlag(anchors);
205         if (strip != null)
206         scrape.setstripFlag(strip);
207
208             scrapes.put(id, scrape); // add scrape to HashMap scrapes
209
newflag = true; // a new scrape has been added to this page
210
} else if (((scrape.getBegin().compareTo(begin)) != 0) ||
211                    ((scrape.getEnd().compareTo(end)) != 0)) {
212             // the two scrapedata objects are different
213
throw new JspException ("scrape id " + id + " is already in use");
214     } else if (((scrape.getBegin().compareTo(begin)) == 0) ||
215                    ((scrape.getEnd().compareTo(end)) == 0)) {
216         // set check objects so that boolean can be compared to String
217
checkvalstrip = new Boolean JavaDoc(strip);
218         checkvalanchors = new Boolean JavaDoc(anchors);
219
220         // scrape is old scrape check to see if any flags have been changed
221
if (scrape.getanchorsFlag() != checkvalanchors.booleanValue()) {
222             // reset anchors flag
223
if (anchors != null) {
224             scrape.setanchorsFlag(anchors);
225             // indicate change has been made to flags
226
changeflag = true;
227             }
228         }
229         if (scrape.getstripFlag() != checkvalstrip.booleanValue()) {
230             // reset strip flag
231
if (strip != null) {
232             scrape.setstripFlag(anchors);
233             // indicate change has been made to flags
234
changeflag = true;
235             }
236         }
237     }
238     }
239
240
241     /**
242      * getter method for a result string from a scrapedata object
243      *
244      * @param id unique key for requested scrape
245      *
246      * @return results from scrapeidentified by id
247      *
248      */

249     public String JavaDoc getResults(String JavaDoc id) throws JspException {
250     ScrapeData scrape = (ScrapeData)scrapes.get(id);
251
252         // error will occur if page or scrapeid in result tag does not exist
253
try {
254         return scrape.getResult();
255     } catch (NullPointerException JavaDoc ne) {
256         throw new JspException
257                             ("page or scrapeid in result tag do not exist");
258     }
259     }
260
261     /**
262      * set the name and value of any extra headers to be sent
263      *
264      * @param name string that is the name of an extra header to be sent
265      * @param value string that is the value of an extra header to be sent
266      */

267     protected final void setHeader(String JavaDoc name, String JavaDoc value) {
268         if (name == null) {
269             this.name = new ArrayList(5);
270             this.value = new ArrayList(5);
271     }
272     this.name.add(name);
273         this.value.add(value);
274     }
275
276     /**
277      * get the http headers
278      *
279      * @return name ArrayList of the names of http headers to be sent
280      * @return value ArrayList or the values of http headers to be sent
281      *
282      */

283     public ArrayList getHeaders() {
284         if (name == null) {
285             return null;
286     } else {
287             ArrayList list = new ArrayList(2);
288             list.add(name);
289             list.add(value);
290             return list;
291     }
292     }
293
294     /**
295      * setter method for newflag only called to set newflag to false
296      *
297      */

298     public void setNewflag() {
299     newflag = false;
300     }
301
302     /**
303      * getter method for newflag
304      *
305      * @return boolean value of newflag true if new scrape exists else false
306      *
307      */

308     public boolean getNewFlag() {
309     return newflag;
310     }
311
312     /**
313      * set the value of proxy port
314      *
315      * @param value the proxy port to use for the connection as a String
316      *
317      */

318     public final void setProxyPort(int value) {
319     pport = value;
320     }
321
322     /**
323      * get the value of the proxy port
324      *
325      * @return - int the proxy port number
326      *
327      */

328     public final int getProxyPort() {
329         return pport;
330     }
331
332     /**
333      * set the value of proxy server
334      *
335      * @param value the proxy server to use for the connection
336      *
337      */

338     public final void setProxyServer(String JavaDoc value) {
339     pserver = value;
340     }
341
342     /**
343      * get the value of the proxy port
344      *
345      * @return - String the name or the proxy server
346      *
347      */

348     public final String JavaDoc getProxyServer() {
349         return pserver;
350     }
351    /**
352      * set the pass word to access the client keystore
353      *
354      * @param value password to the client keystore
355      *
356      */

357     public final void setClientPass(String JavaDoc value) {
358     sslclientpass = value;
359     }
360
361     /**
362      * set the username and password values for authentication to the proxy server
363      *
364      * @param name username
365      * @param pass password
366      * @param base64 base64 encoded username and password
367      *
368      */

369     public final void setAuth(String JavaDoc name, String JavaDoc pass) {
370     if (name != null && pass != null)
371         auth = "Basic " +
372            new BASE64Encoder().encode((name + ":" + pass).getBytes());
373     }
374
375     /**
376      * get the base64 encoded auth string for proxy authorization
377      *
378      * @return - String base64 encoded authorization for the proxy server
379      *
380      */

381     public final String JavaDoc getAuth() {
382         return auth;
383     }
384
385     /**
386      * set the value of secure
387      *
388      * @param value true if the connection is to be made via https the default
389      * is false
390      *
391      */

392     public final void setSSL(boolean value) {
393     ssl = value;
394     }
395
396     /**
397      * get secure
398      *
399      * @return - boolean value of secure flag
400      */

401     public final boolean getSSL() {
402         return ssl;
403     }
404
405     /**
406      * getter method for the key set of the HashMap scrapes
407      *
408      * @return the keys that the scrapes for this page are hashed to
409      *
410      */

411     public final Set getKeySet() {
412     return scrapes.keySet();
413     }
414
415     /**
416      * setter method for lastscrape
417      *
418      * @param time current time
419      *
420      */

421     public void setLastScrape(long time) {
422     lastscrape = time;
423     }
424
425     /**
426      * getter method for lastscrape
427      *
428      * @param time current time
429      *
430      */

431     public long getLastScrape() {
432     return lastscrape;
433     }
434
435     /**
436      * sets the exception text to make the error in the jsp page easier for the
437      * author to find
438      *
439      * @param begin the beginning marker for scrape where error occured
440      * @param end the end marker for the scrape where error occured
441      *
442      */

443     public void setExceptionText(String JavaDoc begin, String JavaDoc end) {
444         exceptiontext = new String JavaDoc
445                     ("there is a syntax error in " + begin + " or " + end +
446        " for the scrape. A character probably needs to be escaped for perl\n"
447        + " See docs for help if you don't know perl");
448     }
449
450     /**
451      * set exception to true, a malformedpatternexception has been thrown in
452      * page
453      *
454      */

455     public void setException() {
456     exception = true;
457     }
458
459     /**
460      * getter method for changeflag
461      *
462      * @return boolean value of changeflag true if a scrape has been changed
463      * else false
464      *
465      */

466     public boolean getChangeFlag() {
467     return changeflag;
468     }
469
470     /**
471      * getter method for a single scrapedata object from HashMap scrapes
472      *
473      * @param key the value the requested scrapedata object is keyed to
474      *
475      * @return the requested scrapedata boject
476      *
477      */

478     public ScrapeData getScrape(String JavaDoc key) {
479     return (ScrapeData)scrapes.get(key);
480     }
481
482     /**
483      * setter method for pagecontext
484      *
485      * @param page the page context object for this page
486      *
487      */

488     public void setPageContext(PageContext page) {
489     pagecontext = page;
490     }
491
492     /**
493      * getter method for pagecontext
494      *
495      * returns the PageContext object for this page
496      *
497      */

498     public PageContext getPageContext() {
499     return pagecontext;
500     }
501
502     /**
503      * checks the scrapeint, if enough time has passed it starts the getPage
504      * thread to go out and get the page, and sets scrapeing to true, then if
505      * newflag it will wait for the thread to finish running, otherwise it will
506      * just fall through and return the already stored results
507      *
508      * @param url url of the page to be scraped
509      * @param time length of time to wait before rescrape
510      * @param proxy boolean value that says whether or not to use a proxy server
511      * @param pc PageContext for this JSP page
512      * @param cs charset to be used to scrape the page
513      *
514      */

515    public void scrapePage(String JavaDoc url, long time, PageContext pc, String JavaDoc cs)
516      throws JspException {
517     long currenttime = new Date().getTime(); // get the current time
518

519     // check to see if a scrape is needed
520
if (((currenttime - lastscrape) > time) || newflag || changeflag) {
521         // if it is time to rescrape but scraping did not get reset, reset it
522
if (scraping.booleanValue() && !page.isAlive()) {
523         scraping = new Boolean JavaDoc(false);
524         }
525         if (!scraping.booleanValue()) {
526                 // if a scrape is in progress wait until scrape is finished
527
synchronized (scraping) {
528                 if ((page == null) || !page.isAlive()) {
529                         // create thread page if it doesn't exist check for a
530
// proxy connection
531
try {
532                             page = new Page(url, this, pc, cs);
533                 /*if (pport != -1 && pserver != null)
534                 page = new Page(url, this, pc, pport, pserver, auth);
535              //page = new Page(url, this, pc, pport, pserver, ssl, auth);
536                            else
537                 page = new Page(url, this, pc);
538                 //page = new Page(url, this, pc, ssl);
539                             */

540             } catch (MalformedURLException mue) {
541                 pc.getServletContext().log("PageData.scrapePage(): "
542                                + mue.getMessage());
543             }
544             }
545                 if ((((currenttime - lastscrape) > time) || newflag ||
546             changeflag) && page != null) {
547             // set scraping flag
548
scraping = new Boolean JavaDoc(true);
549                         page.start();
550                 }
551                 }
552         }
553     }
554
555     // reset the scraping flag to false if the page thread is not running but
556
// the scraping flag is true
557
if (scraping.booleanValue() && !page.isAlive()) {
558         scraping = new Boolean JavaDoc(false);
559     }
560
561         if (scraping.booleanValue() && (newflag || changeflag) && (page != null)) {
562         try {
563             page.join(); // wait for scrape to finish
564
changeflag = false; // reset changeflag
565
scraping = new Boolean JavaDoc(false); // done scraping reset flag
566
} catch (InterruptedException JavaDoc ie) {
567         // exception shouldn't happen if it does log it to the server
568
pc.getServletContext().
569                                log("PageData.scrapePage(): Page thread interrupted "
570                    + ie.toString());
571         }
572     }
573     // check to see if a MalformedPatternException was thrown if so throw
574
// JspException for JSP page builder
575
if (exception) {
576         exception = false; // reset exception flag
577
throw new JspException(exceptiontext);
578     }
579     }
580 }
581
582  /**
583   * Create an http request for the specified URL, check to see if time has
584   * elapsed, if so get page, check last modified header of page, and if
585   * necessary make the request
586   *
587   */

588 class Page extends Thread JavaDoc {
589
590     private HttpConnection connection; // object to create an http request
591
private long lastmodified; // time the page was last modified
592
private long expires; // http header = time the page expires
593
private URL url; // url from the page to be scraped
594
private PageData pagedata; // pagedata object that holds data on this url
595
// char array to hold the source page from the http request
596
private char source[];
597     // max size of the buffer that the http request is read into
598
private final long MAX_BUFFER_SIZE = 50000;
599     // pagecontext that the servlet resides in, used for logging to the server
600
private PageContext pageContext;
601     // value determines if a proxy server is to be used for the http connection
602
private boolean proxy = false;
603     // the port to use for the proxy connection
604
private int pport = -1;
605     // the proxy server to use for the connection
606
private String JavaDoc pserver = null;
607     // authentication string for authentication to the proxy server name:password
608
private String JavaDoc authstring = null;
609     // boolean value determines if the connection to to travel via a secure
610
// connection
611
private boolean ssl = false;
612     // charset to be used to scrape the page
613
private String JavaDoc charset = null;
614
615     /**
616      * Constructor for Page
617      *
618      * @param url the URL of the page to get scraped
619      * @param page PageData object for the page to get scraped
620      * @param pc PageContext the taglibrary is running in used for logging
621      * @param secure boolean flag to determine if the connection is via http of https
622      * @param cs charset to be used to scrape the page
623      *
624      * @throws MalformedURLException -
625      *
626      */

627     Page(String JavaDoc url, PageData page, PageContext pc, String JavaDoc cs)
628     //Page(String url, PageData page, PageContext pc, boolean secure)
629
throws MalformedURLException {
630     this.url = new URL(url);
631     // get the file part of the url make it "/" if it doesn't exist so that the
632
// front page can be accessed
633
if(this.url.getFile().length() == 0)
634             this.url = new URL(url + "/");
635     pagedata = page;
636     pageContext = pc;
637     charset = cs;
638     //ssl = secure;
639
}
640
641     /**
642      * Constructor for Page
643      *
644      * @param url the URL of the page to get scraped
645      * @param page PageData object for the page to get scraped
646      * @param pc PageContext the taglibrary is running in used for logging
647      * @param port the port to connect to on the proxy server
648      * @param server the proxy server to connect to
649      * @param secure boolean flag to determine if the connection is via http of https
650      * @param name username used for authentication to the proxy server
651      * @param pass password used for authentication to the proxy server
652      *
653      * @throws MalformedURLException -
654      *
655      */

656     Page(String JavaDoc url, PageData page, PageContext pc, int port, String JavaDoc server,
657          String JavaDoc proxyauth) throws MalformedURLException {
658         //boolean secure, String proxyauth) throws MalformedURLException {
659
this.url = new URL(url);
660     // get the file part of the url make it "/" if it doesn't exist so that the
661
// front page can be accessed
662
if(this.url.getFile().length() == 0)
663             this.url = new URL(url + "/");
664     pagedata = page;
665     pageContext = pc;
666     pport = port;
667     pserver = server;
668     authstring = proxyauth;
669     proxy = true;
670     //ssl = secure;
671
}
672
673     public void run() {
674         long current = new Date().getTime(); // get current time
675

676         // make http connection to url
677
try {
678          // create new HttpUrlConnection
679
connection = new HttpConnection(url, pagedata, pageContext);
680          /*if (!proxy)
681          //connection = new HttpConnection(url, ssl, pageContext);
682          connection = new HttpConnection(url, pageContext);
683              else
684          //new HttpConnection(url, pport, pserver, ssl, authstring);
685          connection = new HttpConnection(url, pport, pserver,
686          authstring, pageContext);*/

687          connection.setRequestMethod("HEAD");
688          connection.connect();
689          connection.sendRequest();
690
691          // check response status code a code of 200 is a successful
692
// connection
693
if (connection.getResponseCode() >= 300) {
694          pageContext.getServletContext().
695                    log("Page.run(): Error Occured: "
696                + connection.getResponseMessage());
697          } else {
698          // get expires header
699
if ((expires =(long)connection.getExpiration()) == 0)
700              // do this if header does not exist
701
expires = current - 1;
702
703              // check for a new scrape for this page or that the Expires
704
// time for the page has passed
705
if((expires < current) || pagedata.getNewFlag() ||
706             pagedata.getChangeFlag()) {
707
708              // get lastmodified header
709
// getLastModified returns 0 if header does not exist
710
if ((lastmodified = (long)connection.getLastModified()) == 0)
711              // do this if header does not exist
712
lastmodified = pagedata.getLastScrape() + 1;
713
714              // check for a new scrape for this page or that Last-
715
// Modified time for the page has passed
716
if ((pagedata.getLastScrape() < lastmodified) ||
717                             pagedata.getNewFlag() || pagedata.getChangeFlag()) {
718
719              // dissconnect so a new connection for a GET can be made
720
connection.disconnect();
721
722              // set lastscrape
723
pagedata.setLastScrape(current);
724
725              // set the request method to get
726
connection.setRequestMethod("GET");
727              // make the connection
728
connection.connect();
729              connection.sendRequest();
730
731              // check responce code from connection
732
if (connection.getResponseCode() >= 300) {
733                      pageContext.getServletContext().
734                                 log("Page.run(): Error Occured: " +
735                         connection.getResponseMessage());
736                  // the connection did not occur return cached data
737
return;
738              }
739
740              // read http request into buffer return value is false
741
// if an error occured
742
if (streamtochararray(connection.getInputStream(),charset)) {
743                  // perform the scrapes on this page
744
scrape();
745              }
746              // close the connection
747
connection.disconnect();
748              }
749              }
750          }
751      } catch (IOException ee) {
752          pageContext.getServletContext().
753              log("Page.run(): " + ee.toString());
754          }
755      }
756
757     /**
758      * Helper routine to read the input stream into a char array
759      *
760      * @param in The inputstream from the Http request
761      *
762      * @return a value of true if no error occured in reading the input stream
763      * otherwise false
764      *
765      */

766     private boolean streamtochararray(InputStream in, String JavaDoc charset) {
767         long sourcelength = 50000; // length of buffer inputstream is read into
768
StringBuffer JavaDoc temp; // buffer used to chop unused portion of source
769
boolean returnvalue = true; // no error in reading from input stream
770
// create a char stream from a byte stream
771
InputStreamReader input = null;
772     if ( charset == null ) {
773       input = new InputStreamReader(in);
774     } else {
775       try {
776         input = new InputStreamReader(in, charset);
777       } catch( UnsupportedEncodingException exc ) {
778         System.err.println( "WARNING: unsupported charset " + charset + ". Using default." );
779         input = new InputStreamReader(in);
780       }
781     }
782     boolean chop = false; // flag tells whether or not to truncate buffer
783
int offset = 0; // offset in the input stream to start reading from
784
int num; // number of chars read from the input stream
785

786
787         sourcelength = (long)connection.getHeaderFieldInt("Content-Length",
788                    (int)MAX_BUFFER_SIZE);
789
790         // check that sourcelength is not greater than max allowed or 0
791
if ((sourcelength > MAX_BUFFER_SIZE)) {
792         sourcelength = MAX_BUFFER_SIZE;
793     }
794     source = new char[(int)sourcelength];
795
796     // flag marks if the inputstream was read at least once. it may contain
797
// enough to scrape
798
boolean check = false;
799         try { // read the input stream into the buffer
800
while((num = input.read(source, offset,
801                           (int)(sourcelength - offset))) > 0) {
802         offset += num;
803         check = true;
804             }
805     // error occured in reading input stream set return value to false
806
} catch (IOException e) {
807         if (!check)
808         returnvalue = false;
809         pageContext.getServletContext().log("Page.streamtochararray(): Error " +
810                                                 "ocured while reading the " +
811                         "inputstream " + e.toString());
812     }
813
814     if (chop) {
815         // truncate any extra buffer space if it wasn't needed
816
temp = new StringBuffer JavaDoc().append(source);
817             source = new char[temp.length() + 1]; // create new buffer
818
temp.getChars(0, temp.length(), source,0); // fill new buffer
819
}
820         return returnvalue;
821     }
822
823     /**
824      * Using regular expressions, parse the source from the http request for a
825      * string specified by the delimiter strings obtained from the Scrape tag
826      * then check flags and store the results
827      *
828      */

829   public void scrape() {
830
831     // object to compile regular expressions
832
Perl5Compiler compiler = new Perl5Compiler();
833     // object to match compiled regular expressions
834
Perl5Matcher matcher = new Perl5Matcher();
835     Perl5Pattern pattern = null; // pattern to be compiled into regex
836
// used to perserve state across calls to contains() method
837
MatchResult result; // class for accessing results of pattern match
838
// matcher class used to preserve position across calls to contains
839
PatternMatcherInput input;
840     String JavaDoc match; // string value of result for dropping of end markers
841
ScrapeData sd; // data object that holds data on current scrape
842
// set of keys for the hashmap scrapes
843
Set scrapedatakeys = pagedata.getKeySet();
844     // iterator for scrapedatakeys
845
Iterator scrapesit = scrapedatakeys.iterator();
846     Iterator scrapesit1 = scrapedatakeys.iterator();
847     // String variable that will become the regular expression
848
String JavaDoc regex = new String JavaDoc();
849
850     // iterate through the scrapedata objects and perform a scrape for each one
851
while(scrapesit.hasNext()) {
852
853         // get next item from HashMap scrapedata
854
sd = pagedata.getScrape((String JavaDoc)scrapesit.next());
855
856         // build the regular expression
857
regex = regex.concat(sd.getBegin().concat(".*?").concat(sd.getEnd()));
858
859         //attempt to compile the pattern try to catch MalformedPatternException
860
try {
861            // compile pattern with singleline_mask
862
pattern = (Perl5Pattern)compiler.compile(regex,
863                                Perl5Compiler.SINGLELINE_MASK);
864         } catch (MalformedPatternException e) {
865         // if exception occurs store it to be kicked out later otherwise
866
// could not get out of run()
867
pagedata.setException();
868         pagedata.setExceptionText(sd.getBegin(), sd.getEnd());
869         }
870
871         matcher.contains(source, pattern); // attempt to match the regex
872

873         result = matcher.getMatch(); // get returned match
874

875         if(result != null) {
876             // get the result to a String value from a string
877
StringBuffer JavaDoc matchbuffer = new StringBuffer JavaDoc();
878             matchbuffer.append(result.toString());
879             match = new String JavaDoc(matchbuffer);
880
881         // default value is false and begin and end anchors are not part of
882
// the scrape
883
if(!sd.getanchorsFlag()) {
884         // chop begin and end marker from final result
885
match = match.substring(sd.getBegin().length(),
886                                          match.lastIndexOf(sd.getEnd()));
887             }
888
889         // if stripflag remove tags from result string
890
if(sd.getstripFlag()) {
891                 regex = ">.*?<";
892                 // string that is built as tags are removed from result
893
String JavaDoc finalresult = new String JavaDoc();
894                 // attempt to compile the pattern try to catch
895
//MalformedPatternException
896
try {
897                     // compile pattern with singleline_mask
898
pattern = (Perl5Pattern)compiler.compile(regex,
899                                Perl5Compiler.SINGLELINE_MASK);
900                 } catch (MalformedPatternException e) {
901             // an error will never occur here since the regex is
902
// predefined
903
}
904
905         // if there is text before the first tag add it to the
906
// final result
907
if ((match.indexOf('<') < match.indexOf('>')) &&
908                                             (match.indexOf('<') != 0))
909             finalresult = match.substring(0, match.indexOf('<')).
910                                                            concat(" ");
911
912         // set patternmatcherinput so multiple matches can be made
913
input = new PatternMatcherInput(match);
914
915                 // loop until no tags are left in result string
916
while(matcher.contains(input, pattern)) {
917
918             matchbuffer.setLength(0); // reset matchbuffer for reuse
919

920                     result = matcher.getMatch(); // get the result
921
// add current result to to final result string
922
matchbuffer.append(result.toString());
923                     match = new String JavaDoc(matchbuffer);
924
925             // check to see that match is not a match on ><
926
if (match.length() > 2) {
927                         finalresult = finalresult.concat(match.substring(1,
928                                            match.indexOf('<'))).concat(" ");
929             }
930             }
931
932         // reset match to original string so that any trailing text can
933
// be added to final result
934
match = input.toString();
935
936         // if there is trailing text outside of tags add to the result
937
if (match.lastIndexOf('>') > match.lastIndexOf('<'))
938             finalresult = finalresult.concat(match.substring(
939                                   match.lastIndexOf('>') + 1, match.length()));
940
941             match = finalresult; // set final result of scrape to match
942

943         }
944             sd.setResult(match); // set scrape results in scrapedata object
945
pagedata.setNewflag(); // successful scrape set newflag to false
946
} else {
947             sd.setResult("");
948         }
949     regex = ""; // clear regex for use again
950
match = null; // clear match for reuse
951
}
952   }
953 }
954
Popular Tags