PageData


1   /*
2    * Copyright 1999,2004 The Apache Software Foundation.
3    * 
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * 
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    * 
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package org.apache.taglibs.scrape;
18  
19  import java.util.*;
20  import java.io.*;
21  import java.net.*;
22  import javax.servlet.jsp.*;
23  import sun.misc.BASE64Encoder;
24  import org.apache.oro.text.regex.*;
25  
26  /**
27   * PageData - An object used to store information about a scrape done by the 
28   *            tags in the scrape package.
29   *
30   * @author Rich Catlett
31   *
32   * @version 1.0
33   *
34   */
35  public class PageData {
36  
37      /**
38       * static HashMap holds all of the pagedata objects keyed to the url
39       */
40      public static HashMap pageurls = new HashMap();
41      /**
42       * static object used for synchronization
43       */
44      private static Object   O = new Object  ();
45      /**
46       *stores data on each scrape, the data is stored in the order of the scrape
47       */
48      private HashMap scrapes = new HashMap();
49      /**
50       * boolean flag tells object that new scrapes for this page have been added
51       */
52      private boolean newflag;
53      /**
54       * boolean flag that marks if a change has been made to the flags in 
55       * a scrape data object
56       */
57      private boolean changeflag;
58      /**
59       * Boolean object used to synchronize scrapes and it keeps track of when a
60       * scrape is occuring
61       */
62      private Boolean   scraping = new Boolean  (false);
63      /**
64       * time the last scrape occured determines if page needs to be requested
65       * again
66       */
67      private long lastscrape = 0;
68      /**
69       * PageContext object for the calling JSP page used to access the
70       * ServletContext for logging error to the server
71       */
72      private PageContext pagecontext;
73      // the following two variables are needed because the start method of 
74      // Thread cannot throw a JspException
75      /**
76       * flag marks if a MalformedPatternException was thrown in scrape()
77       */
78      private boolean exception;
79      /**
80       * if an exception was thrown holds info on the error
81       */
82      private String   exceptiontext;
83      /**
84       * thread that gets the requested page and then runs the scrape on the page
85       */
86      private Page page;
87      /**
88       * the port to use for the proxy connection
89       */
90      private int pport =-1;
91      /**
92       * the proxy server to use for the connection
93       */
94      private String   pserver = null;
95      /**
96       * auth string for basic authentication to the proxy server username:password
97       */
98      private String   auth = null;
99      /**
100      * boolean value determines if the connection to to travel via a secure
101      * connection
102      */
103     private boolean ssl = false;
104     /**
105      * password to the client keystore for ssl client side authentication
106      */
107     private String   sslclientpass = null;
108     /**
109      * list of names extra headers to add
110      */
111     private ArrayList name = new ArrayList(10);
112     /**
113      * list of values of extra headers to add
114      */
115     private ArrayList value = new ArrayList(10);
116 
117     /**
118      * constructor for the class simply creates an instance of the PageData
119      * object
120      */
121     public PageData() {}
122 
123     /**
124      * method checks the static HashMap pageurls for the given url if it exists
125      * it is returned, otherwise a new pagedata object is created and added
126      * to the HashMap pageurls
127      *
128      * @param url  the url of the page given in the calling PageTag
129      * @param port  the port to use for the connection to the proxy server
130      * @param server  the proxy server
131      * @param secure  is the link going over https
132      * @param name  user name for authentication
133      * @param pass  password for authentication
134      *
135      * @return a PageData object either newly created or one already in
136      *         existence
137      */
138     public static PageData getPage(String   url, int port, String   server,
139                    String   name, String   pass) {
140          //public static PageData getPage(String url, int port, String server,
141      //   boolean ssl, String name, String pass, String sslpass) {
142     PageData pagedata;
143 
144     if((pagedata = (PageData)pageurls.get(url)) == null) {
145         synchronized(O) {
146             if(pagedata == null) {
147                 pagedata = new PageData(); // create new pagedata object
148             // set time of last scrape to current system time
149                 pagedata.setLastScrape(new Date().getTime());
150             // set the proxyport and proxy server for this connection
151             pagedata.setProxyPort(port);
152             pagedata.setProxyServer(server);
153             pagedata.setAuth(name, pass);
154             /*
155             // set secure for the connection
156             pagedata.setSSL(ssl);
157             // set the keystore password for client side authentication
158             pagedata.setClientPass(sslpass);
159             // if secure == true set system property so that https will be
160             // understood
161             if (ssl)
162             System.setProperty("java.protocol.handler.pkgs", 
163                        "com.sun.net.ssl.internal.www.protocol");
164             */
165                     // add pagedata object to static Hashmap
166                 pageurls.put(url, pagedata);
167         }
168         }
169     }
170     return pagedata;
171     }
172 
173     /**
174      * method to add an object to the HashMap scrapes, it first checks to see
175      * if the object already exists
176      *
177      * @param id  unique identifier of the scrape the following attributes
178      *            define
179      * @param begin  beginning anchor for the scrape refered to by id
180      * @param end  ending anchor for the scrape refered to by id
181      * @param anchors  boolean flag that determines if begin and end anchors are
182      *                 part of the result
183      * @param strip  boolean flag that determines if tags are to be striped from
184      *               the result
185      *
186      * @return a boolean value, true if a new scrape has been added else false
187      */
188     public final synchronized void setScrape(String   id, String   begin, 
189          String   end, String   anchors, String   strip) throws JspException {
190 
191     ScrapeData scrape;  // used to test for equality of ScrapeData objects
192     // boolean objects used to check if a flag value has been changed
193     Boolean   checkvalstrip;
194     Boolean   checkvalanchors;
195 
196     // check to see if this scrape already exists
197     if((scrape = (ScrapeData)scrapes.get(id)) == null) {
198         // scrape does not already exist
199         scrape = new ScrapeData(); // create new scrapedata object
200         // set the attributes of the new scrapedata object
201         scrape.setBegin(begin);
202         scrape.setEnd(end);
203         if (anchors != null)
204         scrape.setanchorsFlag(anchors);
205         if (strip != null)
206         scrape.setstripFlag(strip);
207 
208             scrapes.put(id, scrape);  // add scrape to HashMap scrapes
209         newflag = true; // a new scrape has been added to this page
210     } else if (((scrape.getBegin().compareTo(begin)) != 0) || 
211                    ((scrape.getEnd().compareTo(end)) != 0)) { 
212             // the two scrapedata objects are different     
213             throw new JspException ("scrape id " + id + " is already in use");
214     } else if (((scrape.getBegin().compareTo(begin)) == 0) || 
215                    ((scrape.getEnd().compareTo(end)) == 0)) {
216         // set check objects so that boolean can be compared to String
217         checkvalstrip = new Boolean  (strip);
218         checkvalanchors = new Boolean  (anchors);
219 
220         // scrape is old scrape check to see if any flags have been changed
221         if (scrape.getanchorsFlag() != checkvalanchors.booleanValue()) {
222             // reset anchors flag
223             if (anchors != null) {
224             scrape.setanchorsFlag(anchors);
225             // indicate change has been made to flags
226             changeflag = true;
227             }
228         }
229         if (scrape.getstripFlag() != checkvalstrip.booleanValue()) {
230             // reset strip flag
231             if (strip != null) {
232             scrape.setstripFlag(anchors);
233             // indicate change has been made to flags
234             changeflag = true;
235             }
236         }
237     }
238     }
239 
240 
241     /**
242      * getter method for a result string from a scrapedata object
243      *
244      * @param id  unique key for requested scrape
245      *
246      * @return results from scrapeidentified by id
247      *
248      */
249     public String   getResults(String   id) throws JspException {
250     ScrapeData scrape = (ScrapeData)scrapes.get(id);
251 
252         // error will occur if page or scrapeid in result tag does not exist
253     try {
254         return scrape.getResult();
255     } catch (NullPointerException   ne) {
256         throw new JspException
257                             ("page or scrapeid in result tag do not exist");
258     }
259     }
260 
261     /**
262      * set the name and value of any extra headers to be sent
263      *
264      * @param name   string that is the name of an extra header to be sent
265      * @param value  string that is the value of an extra header to be sent
266      */
267     protected final void setHeader(String   name, String   value) {
268         if (name == null) {
269             this.name = new ArrayList(5);
270             this.value = new ArrayList(5);
271     }
272     this.name.add(name);
273         this.value.add(value);
274     }
275 
276     /**
277      * get the http headers
278      *
279      * @return name   ArrayList of the names of http headers to be sent
280      * @return value  ArrayList or the values of http headers to be sent
281      *
282      */
283     public ArrayList getHeaders() {
284         if (name == null) {
285             return null;
286     } else {
287             ArrayList list = new ArrayList(2);
288             list.add(name);
289             list.add(value);
290             return list;
291     }
292     }
293 
294     /**
295      * setter method for newflag only called to set newflag to false
296      *
297      */
298     public void setNewflag() {
299     newflag = false;
300     }
301 
302     /**
303      * getter method for newflag
304      *
305      * @return boolean value of newflag true if new scrape exists else false
306      *
307      */
308     public boolean getNewFlag() {
309     return newflag;
310     }
311 
312     /**
313      * set the value of proxy port
314      *
315      * @param value  the proxy port to use for the connection as a String
316      *
317      */
318     public final void setProxyPort(int value) {
319     pport = value;
320     }
321 
322     /**
323      * get the value of the proxy port
324      *
325      * @return - int the proxy port number
326      *
327      */
328     public final int getProxyPort() {
329         return pport;
330     }
331 
332     /**
333      * set the value of proxy server
334      *
335      * @param value  the proxy server to use for the connection
336      *
337      */
338     public final void setProxyServer(String   value) {
339     pserver = value;
340     }
341 
342     /**
343      * get the value of the proxy port
344      *
345      * @return - String the name or the proxy server
346      *
347      */
348     public final String   getProxyServer() {
349         return pserver;
350     }
351    /**
352      * set the pass word to access the client keystore
353      *
354      * @param value  password to the client keystore
355      *
356      */
357     public final void setClientPass(String   value) {
358     sslclientpass = value;
359     }
360 
361     /**
362      * set the username and password values for authentication to the proxy server
363      *
364      * @param name  username
365      * @param pass  password
366      * @param base64  base64 encoded username and password
367      *
368      */
369     public final void setAuth(String   name, String   pass) {
370     if (name != null && pass != null)
371         auth = "Basic " + 
372            new BASE64Encoder().encode((name + ":" + pass).getBytes());
373     }
374 
375     /**
376      * get the base64 encoded auth string for proxy authorization
377      *
378      * @return - String base64 encoded authorization for the proxy server
379      *
380      */
381     public final String   getAuth() {
382         return auth;
383     }
384 
385     /**
386      * set the value of secure
387      *
388      * @param value  true if the connection is to be made via https the default 
389      *               is false
390      *
391      */
392     public final void setSSL(boolean value) {
393     ssl = value;
394     }
395 
396     /**
397      * get secure
398      *
399      * @return - boolean value of secure flag
400      */
401     public final boolean getSSL() {
402         return ssl;
403     }
404 
405     /**
406      * getter method for the key set of the HashMap scrapes
407      *
408      * @return the keys that the scrapes for this page are hashed to
409      *
410      */
411     public final Set getKeySet() {
412     return scrapes.keySet();
413     }
414 
415     /**
416      * setter method for lastscrape
417      *
418      * @param time  current time
419      *
420      */
421     public void setLastScrape(long time) {
422     lastscrape = time;
423     }
424 
425     /**
426      * getter method for lastscrape
427      *
428      * @param time  current time
429      *
430      */
431     public long getLastScrape() {
432     return lastscrape;
433     }
434 
435     /**
436      * sets the exception text to make the error in the jsp page easier for the
437      * author to find
438      *
439      * @param begin  the beginning marker for scrape where error occured
440      * @param end  the end marker for the scrape where error occured
441      *
442      */
443     public void setExceptionText(String   begin, String   end) {
444         exceptiontext = new String  
445                     ("there is a syntax error in " + begin + " or " + end + 
446        " for the scrape. A character probably needs to be escaped for perl\n"
447        + " See docs for help if you don't know perl");
448     }
449 
450     /**
451      * set exception to true, a malformedpatternexception has been thrown in
452      * page
453      *
454      */
455     public void setException() {
456     exception = true;
457     }
458 
459     /**
460      * getter method for changeflag
461      *
462      * @return boolean  value of changeflag true if a scrape has been changed
463      *                  else false
464      *
465      */
466     public boolean getChangeFlag() {
467     return changeflag;
468     }
469 
470     /**
471      * getter method for a single scrapedata object from HashMap scrapes
472      *
473      * @param key  the value the requested scrapedata object is keyed to
474      *
475      * @return the requested scrapedata boject
476      *
477      */
478     public ScrapeData getScrape(String   key) {
479     return (ScrapeData)scrapes.get(key);
480     }
481 
482     /**
483      * setter method for pagecontext
484      *
485      * @param page  the page context object for this page
486      *
487      */
488     public void setPageContext(PageContext page) {
489     pagecontext = page;
490     }
491 
492     /**
493      * getter method for pagecontext
494      *
495      * returns the PageContext object for this page
496      *
497      */
498     public PageContext getPageContext() {
499     return pagecontext;
500     }
501 
502     /**
503      * checks the scrapeint, if enough time has passed it starts the getPage
504      * thread to go out and get the page, and sets scrapeing to true, then if
505      * newflag it will wait for the thread to finish running, otherwise it will
506      * just fall through and return the already stored results
507      *
508      * @param url  url of the page to be scraped
509      * @param time  length of time to wait before rescrape
510      * @param proxy  boolean value that says whether or not to use a proxy server
511      * @param pc  PageContext for this JSP page
512      * @param cs charset to be used to scrape the page
513      *
514      */
515    public void scrapePage(String   url, long time, PageContext pc, String   cs)
516      throws JspException {
517     long currenttime = new Date().getTime();  // get the current time
518 
519     // check to see if a scrape is needed
520     if (((currenttime - lastscrape) > time) || newflag || changeflag) { 
521         // if it is time to rescrape but scraping did not get reset, reset it
522         if (scraping.booleanValue() && !page.isAlive()) {
523         scraping = new Boolean  (false);
524         }
525         if (!scraping.booleanValue()) {
526                 // if a scrape is in progress wait until scrape is finished
527             synchronized (scraping) {
528                 if ((page == null) || !page.isAlive()) {
529                         // create thread page if it doesn't exist check for a
530             // proxy connection
531             try {
532                             page = new Page(url, this, pc, cs);
533                 /*if (pport != -1 && pserver != null)
534                 page = new Page(url, this, pc, pport, pserver, auth);
535              //page = new Page(url, this, pc, pport, pserver, ssl, auth);
536                            else
537                 page = new Page(url, this, pc);
538                 //page = new Page(url, this, pc, ssl);
539                             */
540             } catch (MalformedURLException mue) {
541                 pc.getServletContext().log("PageData.scrapePage(): " 
542                                + mue.getMessage());
543             }
544             }
545                 if ((((currenttime - lastscrape) > time) || newflag ||
546             changeflag) && page != null) {
547             // set scraping flag 
548                 scraping = new Boolean  (true);
549                         page.start();
550                 }
551                 }
552         }
553     }
554 
555     // reset the scraping flag to false if the page thread is not running but
556     // the scraping flag is true
557     if (scraping.booleanValue() && !page.isAlive()) {
558         scraping = new Boolean  (false);
559     }
560 
561         if (scraping.booleanValue() && (newflag || changeflag) && (page != null)) {
562         try {
563             page.join();    // wait for scrape to finish
564         changeflag = false; // reset changeflag
565         scraping = new Boolean  (false); // done scraping reset flag
566         } catch (InterruptedException   ie) {
567         // exception shouldn't happen if it does log it to the server
568             pc.getServletContext().
569                                log("PageData.scrapePage(): Page thread interrupted " 
570                    + ie.toString());
571         }
572     }
573     // check to see if a MalformedPatternException was thrown if so throw
574     // JspException for JSP page builder
575     if (exception) {
576         exception = false; // reset exception flag
577         throw new JspException(exceptiontext);
578     }
579     }
580 }
581 
582  /**
583   * Create an http request for the specified URL, check to see if time has
584   * elapsed, if so get page, check last modified header of page, and if
585   * necessary make the request
586   *
587   */
588 class Page extends Thread   {
589 
590     private HttpConnection connection; // object to create an http request
591     private long lastmodified;          // time the page was last modified
592     private long expires;             // http header = time the page expires
593     private URL url;                // url from the page to be scraped
594     private PageData pagedata;    // pagedata object that holds data on this url
595     // char array to hold the source page from the http request
596     private char source[];
597     // max size of the buffer that the http request is read into
598     private final long MAX_BUFFER_SIZE = 50000;
599     // pagecontext that the servlet resides in, used for logging to the server
600     private PageContext pageContext;
601     // value determines if a proxy server is to be used for the http connection
602     private boolean proxy = false;
603     // the port to use for the proxy connection
604     private int pport = -1;
605     // the proxy server to use for the connection
606     private String   pserver = null;
607     // authentication string for authentication to the proxy server name:password
608     private String   authstring = null;
609     // boolean value determines if the connection to to travel via a secure
610     // connection
611     private boolean ssl = false;
612     // charset to be used to scrape the page
613     private String   charset = null;
614 
/**
 * Constructor for Page used for a connection without proxy settings.
 * A url without a file part gets "/" appended so that the front page can
 * be accessed.
 *
 * @param url  the URL of the page to get scraped
 * @param page  PageData object for the page to get scraped
 * @param pc  PageContext the taglibrary is running in used for logging
 * @param cs charset to be used to scrape the page
 *
 * @throws MalformedURLException - if url cannot be parsed as a URL
 *
 */
Page(String url, PageData page, PageContext pc, String cs)
        throws MalformedURLException {
    URL target = new URL(url);
    if (target.getFile().length() == 0) {
        // no file part, request the front page
        target = new URL(url + "/");
    }
    this.url = target;
    this.pagedata = page;
    this.pageContext = pc;
    this.charset = cs;
}
640 
/**
 * Constructor for Page used for a connection through a proxy server.
 * A url without a file part gets "/" appended so that the front page can
 * be accessed. The charset is not set by this constructor.
 *
 * @param url  the URL of the page to get scraped
 * @param page  PageData object for the page to get scraped
 * @param pc  PageContext the taglibrary is running in used for logging
 * @param port  the port to connect to on the proxy server
 * @param server  the proxy server to connect to
 * @param proxyauth  authentication string for the proxy server
 *
 * @throws MalformedURLException - if url cannot be parsed as a URL
 *
 */
Page(String url, PageData page, PageContext pc, int port, String server,
     String proxyauth) throws MalformedURLException {
    URL target = new URL(url);
    if (target.getFile().length() == 0) {
        // no file part, request the front page
        target = new URL(url + "/");
    }
    this.url = target;
    this.pagedata = page;
    this.pageContext = pc;
    this.pport = port;
    this.pserver = server;
    this.authstring = proxyauth;
    this.proxy = true;
}
672 
673     public void run() {
674         long current = new Date().getTime();  // get current time
675 
676         // make http connection to url
677          try {
678          // create new HttpUrlConnection
679              connection = new HttpConnection(url, pagedata, pageContext);
680          /*if (!proxy)
681          //connection = new HttpConnection(url, ssl, pageContext);
682          connection = new HttpConnection(url, pageContext);
683              else
684          //new HttpConnection(url, pport, pserver, ssl, authstring);
685          connection = new HttpConnection(url, pport, pserver, 
686          authstring, pageContext);*/
687          connection.setRequestMethod("HEAD");
688          connection.connect();
689          connection.sendRequest();
690 
691          // check response status code a code of 200 is a successful
692              // connection
693          if (connection.getResponseCode() >= 300) {
694          pageContext.getServletContext().
695                    log("Page.run(): Error Occured: " 
696                + connection.getResponseMessage());
697          } else {
698          // get expires header
699          if ((expires =(long)connection.getExpiration()) == 0)
700              // do this if header does not exist
701              expires = current - 1;
702 
703              // check for a new scrape for this page or that the Expires
704          // time for the page has passed
705                  if((expires < current) || pagedata.getNewFlag() || 
706             pagedata.getChangeFlag()) {
707 
708              // get lastmodified header
709              // getLastModified returns 0 if header does not exist
710              if ((lastmodified = (long)connection.getLastModified()) == 0)
711              // do this if header does not exist
712              lastmodified = pagedata.getLastScrape() + 1;
713 
714              // check for a new scrape for this page or that Last-
715              // Modified time for the page has passed
716                  if ((pagedata.getLastScrape() < lastmodified) || 
717                             pagedata.getNewFlag() || pagedata.getChangeFlag()) {
718 
719              // dissconnect so a new connection for a GET can be made
720              connection.disconnect();
721 
722              // set lastscrape
723              pagedata.setLastScrape(current);
724 
725              // set the request method to get
726              connection.setRequestMethod("GET");
727              // make the connection
728              connection.connect();
729              connection.sendRequest();
730 
731              // check responce code from connection
732              if (connection.getResponseCode() >= 300) {
733                      pageContext.getServletContext().
734                                 log("Page.run(): Error Occured: " +
735                         connection.getResponseMessage());
736                  // the connection did not occur return cached data
737                  return;
738              }
739 
740              // read http request into buffer return value is false
741              // if an error occured
742              if (streamtochararray(connection.getInputStream(),charset)) {
743                  // perform the scrapes on this page
744                      scrape();
745              }
746              // close the connection
747              connection.disconnect();
748              }
749              }
750          }
751      } catch (IOException ee) {
752          pageContext.getServletContext().
753              log("Page.run(): " + ee.toString());
754          }
755      }
756 
757     /**
758      * Helper routine to read the input stream into a char array
759      *
760      * @param in  The inputstream from the Http request
761      *
762      * @return a value of true if no error occured in reading the input stream
763      *         otherwise false
764      *
765      */
766     private boolean streamtochararray(InputStream in, String   charset) {
767         long sourcelength = 50000; // length of buffer inputstream is read into
768     StringBuffer   temp; // buffer used to chop unused portion of source
769     boolean returnvalue = true;  // no error in reading from input stream
770     // create a char stream from a byte stream
771     InputStreamReader input = null;
772     if ( charset == null ) {
773       input = new InputStreamReader(in);
774     } else {
775       try {
776         input = new InputStreamReader(in, charset);
777       } catch( UnsupportedEncodingException exc ) {
778         System.err.println( "WARNING: unsupported charset " + charset + ". Using default." );
779         input = new InputStreamReader(in);
780       }
781     }
782     boolean chop = false; // flag tells whether or not to truncate buffer
783     int offset = 0; // offset in the input stream to start reading from
784     int num; // number of chars read from the input stream
785 
786 
787         sourcelength = (long)connection.getHeaderFieldInt("Content-Length",
788                    (int)MAX_BUFFER_SIZE);
789 
790         // check that sourcelength is not greater than max allowed or 0
791         if ((sourcelength > MAX_BUFFER_SIZE)) {
792         sourcelength = MAX_BUFFER_SIZE;
793     }
794     source = new char[(int)sourcelength];
795 
796     // flag marks if the inputstream was read at least once. it may contain
797     // enough to scrape
798     boolean check = false;
799         try {   // read the input stream into the buffer
800             while((num = input.read(source, offset,
801                           (int)(sourcelength - offset))) > 0) {
802         offset += num;
803         check = true;
804             }
805     // error occured in reading input stream set return value to false
806     } catch (IOException e) {
807         if (!check)
808         returnvalue = false;
809         pageContext.getServletContext().log("Page.streamtochararray(): Error " +
810                                                 "ocured while reading the " + 
811                         "inputstream " + e.toString());
812     }
813 
814     if (chop) {
815         // truncate any extra buffer space if it wasn't needed
816             temp = new StringBuffer  ().append(source);
817             source = new char[temp.length() + 1]; // create new buffer
818             temp.getChars(0, temp.length(), source,0); // fill new buffer
819     }
820         return returnvalue;
821     }
822 
823     /**
824      * Using regular expressions, parse the source from the http request for a
825      * string specified by the delimiter strings obtained from the Scrape tag
826      * then check flags and store the results
827      *
828      */
829   public void scrape() {
830 
831     // object to compile regular expressions
832     Perl5Compiler compiler = new Perl5Compiler(); 
833     // object to match compiled regular expressions
834     Perl5Matcher matcher = new Perl5Matcher();  
835     Perl5Pattern pattern = null;  // pattern to be compiled into regex
836     // used to perserve state across calls to contains() method
837     MatchResult result;  // class for accessing results of pattern match
838     // matcher class used to preserve position across calls to contains
839     PatternMatcherInput input;
840     String   match;  // string value of result for dropping of end markers
841     ScrapeData sd;  // data object that holds data on current scrape
842     // set of keys for the hashmap scrapes
843     Set scrapedatakeys = pagedata.getKeySet();
844     // iterator for scrapedatakeys
845     Iterator scrapesit = scrapedatakeys.iterator();
846     Iterator scrapesit1 = scrapedatakeys.iterator();
847     // String variable that will become the regular expression
848     String   regex = new String  ();
849 
850     // iterate through the scrapedata objects and perform a scrape for each one
851     while(scrapesit.hasNext()) {
852 
853         // get next item from HashMap scrapedata
854         sd = pagedata.getScrape((String  )scrapesit.next());
855 
856         // build the regular expression
857         regex = regex.concat(sd.getBegin().concat(".*?").concat(sd.getEnd()));
858 
859         //attempt to compile the pattern try to catch MalformedPatternException
860         try {
861            // compile pattern with singleline_mask
862             pattern = (Perl5Pattern)compiler.compile(regex,
863                                Perl5Compiler.SINGLELINE_MASK);
864         } catch (MalformedPatternException e) {
865         // if exception occurs store it to be kicked out later otherwise
866         // could not get out of run()
867         pagedata.setException();
868         pagedata.setExceptionText(sd.getBegin(), sd.getEnd());
869         }
870 
871         matcher.contains(source, pattern); // attempt to match the regex
872 
873         result = matcher.getMatch(); // get returned match
874 
875         if(result != null) {
876             // get the result to a String value from a string
877             StringBuffer   matchbuffer = new StringBuffer  ();
878             matchbuffer.append(result.toString());
879             match = new String  (matchbuffer);
880 
881         // default value is false and begin and end anchors are not part of
882         // the scrape
883             if(!sd.getanchorsFlag()) {
884         // chop begin and end marker from final result
885                 match = match.substring(sd.getBegin().length(), 
886                                          match.lastIndexOf(sd.getEnd()));
887             }
888 
889         // if stripflag remove tags from result string
890             if(sd.getstripFlag()) {
891                 regex = ">.*?<";
892                 // string that is built as tags are removed from result
893                 String   finalresult = new String  ();
894                 // attempt to compile the pattern try to catch 
895                 //MalformedPatternException
896                 try {
897                     // compile pattern with singleline_mask
898                     pattern = (Perl5Pattern)compiler.compile(regex,
899                                Perl5Compiler.SINGLELINE_MASK);
900                 } catch (MalformedPatternException e) {
901             // an error will never occur here since the regex is
902             // predefined
903                 }
904 
905         // if there is text before the first tag add it to the
906         // final result
907         if ((match.indexOf('<') < match.indexOf('>')) && 
908                                             (match.indexOf('<') != 0))
909             finalresult = match.substring(0, match.indexOf('<')).
910                                                            concat(" ");
911 
912         // set patternmatcherinput so multiple matches can be made
913         input = new PatternMatcherInput(match);
914 
915                 // loop until no tags are left in result string
916                 while(matcher.contains(input, pattern)) {
917 
918             matchbuffer.setLength(0); // reset matchbuffer for reuse
919 
920                     result = matcher.getMatch(); // get the result
921             // add current result to to final result string
922                     matchbuffer.append(result.toString());
923                     match = new String  (matchbuffer);
924 
925             // check to see that match is not a match on ><
926             if (match.length() > 2) {
927                         finalresult = finalresult.concat(match.substring(1,
928                                            match.indexOf('<'))).concat(" ");
929             }
930             }
931 
932         // reset match to original string so that any trailing text can
933         // be added to final result
934         match = input.toString();
935 
936         // if there is trailing text outside of tags add to the result
937         if (match.lastIndexOf('>') > match.lastIndexOf('<'))
938             finalresult = finalresult.concat(match.substring(
939                                   match.lastIndexOf('>') + 1, match.length()));
940 
941             match = finalresult; // set final result of scrape to match
942 
943         }
944             sd.setResult(match); // set scrape results in scrapedata object
945         pagedata.setNewflag(); // successful scrape set newflag to false
946         } else {
947             sd.setResult("");
948         }
949     regex = ""; // clear regex for use again
950     match = null;  // clear match for reuse
951     }
952   }
953 }
954
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags