SiteCapturer


1   // HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
2   // http://sourceforge.org/projects/htmlparser
3   // Copyright (C) 2003 Derrick Oswald
4   //
5   // Revision Control Information
6   //
7   // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/SiteCapturer.java,v $
8   // $Author: derrickoswald $
9   // $Date: 2005/02/14 23:49:24 $
10  // $Revision: 1.7 $
11  //
12  // This library is free software; you can redistribute it and/or
13  // modify it under the terms of the GNU Lesser General Public
14  // License as published by the Free Software Foundation; either
15  // version 2.1 of the License, or (at your option) any later version.
16  //
17  // This library is distributed in the hope that it will be useful,
18  // but WITHOUT ANY WARRANTY; without even the implied warranty of
19  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  // Lesser General Public License for more details.
21  //
22  // You should have received a copy of the GNU Lesser General Public
23  // License along with this library; if not, write to the Free Software
24  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25  //
26  
27  package org.htmlparser.parserapplications;
28  
29  import java.io.File  ;
30  import java.io.FileNotFoundException  ;
31  import java.io.FileOutputStream  ;
32  import java.io.IOException  ;
33  import java.io.InputStream  ;
34  import java.io.PrintWriter  ;
35  import java.net.MalformedURLException  ;
36  import java.net.URL  ;
37  import java.net.URLConnection  ;
38  import java.util.ArrayList  ;
39  import java.util.HashSet  ;
40  import javax.swing.JFileChooser  ;
41  import javax.swing.JOptionPane  ;
42  
43  import org.htmlparser.NodeFilter;
44  import org.htmlparser.Parser;
45  import org.htmlparser.PrototypicalNodeFactory;
46  import org.htmlparser.filters.AndFilter;
47  import org.htmlparser.filters.HasAttributeFilter;
48  import org.htmlparser.filters.NodeClassFilter;
49  import org.htmlparser.tags.BaseHrefTag;
50  import org.htmlparser.tags.FrameTag;
51  import org.htmlparser.tags.ImageTag;
52  import org.htmlparser.tags.LinkTag;
53  import org.htmlparser.tags.MetaTag;
54  import org.htmlparser.util.EncodingChangeException;
55  import org.htmlparser.util.NodeIterator;
56  import org.htmlparser.util.NodeList;
57  import org.htmlparser.util.ParserException;
58  
59  /**
60   * Save a web site locally.
61   * Illustrative program to save a web site contents locally.
62   * It was created to demonstrate URL rewriting in its simplest form.
63   * It uses customized tags in the NodeFactory to alter the URLs.
64   * This program has a number of limitations:
65   * <ul>
66   * <li>it doesn't capture forms, this would involve too many assumptions</li>
67   * <li>it doesn't capture script references, so funky onMouseOver and other
68   * non-static content will not be faithfully reproduced</li>
69   * <li>it doesn't handle style sheets</li>
70   * <li>it doesn't dig into attributes that might reference resources, so
71   * for example, background images won't necessarily be captured</li>
72   * <li>worst of all, it gets confused when a URL both has content and is
73   * the prefix for other content,
74   * i.e. http://whatever.com/top and http://whatever.com/top/sub.html both
75   * yield content, since this cannot be faithfully replicated to a static
76   * directory structure (this happens a lot with servlet based sites)</li>
77   *</ul>
78   */
79  public class SiteCapturer
80  {
81      /**
82       * The web site to capture.
83       * This is used as the base URL in deciding whether to adjust a link
84       * and whether to capture a page or not.
85       */
86      protected String   mSource;
87  
88      /**
89       * The local directory to capture to.
90       * This is used as a base prefix for files saved locally.
91       */
92      protected String   mTarget;
93  
94      /**
95       * The list of pages to capture.
96       * Links are added to this list as they are discovered, and removed in
97       * sequential order (FIFO queue) leading to a breadth
98       * first traversal of the web site space.
99       */
100     protected ArrayList   mPages;
101 
102     /**
103      * The set of pages already captured.
104      * Used to avoid repeated acquisition of the same page.
105      */
106     protected HashSet   mFinished;
107 
108     /**
109      * The list of resources to copy.
110      * Images and other resources are added to this list as they are discovered.
111      */
112     protected ArrayList   mImages;
113 
114     /**
115      * The set of resources already copied.
116      * Used to avoid repeated acquisition of the same images and other resources.
117      */
118     protected HashSet   mCopied;
119 
120     /**
121      * The parser to use for processing.
122      */
123     protected Parser mParser;
124 
125     /**
126      * If <code>true</code>, save resources locally too,
127      * otherwise, leave resource links pointing to original page.
128      */
129     protected boolean mCaptureResources;
130 
131     /**
132      * The filter to apply to the nodes retrieved.
133      */
134     protected NodeFilter mFilter;
135 
136     /**
137      * Copy buffer size.
138      * Resources are moved to disk in chunks this size or less.
139      */
140     protected final int TRANSFER_SIZE = 4096;
141 
142     /**
143      * Create a web site capturer.
144      */
145     public SiteCapturer ()
146     {
147         PrototypicalNodeFactory factory;
148 
149         mSource = null;
150         mTarget = null;
151         mPages = new ArrayList   ();
152         mFinished = new HashSet   ();
153         mImages = new ArrayList   ();
154         mCopied = new HashSet   ();
155         mParser = new Parser ();
156         factory = new PrototypicalNodeFactory ();
157         factory.registerTag (new LocalLinkTag ());
158         factory.registerTag (new LocalFrameTag ());
159         factory.registerTag (new LocalBaseHrefTag ());
160         factory.registerTag (new LocalImageTag ());
161         mParser.setNodeFactory (factory);
162         mCaptureResources = true;
163         mFilter = null;
164     }
165 
166     /**
167      * Getter for property source.
168      * @return Value of property source.
169      */
170     public String   getSource ()
171     {
172         return (mSource);
173     }
174     
175     /**
176      * Setter for property source.
177      * This is the base URL to capture. URL's that don't start with this prefix
178      * are ignored (left as is), while the ones with this URL as a base are
179      * re-homed to the local target.
180      * @param source New value of property source.
181      */
182     public void setSource (String   source)
183     {
184         if (source.endsWith ("/"))
185             source = source.substring (0, source.length () - 1);
186         mSource = source;
187     }
188     
189     /**
190      * Getter for property target.
191      * @return Value of property target.
192      */
193     public String   getTarget ()
194     {
195         return (mTarget);
196     }
197     
198     /**
199      * Setter for property target.
200      * This is the local directory under which to save the site's pages.
201      * @param target New value of property target.
202      */
203     public void setTarget (String   target)
204     {
205         mTarget = target;
206     }
207 
208     /**
209      * Getter for property captureResources.
210      * If <code>true</code>, the images and other resources referenced by
211      * the site and within the base URL tree are also copied locally to the
212      * target directory. If <code>false</code>, the image links are left 'as
213      * is', still refering to the original site.
214      * @return Value of property captureResources.
215      */
216     public boolean getCaptureResources ()
217     {
218         return (mCaptureResources);
219     }
220     
221     /**
222      * Setter for property captureResources.
223      * @param capture New value of property captureResources.
224      */
225     public void setCaptureResources (boolean capture)
226     {
227         mCaptureResources = capture;
228     }
229     
230     
231     /** Getter for property filter.
232      * @return Value of property filter.
233      *
234      */
235     public NodeFilter getFilter ()
236     {
237         return (mFilter);
238     }
239     
240     /** Setter for property filter.
241      * @param filter New value of property filter.
242      *
243      */
244     public void setFilter (NodeFilter filter)
245     {
246         mFilter = filter;
247     }
248     
249     /**
250      * Returns <code>true</code> if the link is one we are interested in.
251      * @param link The link to be checked.
252      * @return <code>true</code> if the link has the source URL as a prefix
253      * and doesn't contain '?' or '#'; the former because we won't be able to
254      * handle server side queries in the static target directory structure and
255      * the latter because presumably the full page with that reference has
256      * already been captured previously. This performs a case insensitive
257      * comparison, which is cheating really, but it's cheap.
258      */
259     protected boolean isToBeCaptured (String   link)
260     {
261         return (
262             link.toLowerCase ().startsWith (getSource ().toLowerCase ())
263             && (-1 == link.indexOf ("?"))
264             && (-1 == link.indexOf ("#")));
265     }
266 
267     /**
268      * Returns <code>true</code> if the link contains text/html content.
269      * @return <code>true</code> if the HTTP header indicates the type is
270      * "text/html".
271      */
272     protected boolean isHtml (String   link)
273         throws
274             ParserException
275     {
276         URL   url;
277         URLConnection   connection;
278         String   type;
279         boolean ret;
280 
281         ret = false;
282         try
283         {
284             url = new URL   (link);
285             connection = url.openConnection ();
286             type = connection.getContentType ();
287             if (type == null)
288                 ret = false;
289             else
290                 ret = type.startsWith ("text/html");
291         }
292         catch (Exception   e)
293         {
294             throw new ParserException ("URL " + link + " has a problem", e);
295         }
296         
297         return (ret);
298     }
299 
300     /**
301      * Converts a link to local.
302      * A relative link can be used to construct both a URL and a file name.
303      * Basically, the operation is to strip off the base url, if any,
304      * and then prepend as many dot-dots as necessary to make
305      * it relative to the current page.
306      * A bit of a kludge handles the root page specially by calling it
307      * index.html, even though that probably isn't it's real file name.
308      * This isn't pretty, but it works for me.
309      * @param link The link to make relative.
310      * @param current The current page URL, or empty if it's an absolute URL
311      * that needs to be converted.
312      * @return The URL relative to the current page.
313      */
314     protected String   makeLocalLink (String   link, String   current)
315     {
316         int i;
317         int j;
318         String   ret;
319 
320         if (link.equals (getSource ()) || (!getSource ().endsWith ("/") && link.equals (getSource () + "/")))
321             ret = "index.html"; // handle the root page specially
322         else if (link.startsWith (getSource ())
323                 && (link.length () > getSource ().length ()))
324             ret = link.substring (getSource ().length () + 1);
325         else
326             ret = link; // give up
327             
328         // make it relative to the current page by prepending "../" for
329         // each '/' in the current local path
330         if ((null != current)
331             && link.startsWith (getSource ())
332             && (current.length () > getSource ().length ()))
333         {
334             current = current.substring (getSource ().length () + 1);
335             i = 0;
336             while (-1 != (j = current.indexOf ('/', i)))
337             {
338                 ret = "../" + ret;
339                 i = j + 1;
340             }
341         }
342 
343         return (ret);
344     }
345 
346     /**
347      * Copy a resource (image) locally.
348      * Removes one element from the 'to be copied' list and saves the
349      * resource it points to locally as a file.
350      */
351     protected void copy ()
352     {
353         String   link;
354         File   file;
355         File   dir;
356         URL   source;
357         byte[] data;
358         InputStream   in;
359         FileOutputStream   out;
360         int read;
361 
362         link = (String  )mImages.remove (0);
363         mCopied.add (link);
364 
365         if (getCaptureResources ())
366         {
367             file = new File   (getTarget (), makeLocalLink (link, ""));
368             System.out.println ("copying " + link + " to " + file.getAbsolutePath ());
369             // ensure directory exists
370             dir = file.getParentFile ();
371             if (!dir.exists ())
372                 dir.mkdirs ();
373             try
374             {
375                 source = new URL   (link);
376                 data = new byte [TRANSFER_SIZE];
377                 try
378                 {
379                     in = source.openStream ();
380                     try
381                     {
382                         out = new FileOutputStream   (file);
383                         try
384                         {
385                             while (-1 != (read = in.read (data, 0, data.length)))
386                                 out.write (data, 0, read);
387                         }
388                         finally
389                         {
390                             out.close ();
391                         }
392                     }
393                     catch (FileNotFoundException   fnfe)
394                     {
395                         fnfe.printStackTrace ();
396                     }
397                     finally
398                     {
399                         in.close ();
400                     }
401                 }
402                 catch (FileNotFoundException   fnfe)
403                 {
404                     System.err.println ("broken link " + fnfe.getMessage () + " ignored");
405                 }
406             }
407             catch (MalformedURLException   murle)
408             {
409                 murle.printStackTrace ();
410             }
411             catch (IOException   ioe)
412             {
413                 ioe.printStackTrace ();
414             }
415         }
416     }
417  
418     /**
419      * Process a single page.
420      */
421     protected void process (NodeFilter filter)
422         throws
423             ParserException
424     {
425         String   url;
426         int bookmark;
427         NodeList list;
428         NodeList robots;
429         MetaTag robot;
430         String   content;
431         File   file;
432         File   dir;
433         PrintWriter   out;
434 
435         // get the next URL and add it to the done pile
436         url = (String  )mPages.remove (0);
437         System.out.println ("processing " + url);
438         mFinished.add (url);
439 
440         try
441         {
442             bookmark = mPages.size ();
443             // fetch the page and gather the list of nodes
444             mParser.setURL (url);
445             try
446             {
447                 list = new NodeList ();
448                 for (NodeIterator e = mParser.elements (); e.hasMoreNodes (); )
449                     list.add (e.nextNode ()); // URL conversion occurs in the tags
450             }
451             catch (EncodingChangeException ece)
452             {
453                 // fix bug #998195 SiteCatpurer just crashed
454                 // try again with the encoding now set correctly
455                 // hopefully mPages, mImages, mCopied and mFinished won't be corrupted
456                 mParser.reset ();
457                 list = new NodeList ();
458                 for (NodeIterator e = mParser.elements (); e.hasMoreNodes (); )
459                     list.add (e.nextNode ());
460             }
461 
462             // handle robots meta tag according to http://www.robotstxt.org/wc/meta-user.html
463             // <meta name="robots" content="index,follow" />
464             // <meta name="robots" content="noindex,nofollow" />
465             robots = list.extractAllNodesThatMatch (
466                 new AndFilter (
467                     new NodeClassFilter (MetaTag.class),
468                     new HasAttributeFilter ("name", "robots")), true);
469             if (0 != robots.size ())
470             {
471                 robot = (MetaTag)robots.elementAt (0);
472                 content = robot.getAttribute ("content").toLowerCase ();
473                 if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("nofollow")))
474                     // reset mPages
475                     for (int i = bookmark; i < mPages.size (); i++)
476                         mPages.remove (i);
477                 if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("noindex")))
478                     return;
479             }
480     
481             if (null != filter)
482                 list.keepAllNodesThatMatch (filter, true);
483 
484             // save the page locally
485             file = new File   (getTarget (), makeLocalLink (url, ""));
486             dir = file.getParentFile ();
487             if (!dir.exists ())
488                 dir.mkdirs ();
489             else if (!dir.isDirectory ())
490             {
491                 dir = new File   (dir.getParentFile (), dir.getName () + ".content");
492                 if (!dir.exists ())
493                     dir.mkdirs ();
494                 file = new File   (dir, file.getName ());
495             }
496                 
497             try
498             {
499                 out = new PrintWriter   (new FileOutputStream   (file));
500                 for (int i = 0; i < list.size (); i++)
501                     out.print (list.elementAt (i).toHtml ());
502                 out.close ();
503             }
504             catch (FileNotFoundException   fnfe)
505             {
506                 fnfe.printStackTrace ();
507             }
508         }
509         catch (ParserException pe)
510         {
511             String   message;
512             
513             // this exception handling is suboptimal,
514             // but it recognizes resources that aren't text/html
515             message = pe.getMessage ();
516             if ((null != message) && (message.endsWith ("does not contain text")))
517             {
518                 if (!mCopied.contains (url))
519                     if (!mImages.contains (url))
520                         mImages.add (url);
521                 mFinished.remove (url);
522             }
523             else
524                 throw pe;
525         }
526     }
527 
528     /**
529      * Link tag that rewrites the HREF.
530      * The HREF is changed to a local target if it matches the source.
531      */
532     class LocalLinkTag extends LinkTag
533     {
534         public void doSemanticAction ()
535             throws
536                 ParserException
537         {
538             boolean html;
539             String   link;
540 
541             // get the link
542             link = getLink ();
543             // check if it needs to be captured
544             if (isToBeCaptured (link))
545             {
546                 // add the link to a list to be processed
547                 if (mFinished.contains (link))
548                     html = true;
549                 else if (mPages.contains (link))
550                     html = true;
551                 else if (mCopied.contains (link))
552                     html = false;
553                 else if (mImages.contains (link))
554                     html = false;
555                 else
556                 {   // this test is expensive, do it reluctantly
557                     html = isHtml (link);
558                     if (html)
559                         mPages.add (link);
560                     else
561                         mImages.add (link);
562                 }
563                 // alter the link
564                 if (html || (!html && getCaptureResources ()))
565                     link = makeLocalLink (link, mParser.getLexer ().getPage ().getUrl ());
566                 setLink (link);
567             }
568         }
569     }
570 
571     /**
572      * Frame tag that rewrites the SRC URLs.
573      * The SRC URLs are mapped to local targets if they match the source.
574      */
575     class LocalFrameTag extends FrameTag
576     {
577         public void doSemanticAction ()
578             throws
579                 ParserException
580         {
581             boolean html;
582             String   link;
583 
584             // get the link
585             link = getFrameLocation ();
586             // check if it needs to be captured
587             if (isToBeCaptured (link))
588             {
589                 // add the link to a list to be processed
590                 if (mFinished.contains (link))
591                     html = true;
592                 else if (mPages.contains (link))
593                     html = true;
594                 else if (mCopied.contains (link))
595                     html = false;
596                 else if (mImages.contains (link))
597                     html = false;
598                 else
599                 {   // this test is expensive, do it reluctantly
600                     html = isHtml (link);
601                     if (html)
602                         mPages.add (link);
603                     else
604                         mImages.add (link);
605                 }
606                 // alter the link
607                 if (html || (!html && getCaptureResources ()))
608                     link = makeLocalLink (link, mParser.getLexer ().getPage ().getUrl ());
609                 setFrameLocation (link);
610             }
611         }
612     }
613 
614     /**
615      * Image tag that rewrites the SRC URL.
616      * If resources are being captured the SRC is mapped to a local target if
617      * it matches the source, otherwise it is convered to a full URL to point
618      * back to the original site.
619      */
620     class LocalImageTag extends ImageTag
621     {
622         public void doSemanticAction ()
623             throws
624                 ParserException
625         {
626             String   image;
627             
628             // get the image url
629             image = getImageURL ();
630             // check if it needs to be captured
631             if (isToBeCaptured (image))
632             {   // add the image to the list needing to be copied
633                 if (!mCopied.contains (image))
634                     if (!mImages.contains (image))
635                         mImages.add (image);
636                 if (getCaptureResources ())
637                     image = makeLocalLink (image, mParser.getLexer ().getPage ().getUrl ());
638                 // alter the link
639                 setImageURL (image);
640             }
641         }
642     }
643 
644     /**
645      * Base tag that doesn't show.
646      * The toHtml() method is overridden to return an empty string,
647      * effectively shutting off the base reference.
648      */
649     class LocalBaseHrefTag extends BaseHrefTag
650     {
651         // we don't want to have a base pointing back at the source page
652         public String   toHtml ()
653         {
654             return ("");
655         }
656     }
657 
658     /**
659      * Perform the capture.
660      */
661     public void capture ()
662     {
663        
664         mPages.clear ();
665         mPages.add (getSource ());
666         while (0 != mPages.size ())
667             try
668             {
669                 process (getFilter ());
670                 while (0 != mImages.size ())
671                     copy ();
672             }
673             catch (ParserException pe)
674             {   // this exception handling is suboptimal,
675                 // but it messages correctly about broken links
676                 Throwable   throwable;
677                 
678                 throwable = pe.getThrowable ();
679                 if (null != throwable)
680                 {
681                     throwable = throwable.getCause ();
682                     if (throwable instanceof FileNotFoundException  )
683                         System.err.println ("broken link " + ((FileNotFoundException  )throwable).getMessage () + " ignored");
684                     else
685                         pe.printStackTrace ();
686                 }
687                 else
688                     pe.printStackTrace ();
689             }
690     }
691 
692     /**
693      * Mainline to capture a web site locally.
694      * @param args The command line arguments.
695      * There are three arguments the web site to capture, the local directory
696      * to save it to, and a flag (true or false) to indicate whether resources
697      * such as images and video are to be captured as well.
698      * These are requested via dialog boxes if not supplied.
699      */
700     public static void main (String  [] args)
701         throws
702             MalformedURLException  ,
703             IOException  
704     {
705         SiteCapturer worker;
706         String   url;
707         JFileChooser   chooser;
708         URL   source;
709         String   path;
710         File   target;
711         Boolean   capture;
712         int ret;
713         
714         worker = new SiteCapturer ();
715         if (0 >= args.length)
716         {
717             url = (String  )JOptionPane.showInputDialog (
718                 null,
719                 "Enter the URL to capture:",
720                 "Web Site",
721                 JOptionPane.PLAIN_MESSAGE,
722                 null,
723                 null,
724                 "http://htmlparser.sourceforge.net/wiki");
725             if (null != url)
726                 worker.setSource (url);
727             else
728                 System.exit (1);
729         }
730         else
731             worker.setSource (args[0]);
732         if (1 >= args.length)
733         {
734             url = worker.getSource ();
735             source = new URL   (url);
736             path = new File   (new File   ("." + File.separator), source.getHost () + File.separator).getCanonicalPath ();
737             target = new File   (path);
738             chooser = new JFileChooser   (target);
739             chooser.setDialogType (JFileChooser.SAVE_DIALOG);
740             chooser.setFileSelectionMode (JFileChooser.DIRECTORIES_ONLY);
741             chooser.setSelectedFile (target); // this doesn't frickin' work
742             chooser.setMultiSelectionEnabled (false);
743             chooser.setDialogTitle ("Target Directory");
744             ret = chooser.showSaveDialog (null);
745             if (ret == JFileChooser.APPROVE_OPTION)
746                 worker.setTarget (chooser.getSelectedFile ().getAbsolutePath ());
747             else
748                 System.exit (1);
749         }
750         else
751             worker.setTarget (args[1]);
752         if (2 >= args.length)
753         {
754             capture = (Boolean  )JOptionPane.showInputDialog (
755                 null,
756                 "Should resources be captured:",
757                 "Capture Resources",
758                 JOptionPane.PLAIN_MESSAGE,
759                 null,
760                 new Object  [] { Boolean.TRUE, Boolean.FALSE},
761                 Boolean.TRUE);
762             if (null != capture)
763                 worker.setCaptureResources (capture.booleanValue ());
764             else
765                 System.exit (1);
766         }
767         else
768             worker.setCaptureResources ((Boolean.valueOf (args[2]).booleanValue ()));
769         worker.capture ();
770         
771         System.exit (0);
772     }
773 }
774
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags