KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > htmlparser > parserapplications > SiteCapturer


1 // HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
2
// http://sourceforge.org/projects/htmlparser
3
// Copyright (C) 2003 Derrick Oswald
4
//
5
// Revision Control Information
6
//
7
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/SiteCapturer.java,v $
8
// $Author: derrickoswald $
9
// $Date: 2005/02/14 23:49:24 $
10
// $Revision: 1.7 $
11
//
12
// This library is free software; you can redistribute it and/or
13
// modify it under the terms of the GNU Lesser General Public
14
// License as published by the Free Software Foundation; either
15
// version 2.1 of the License, or (at your option) any later version.
16
//
17
// This library is distributed in the hope that it will be useful,
18
// but WITHOUT ANY WARRANTY; without even the implied warranty of
19
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20
// Lesser General Public License for more details.
21
//
22
// You should have received a copy of the GNU Lesser General Public
23
// License along with this library; if not, write to the Free Software
24
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25
//
26

27 package org.htmlparser.parserapplications;
28
29 import java.io.File JavaDoc;
30 import java.io.FileNotFoundException JavaDoc;
31 import java.io.FileOutputStream JavaDoc;
32 import java.io.IOException JavaDoc;
33 import java.io.InputStream JavaDoc;
34 import java.io.PrintWriter JavaDoc;
35 import java.net.MalformedURLException JavaDoc;
36 import java.net.URL JavaDoc;
37 import java.net.URLConnection JavaDoc;
38 import java.util.ArrayList JavaDoc;
39 import java.util.HashSet JavaDoc;
40 import javax.swing.JFileChooser JavaDoc;
41 import javax.swing.JOptionPane JavaDoc;
42
43 import org.htmlparser.NodeFilter;
44 import org.htmlparser.Parser;
45 import org.htmlparser.PrototypicalNodeFactory;
46 import org.htmlparser.filters.AndFilter;
47 import org.htmlparser.filters.HasAttributeFilter;
48 import org.htmlparser.filters.NodeClassFilter;
49 import org.htmlparser.tags.BaseHrefTag;
50 import org.htmlparser.tags.FrameTag;
51 import org.htmlparser.tags.ImageTag;
52 import org.htmlparser.tags.LinkTag;
53 import org.htmlparser.tags.MetaTag;
54 import org.htmlparser.util.EncodingChangeException;
55 import org.htmlparser.util.NodeIterator;
56 import org.htmlparser.util.NodeList;
57 import org.htmlparser.util.ParserException;
58
59 /**
60  * Save a web site locally.
61  * Illustrative program to save a web site contents locally.
62  * It was created to demonstrate URL rewriting in its simplest form.
63  * It uses customized tags in the NodeFactory to alter the URLs.
64  * This program has a number of limitations:
65  * <ul>
66  * <li>it doesn't capture forms, this would involve too many assumptions</li>
67  * <li>it doesn't capture script references, so funky onMouseOver and other
68  * non-static content will not be faithfully reproduced</li>
69  * <li>it doesn't handle style sheets</li>
70  * <li>it doesn't dig into attributes that might reference resources, so
71  * for example, background images won't necessarily be captured</li>
72  * <li>worst of all, it gets confused when a URL both has content and is
73  * the prefix for other content,
74  * i.e. http://whatever.com/top and http://whatever.com/top/sub.html both
75  * yield content, since this cannot be faithfully replicated to a static
76  * directory structure (this happens a lot with servlet based sites)</li>
77  *</ul>
78  */

79 public class SiteCapturer
80 {
81     /**
82      * The web site to capture.
83      * This is used as the base URL in deciding whether to adjust a link
84      * and whether to capture a page or not.
85      */

86     protected String JavaDoc mSource;
87
88     /**
89      * The local directory to capture to.
90      * This is used as a base prefix for files saved locally.
91      */

92     protected String JavaDoc mTarget;
93
94     /**
95      * The list of pages to capture.
96      * Links are added to this list as they are discovered, and removed in
97      * sequential order (FIFO queue) leading to a breadth
98      * first traversal of the web site space.
99      */

100     protected ArrayList JavaDoc mPages;
101
102     /**
103      * The set of pages already captured.
104      * Used to avoid repeated acquisition of the same page.
105      */

106     protected HashSet JavaDoc mFinished;
107
108     /**
109      * The list of resources to copy.
110      * Images and other resources are added to this list as they are discovered.
111      */

112     protected ArrayList JavaDoc mImages;
113
114     /**
115      * The set of resources already copied.
116      * Used to avoid repeated acquisition of the same images and other resources.
117      */

118     protected HashSet JavaDoc mCopied;
119
120     /**
121      * The parser to use for processing.
122      */

123     protected Parser mParser;
124
125     /**
126      * If <code>true</code>, save resources locally too,
127      * otherwise, leave resource links pointing to original page.
128      */

129     protected boolean mCaptureResources;
130
131     /**
132      * The filter to apply to the nodes retrieved.
133      */

134     protected NodeFilter mFilter;
135
136     /**
137      * Copy buffer size.
138      * Resources are moved to disk in chunks this size or less.
139      */

140     protected final int TRANSFER_SIZE = 4096;
141
142     /**
143      * Create a web site capturer.
144      */

145     public SiteCapturer ()
146     {
147         PrototypicalNodeFactory factory;
148
149         mSource = null;
150         mTarget = null;
151         mPages = new ArrayList JavaDoc ();
152         mFinished = new HashSet JavaDoc ();
153         mImages = new ArrayList JavaDoc ();
154         mCopied = new HashSet JavaDoc ();
155         mParser = new Parser ();
156         factory = new PrototypicalNodeFactory ();
157         factory.registerTag (new LocalLinkTag ());
158         factory.registerTag (new LocalFrameTag ());
159         factory.registerTag (new LocalBaseHrefTag ());
160         factory.registerTag (new LocalImageTag ());
161         mParser.setNodeFactory (factory);
162         mCaptureResources = true;
163         mFilter = null;
164     }
165
166     /**
167      * Getter for property source.
168      * @return Value of property source.
169      */

170     public String JavaDoc getSource ()
171     {
172         return (mSource);
173     }
174     
175     /**
176      * Setter for property source.
177      * This is the base URL to capture. URL's that don't start with this prefix
178      * are ignored (left as is), while the ones with this URL as a base are
179      * re-homed to the local target.
180      * @param source New value of property source.
181      */

182     public void setSource (String JavaDoc source)
183     {
184         if (source.endsWith ("/"))
185             source = source.substring (0, source.length () - 1);
186         mSource = source;
187     }
188     
189     /**
190      * Getter for property target.
191      * @return Value of property target.
192      */

193     public String JavaDoc getTarget ()
194     {
195         return (mTarget);
196     }
197     
198     /**
199      * Setter for property target.
200      * This is the local directory under which to save the site's pages.
201      * @param target New value of property target.
202      */

203     public void setTarget (String JavaDoc target)
204     {
205         mTarget = target;
206     }
207
208     /**
209      * Getter for property captureResources.
210      * If <code>true</code>, the images and other resources referenced by
211      * the site and within the base URL tree are also copied locally to the
212      * target directory. If <code>false</code>, the image links are left 'as
213      * is', still refering to the original site.
214      * @return Value of property captureResources.
215      */

216     public boolean getCaptureResources ()
217     {
218         return (mCaptureResources);
219     }
220     
221     /**
222      * Setter for property captureResources.
223      * @param capture New value of property captureResources.
224      */

225     public void setCaptureResources (boolean capture)
226     {
227         mCaptureResources = capture;
228     }
229     
230     
231     /** Getter for property filter.
232      * @return Value of property filter.
233      *
234      */

235     public NodeFilter getFilter ()
236     {
237         return (mFilter);
238     }
239     
240     /** Setter for property filter.
241      * @param filter New value of property filter.
242      *
243      */

244     public void setFilter (NodeFilter filter)
245     {
246         mFilter = filter;
247     }
248     
249     /**
250      * Returns <code>true</code> if the link is one we are interested in.
251      * @param link The link to be checked.
252      * @return <code>true</code> if the link has the source URL as a prefix
253      * and doesn't contain '?' or '#'; the former because we won't be able to
254      * handle server side queries in the static target directory structure and
255      * the latter because presumably the full page with that reference has
256      * already been captured previously. This performs a case insensitive
257      * comparison, which is cheating really, but it's cheap.
258      */

259     protected boolean isToBeCaptured (String JavaDoc link)
260     {
261         return (
262             link.toLowerCase ().startsWith (getSource ().toLowerCase ())
263             && (-1 == link.indexOf ("?"))
264             && (-1 == link.indexOf ("#")));
265     }
266
267     /**
268      * Returns <code>true</code> if the link contains text/html content.
269      * @return <code>true</code> if the HTTP header indicates the type is
270      * "text/html".
271      */

272     protected boolean isHtml (String JavaDoc link)
273         throws
274             ParserException
275     {
276         URL JavaDoc url;
277         URLConnection JavaDoc connection;
278         String JavaDoc type;
279         boolean ret;
280
281         ret = false;
282         try
283         {
284             url = new URL JavaDoc (link);
285             connection = url.openConnection ();
286             type = connection.getContentType ();
287             if (type == null)
288                 ret = false;
289             else
290                 ret = type.startsWith ("text/html");
291         }
292         catch (Exception JavaDoc e)
293         {
294             throw new ParserException ("URL " + link + " has a problem", e);
295         }
296         
297         return (ret);
298     }
299
300     /**
301      * Converts a link to local.
302      * A relative link can be used to construct both a URL and a file name.
303      * Basically, the operation is to strip off the base url, if any,
304      * and then prepend as many dot-dots as necessary to make
305      * it relative to the current page.
306      * A bit of a kludge handles the root page specially by calling it
307      * index.html, even though that probably isn't it's real file name.
308      * This isn't pretty, but it works for me.
309      * @param link The link to make relative.
310      * @param current The current page URL, or empty if it's an absolute URL
311      * that needs to be converted.
312      * @return The URL relative to the current page.
313      */

314     protected String JavaDoc makeLocalLink (String JavaDoc link, String JavaDoc current)
315     {
316         int i;
317         int j;
318         String JavaDoc ret;
319
320         if (link.equals (getSource ()) || (!getSource ().endsWith ("/") && link.equals (getSource () + "/")))
321             ret = "index.html"; // handle the root page specially
322
else if (link.startsWith (getSource ())
323                 && (link.length () > getSource ().length ()))
324             ret = link.substring (getSource ().length () + 1);
325         else
326             ret = link; // give up
327

328         // make it relative to the current page by prepending "../" for
329
// each '/' in the current local path
330
if ((null != current)
331             && link.startsWith (getSource ())
332             && (current.length () > getSource ().length ()))
333         {
334             current = current.substring (getSource ().length () + 1);
335             i = 0;
336             while (-1 != (j = current.indexOf ('/', i)))
337             {
338                 ret = "../" + ret;
339                 i = j + 1;
340             }
341         }
342
343         return (ret);
344     }
345
346     /**
347      * Copy a resource (image) locally.
348      * Removes one element from the 'to be copied' list and saves the
349      * resource it points to locally as a file.
350      */

351     protected void copy ()
352     {
353         String JavaDoc link;
354         File JavaDoc file;
355         File JavaDoc dir;
356         URL JavaDoc source;
357         byte[] data;
358         InputStream JavaDoc in;
359         FileOutputStream JavaDoc out;
360         int read;
361
362         link = (String JavaDoc)mImages.remove (0);
363         mCopied.add (link);
364
365         if (getCaptureResources ())
366         {
367             file = new File JavaDoc (getTarget (), makeLocalLink (link, ""));
368             System.out.println ("copying " + link + " to " + file.getAbsolutePath ());
369             // ensure directory exists
370
dir = file.getParentFile ();
371             if (!dir.exists ())
372                 dir.mkdirs ();
373             try
374             {
375                 source = new URL JavaDoc (link);
376                 data = new byte [TRANSFER_SIZE];
377                 try
378                 {
379                     in = source.openStream ();
380                     try
381                     {
382                         out = new FileOutputStream JavaDoc (file);
383                         try
384                         {
385                             while (-1 != (read = in.read (data, 0, data.length)))
386                                 out.write (data, 0, read);
387                         }
388                         finally
389                         {
390                             out.close ();
391                         }
392                     }
393                     catch (FileNotFoundException JavaDoc fnfe)
394                     {
395                         fnfe.printStackTrace ();
396                     }
397                     finally
398                     {
399                         in.close ();
400                     }
401                 }
402                 catch (FileNotFoundException JavaDoc fnfe)
403                 {
404                     System.err.println ("broken link " + fnfe.getMessage () + " ignored");
405                 }
406             }
407             catch (MalformedURLException JavaDoc murle)
408             {
409                 murle.printStackTrace ();
410             }
411             catch (IOException JavaDoc ioe)
412             {
413                 ioe.printStackTrace ();
414             }
415         }
416     }
417  
418     /**
419      * Process a single page.
420      */

421     protected void process (NodeFilter filter)
422         throws
423             ParserException
424     {
425         String JavaDoc url;
426         int bookmark;
427         NodeList list;
428         NodeList robots;
429         MetaTag robot;
430         String JavaDoc content;
431         File JavaDoc file;
432         File JavaDoc dir;
433         PrintWriter JavaDoc out;
434
435         // get the next URL and add it to the done pile
436
url = (String JavaDoc)mPages.remove (0);
437         System.out.println ("processing " + url);
438         mFinished.add (url);
439
440         try
441         {
442             bookmark = mPages.size ();
443             // fetch the page and gather the list of nodes
444
mParser.setURL (url);
445             try
446             {
447                 list = new NodeList ();
448                 for (NodeIterator e = mParser.elements (); e.hasMoreNodes (); )
449                     list.add (e.nextNode ()); // URL conversion occurs in the tags
450
}
451             catch (EncodingChangeException ece)
452             {
453                 // fix bug #998195 SiteCatpurer just crashed
454
// try again with the encoding now set correctly
455
// hopefully mPages, mImages, mCopied and mFinished won't be corrupted
456
mParser.reset ();
457                 list = new NodeList ();
458                 for (NodeIterator e = mParser.elements (); e.hasMoreNodes (); )
459                     list.add (e.nextNode ());
460             }
461
462             // handle robots meta tag according to http://www.robotstxt.org/wc/meta-user.html
463
// <meta name="robots" content="index,follow" />
464
// <meta name="robots" content="noindex,nofollow" />
465
robots = list.extractAllNodesThatMatch (
466                 new AndFilter (
467                     new NodeClassFilter (MetaTag.class),
468                     new HasAttributeFilter ("name", "robots")), true);
469             if (0 != robots.size ())
470             {
471                 robot = (MetaTag)robots.elementAt (0);
472                 content = robot.getAttribute ("content").toLowerCase ();
473                 if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("nofollow")))
474                     // reset mPages
475
for (int i = bookmark; i < mPages.size (); i++)
476                         mPages.remove (i);
477                 if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("noindex")))
478                     return;
479             }
480     
481             if (null != filter)
482                 list.keepAllNodesThatMatch (filter, true);
483
484             // save the page locally
485
file = new File JavaDoc (getTarget (), makeLocalLink (url, ""));
486             dir = file.getParentFile ();
487             if (!dir.exists ())
488                 dir.mkdirs ();
489             else if (!dir.isDirectory ())
490             {
491                 dir = new File JavaDoc (dir.getParentFile (), dir.getName () + ".content");
492                 if (!dir.exists ())
493                     dir.mkdirs ();
494                 file = new File JavaDoc (dir, file.getName ());
495             }
496                 
497             try
498             {
499                 out = new PrintWriter JavaDoc (new FileOutputStream JavaDoc (file));
500                 for (int i = 0; i < list.size (); i++)
501                     out.print (list.elementAt (i).toHtml ());
502                 out.close ();
503             }
504             catch (FileNotFoundException JavaDoc fnfe)
505             {
506                 fnfe.printStackTrace ();
507             }
508         }
509         catch (ParserException pe)
510         {
511             String JavaDoc message;
512             
513             // this exception handling is suboptimal,
514
// but it recognizes resources that aren't text/html
515
message = pe.getMessage ();
516             if ((null != message) && (message.endsWith ("does not contain text")))
517             {
518                 if (!mCopied.contains (url))
519                     if (!mImages.contains (url))
520                         mImages.add (url);
521                 mFinished.remove (url);
522             }
523             else
524                 throw pe;
525         }
526     }
527
528     /**
529      * Link tag that rewrites the HREF.
530      * The HREF is changed to a local target if it matches the source.
531      */

532     class LocalLinkTag extends LinkTag
533     {
534         public void doSemanticAction ()
535             throws
536                 ParserException
537         {
538             boolean html;
539             String JavaDoc link;
540
541             // get the link
542
link = getLink ();
543             // check if it needs to be captured
544
if (isToBeCaptured (link))
545             {
546                 // add the link to a list to be processed
547
if (mFinished.contains (link))
548                     html = true;
549                 else if (mPages.contains (link))
550                     html = true;
551                 else if (mCopied.contains (link))
552                     html = false;
553                 else if (mImages.contains (link))
554                     html = false;
555                 else
556                 { // this test is expensive, do it reluctantly
557
html = isHtml (link);
558                     if (html)
559                         mPages.add (link);
560                     else
561                         mImages.add (link);
562                 }
563                 // alter the link
564
if (html || (!html && getCaptureResources ()))
565                     link = makeLocalLink (link, mParser.getLexer ().getPage ().getUrl ());
566                 setLink (link);
567             }
568         }
569     }
570
571     /**
572      * Frame tag that rewrites the SRC URLs.
573      * The SRC URLs are mapped to local targets if they match the source.
574      */

575     class LocalFrameTag extends FrameTag
576     {
577         public void doSemanticAction ()
578             throws
579                 ParserException
580         {
581             boolean html;
582             String JavaDoc link;
583
584             // get the link
585
link = getFrameLocation ();
586             // check if it needs to be captured
587
if (isToBeCaptured (link))
588             {
589                 // add the link to a list to be processed
590
if (mFinished.contains (link))
591                     html = true;
592                 else if (mPages.contains (link))
593                     html = true;
594                 else if (mCopied.contains (link))
595                     html = false;
596                 else if (mImages.contains (link))
597                     html = false;
598                 else
599                 { // this test is expensive, do it reluctantly
600
html = isHtml (link);
601                     if (html)
602                         mPages.add (link);
603                     else
604                         mImages.add (link);
605                 }
606                 // alter the link
607
if (html || (!html && getCaptureResources ()))
608                     link = makeLocalLink (link, mParser.getLexer ().getPage ().getUrl ());
609                 setFrameLocation (link);
610             }
611         }
612     }
613
614     /**
615      * Image tag that rewrites the SRC URL.
616      * If resources are being captured the SRC is mapped to a local target if
617      * it matches the source, otherwise it is convered to a full URL to point
618      * back to the original site.
619      */

620     class LocalImageTag extends ImageTag
621     {
622         public void doSemanticAction ()
623             throws
624                 ParserException
625         {
626             String JavaDoc image;
627             
628             // get the image url
629
image = getImageURL ();
630             // check if it needs to be captured
631
if (isToBeCaptured (image))
632             { // add the image to the list needing to be copied
633
if (!mCopied.contains (image))
634                     if (!mImages.contains (image))
635                         mImages.add (image);
636                 if (getCaptureResources ())
637                     image = makeLocalLink (image, mParser.getLexer ().getPage ().getUrl ());
638                 // alter the link
639
setImageURL (image);
640             }
641         }
642     }
643
644     /**
645      * Base tag that doesn't show.
646      * The toHtml() method is overridden to return an empty string,
647      * effectively shutting off the base reference.
648      */

649     class LocalBaseHrefTag extends BaseHrefTag
650     {
651         // we don't want to have a base pointing back at the source page
652
public String JavaDoc toHtml ()
653         {
654             return ("");
655         }
656     }
657
658     /**
659      * Perform the capture.
660      */

661     public void capture ()
662     {
663        
664         mPages.clear ();
665         mPages.add (getSource ());
666         while (0 != mPages.size ())
667             try
668             {
669                 process (getFilter ());
670                 while (0 != mImages.size ())
671                     copy ();
672             }
673             catch (ParserException pe)
674             { // this exception handling is suboptimal,
675
// but it messages correctly about broken links
676
Throwable JavaDoc throwable;
677                 
678                 throwable = pe.getThrowable ();
679                 if (null != throwable)
680                 {
681                     throwable = throwable.getCause ();
682                     if (throwable instanceof FileNotFoundException JavaDoc)
683                         System.err.println ("broken link " + ((FileNotFoundException JavaDoc)throwable).getMessage () + " ignored");
684                     else
685                         pe.printStackTrace ();
686                 }
687                 else
688                     pe.printStackTrace ();
689             }
690     }
691
692     /**
693      * Mainline to capture a web site locally.
694      * @param args The command line arguments.
695      * There are three arguments the web site to capture, the local directory
696      * to save it to, and a flag (true or false) to indicate whether resources
697      * such as images and video are to be captured as well.
698      * These are requested via dialog boxes if not supplied.
699      */

700     public static void main (String JavaDoc[] args)
701         throws
702             MalformedURLException JavaDoc,
703             IOException JavaDoc
704     {
705         SiteCapturer worker;
706         String JavaDoc url;
707         JFileChooser JavaDoc chooser;
708         URL JavaDoc source;
709         String JavaDoc path;
710         File JavaDoc target;
711         Boolean JavaDoc capture;
712         int ret;
713         
714         worker = new SiteCapturer ();
715         if (0 >= args.length)
716         {
717             url = (String JavaDoc)JOptionPane.showInputDialog (
718                 null,
719                 "Enter the URL to capture:",
720                 "Web Site",
721                 JOptionPane.PLAIN_MESSAGE,
722                 null,
723                 null,
724                 "http://htmlparser.sourceforge.net/wiki");
725             if (null != url)
726                 worker.setSource (url);
727             else
728                 System.exit (1);
729         }
730         else
731             worker.setSource (args[0]);
732         if (1 >= args.length)
733         {
734             url = worker.getSource ();
735             source = new URL JavaDoc (url);
736             path = new File JavaDoc (new File JavaDoc ("." + File.separator), source.getHost () + File.separator).getCanonicalPath ();
737             target = new File JavaDoc (path);
738             chooser = new JFileChooser JavaDoc (target);
739             chooser.setDialogType (JFileChooser.SAVE_DIALOG);
740             chooser.setFileSelectionMode (JFileChooser.DIRECTORIES_ONLY);
741             chooser.setSelectedFile (target); // this doesn't frickin' work
742
chooser.setMultiSelectionEnabled (false);
743             chooser.setDialogTitle ("Target Directory");
744             ret = chooser.showSaveDialog (null);
745             if (ret == JFileChooser.APPROVE_OPTION)
746                 worker.setTarget (chooser.getSelectedFile ().getAbsolutePath ());
747             else
748                 System.exit (1);
749         }
750         else
751             worker.setTarget (args[1]);
752         if (2 >= args.length)
753         {
754             capture = (Boolean JavaDoc)JOptionPane.showInputDialog (
755                 null,
756                 "Should resources be captured:",
757                 "Capture Resources",
758                 JOptionPane.PLAIN_MESSAGE,
759                 null,
760                 new Object JavaDoc[] { Boolean.TRUE, Boolean.FALSE},
761                 Boolean.TRUE);
762             if (null != capture)
763                 worker.setCaptureResources (capture.booleanValue ());
764             else
765                 System.exit (1);
766         }
767         else
768             worker.setCaptureResources ((Boolean.valueOf (args[2]).booleanValue ()));
769         worker.capture ();
770         
771         System.exit (0);
772     }
773 }
774
Popular Tags