1 27 package org.htmlparser.parserapplications; 28 29 import java.io.File ; 30 import java.io.FileNotFoundException ; 31 import java.io.FileOutputStream ; 32 import java.io.IOException ; 33 import java.io.InputStream ; 34 import java.io.PrintWriter ; 35 import java.net.MalformedURLException ; 36 import java.net.URL ; 37 import java.net.URLConnection ; 38 import java.util.ArrayList ; 39 import java.util.HashSet ; 40 import javax.swing.JFileChooser ; 41 import javax.swing.JOptionPane ; 42 43 import org.htmlparser.NodeFilter; 44 import org.htmlparser.Parser; 45 import org.htmlparser.PrototypicalNodeFactory; 46 import org.htmlparser.filters.AndFilter; 47 import org.htmlparser.filters.HasAttributeFilter; 48 import org.htmlparser.filters.NodeClassFilter; 49 import org.htmlparser.tags.BaseHrefTag; 50 import org.htmlparser.tags.FrameTag; 51 import org.htmlparser.tags.ImageTag; 52 import org.htmlparser.tags.LinkTag; 53 import org.htmlparser.tags.MetaTag; 54 import org.htmlparser.util.EncodingChangeException; 55 import org.htmlparser.util.NodeIterator; 56 import org.htmlparser.util.NodeList; 57 import org.htmlparser.util.ParserException; 58 59 79 public class SiteCapturer 80 { 81 86 protected String mSource; 87 88 92 protected String mTarget; 93 94 100 protected ArrayList mPages; 101 102 106 protected HashSet mFinished; 107 108 112 protected ArrayList mImages; 113 114 118 protected HashSet mCopied; 119 120 123 protected Parser mParser; 124 125 129 protected boolean mCaptureResources; 130 131 134 protected NodeFilter mFilter; 135 136 140 protected final int TRANSFER_SIZE = 4096; 141 142 145 public SiteCapturer () 146 { 147 PrototypicalNodeFactory factory; 148 149 mSource = null; 150 mTarget = null; 151 mPages = new ArrayList (); 152 mFinished = new HashSet (); 153 mImages = new ArrayList (); 154 mCopied = new HashSet (); 155 mParser = new Parser (); 156 factory = new PrototypicalNodeFactory (); 157 factory.registerTag (new LocalLinkTag ()); 158 factory.registerTag (new LocalFrameTag ()); 159 factory.registerTag (new LocalBaseHrefTag ()); 160 factory.registerTag (new LocalImageTag ()); 161 mParser.setNodeFactory (factory); 162 mCaptureResources = true; 163 mFilter = null; 164 } 165 166 170 public String getSource () 171 { 172 return (mSource); 173 } 174 175 182 public void setSource (String source) 183 { 184 if (source.endsWith ("/")) 185 source = source.substring (0, source.length () - 1); 186 mSource = source; 187 } 188 189 193 public String getTarget () 194 { 195 return (mTarget); 196 } 197 198 203 public void setTarget (String target) 204 { 205 mTarget = target; 206 } 207 208 216 public boolean getCaptureResources () 217 { 218 return (mCaptureResources); 219 } 220 221 225 public void setCaptureResources (boolean capture) 226 { 227 mCaptureResources = capture; 228 } 229 230 231 235 public NodeFilter getFilter () 236 { 237 return (mFilter); 238 } 239 240 244 public void setFilter (NodeFilter filter) 245 { 246 mFilter = filter; 247 } 248 249 259 protected boolean isToBeCaptured (String link) 260 { 261 return ( 262 link.toLowerCase ().startsWith (getSource ().toLowerCase ()) 263 && (-1 == link.indexOf ("?")) 264 && (-1 == link.indexOf ("#"))); 265 } 266 267 272 protected boolean isHtml (String link) 273 throws 274 ParserException 275 { 276 URL url; 277 URLConnection connection; 278 String type; 279 boolean ret; 280 281 ret = false; 282 try 283 { 284 url = new URL (link); 285 connection = url.openConnection (); 286 type = connection.getContentType (); 287 if (type == null) 288 ret = false; 289 else 290 ret = type.startsWith ("text/html"); 291 } 292 catch (Exception e) 293 { 294 throw new ParserException ("URL " + link + " has a problem", e); 295 } 296 297 return (ret); 298 } 299 300 314 protected String makeLocalLink (String link, String current) 315 { 316 int i; 317 int j; 318 String ret; 319 320 if (link.equals (getSource ()) || (!getSource ().endsWith ("/") && link.equals (getSource () + "/"))) 321 ret = "index.html"; else if (link.startsWith (getSource ()) 323 && (link.length () > getSource ().length ())) 324 ret = link.substring (getSource ().length () + 1); 325 else 326 ret = link; 328 if ((null != current) 331 && link.startsWith (getSource ()) 332 && (current.length () > getSource ().length ())) 333 { 334 current = current.substring (getSource ().length () + 1); 335 i = 0; 336 while (-1 != (j = current.indexOf ('/', i))) 337 { 338 ret = "../" + ret; 339 i = j + 1; 340 } 341 } 342 343 return (ret); 344 } 345 346 351 protected void copy () 352 { 353 String link; 354 File file; 355 File dir; 356 URL source; 357 byte[] data; 358 InputStream in; 359 FileOutputStream out; 360 int read; 361 362 link = (String )mImages.remove (0); 363 mCopied.add (link); 364 365 if (getCaptureResources ()) 366 { 367 file = new File (getTarget (), makeLocalLink (link, "")); 368 System.out.println ("copying " + link + " to " + file.getAbsolutePath ()); 369 dir = file.getParentFile (); 371 if (!dir.exists ()) 372 dir.mkdirs (); 373 try 374 { 375 source = new URL (link); 376 data = new byte [TRANSFER_SIZE]; 377 try 378 { 379 in = source.openStream (); 380 try 381 { 382 out = new FileOutputStream (file); 383 try 384 { 385 while (-1 != (read = in.read (data, 0, data.length))) 386 out.write (data, 0, read); 387 } 388 finally 389 { 390 out.close (); 391 } 392 } 393 catch (FileNotFoundException fnfe) 394 { 395 fnfe.printStackTrace (); 396 } 397 finally 398 { 399 in.close (); 400 } 401 } 402 catch (FileNotFoundException fnfe) 403 { 404 System.err.println ("broken link " + fnfe.getMessage () + " ignored"); 405 } 406 } 407 catch (MalformedURLException murle) 408 { 409 murle.printStackTrace (); 410 } 411 catch (IOException ioe) 412 { 413 ioe.printStackTrace (); 414 } 415 } 416 } 417 418 421 protected void process (NodeFilter filter) 422 throws 423 ParserException 424 { 425 String url; 426 int bookmark; 427 NodeList list; 428 NodeList robots; 429 MetaTag robot; 430 String content; 431 File file; 432 File dir; 433 PrintWriter out; 434 435 url = (String )mPages.remove (0); 437 System.out.println ("processing " + url); 438 mFinished.add (url); 439 440 try 441 { 442 bookmark = mPages.size (); 443 mParser.setURL (url); 445 try 446 { 447 list = new NodeList (); 448 for (NodeIterator e = mParser.elements (); e.hasMoreNodes (); ) 449 list.add (e.nextNode ()); } 451 catch (EncodingChangeException ece) 452 { 453 mParser.reset (); 457 list = new NodeList (); 458 for (NodeIterator e = mParser.elements (); e.hasMoreNodes (); ) 459 list.add (e.nextNode ()); 460 } 461 462 robots = list.extractAllNodesThatMatch ( 466 new AndFilter ( 467 new NodeClassFilter (MetaTag.class), 468 new HasAttributeFilter ("name", "robots")), true); 469 if (0 != robots.size ()) 470 { 471 robot = (MetaTag)robots.elementAt (0); 472 content = robot.getAttribute ("content").toLowerCase (); 473 if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("nofollow"))) 474 for (int i = bookmark; i < mPages.size (); i++) 476 mPages.remove (i); 477 if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("noindex"))) 478 return; 479 } 480 481 if (null != filter) 482 list.keepAllNodesThatMatch (filter, true); 483 484 file = new File (getTarget (), makeLocalLink (url, "")); 486 dir = file.getParentFile (); 487 if (!dir.exists ()) 488 dir.mkdirs (); 489 else if (!dir.isDirectory ()) 490 { 491 dir = new File (dir.getParentFile (), dir.getName () + ".content"); 492 if (!dir.exists ()) 493 dir.mkdirs (); 494 file = new File (dir, file.getName ()); 495 } 496 497 try 498 { 499 out = new PrintWriter (new FileOutputStream (file)); 500 for (int i = 0; i < list.size (); i++) 501 out.print (list.elementAt (i).toHtml ()); 502 out.close (); 503 } 504 catch (FileNotFoundException fnfe) 505 { 506 fnfe.printStackTrace (); 507 } 508 } 509 catch (ParserException pe) 510 { 511 String message; 512 513 message = pe.getMessage (); 516 if ((null != message) && (message.endsWith ("does not contain text"))) 517 { 518 if (!mCopied.contains (url)) 519 if (!mImages.contains (url)) 520 mImages.add (url); 521 mFinished.remove (url); 522 } 523 else 524 throw pe; 525 } 526 } 527 528 532 class LocalLinkTag extends LinkTag 533 { 534 public void doSemanticAction () 535 throws 536 ParserException 537 { 538 boolean html; 539 String link; 540 541 link = getLink (); 543 if (isToBeCaptured (link)) 545 { 546 if (mFinished.contains (link)) 548 html = true; 549 else if (mPages.contains (link)) 550 html = true; 551 else if (mCopied.contains (link)) 552 html = false; 553 else if (mImages.contains (link)) 554 html = false; 555 else 556 { html = isHtml (link); 558 if (html) 559 mPages.add (link); 560 else 561 mImages.add (link); 562 } 563 if (html || (!html && getCaptureResources ())) 565 link = makeLocalLink (link, mParser.getLexer ().getPage ().getUrl ()); 566 setLink (link); 567 } 568 } 569 } 570 571 575 class LocalFrameTag extends FrameTag 576 { 577 public void doSemanticAction () 578 throws 579 ParserException 580 { 581 boolean html; 582 String link; 583 584 link = getFrameLocation (); 586 if (isToBeCaptured (link)) 588 { 589 if (mFinished.contains (link)) 591 html = true; 592 else if (mPages.contains (link)) 593 html = true; 594 else if (mCopied.contains (link)) 595 html = false; 596 else if (mImages.contains (link)) 597 html = false; 598 else 599 { html = isHtml (link); 601 if (html) 602 mPages.add (link); 603 else 604 mImages.add (link); 605 } 606 if (html || (!html && getCaptureResources ())) 608 link = makeLocalLink (link, mParser.getLexer ().getPage ().getUrl ()); 609 setFrameLocation (link); 610 } 611 } 612 } 613 614 620 class LocalImageTag extends ImageTag 621 { 622 public void doSemanticAction () 623 throws 624 ParserException 625 { 626 String image; 627 628 image = getImageURL (); 630 if (isToBeCaptured (image)) 632 { if (!mCopied.contains (image)) 634 if (!mImages.contains (image)) 635 mImages.add (image); 636 if (getCaptureResources ()) 637 image = makeLocalLink (image, mParser.getLexer ().getPage ().getUrl ()); 638 setImageURL (image); 640 } 641 } 642 } 643 644 649 class LocalBaseHrefTag extends BaseHrefTag 650 { 651 public String toHtml () 653 { 654 return (""); 655 } 656 } 657 658 661 public void capture () 662 { 663 664 mPages.clear (); 665 mPages.add (getSource ()); 666 while (0 != mPages.size ()) 667 try 668 { 669 process (getFilter ()); 670 while (0 != mImages.size ()) 671 copy (); 672 } 673 catch (ParserException pe) 674 { Throwable throwable; 677 678 throwable = pe.getThrowable (); 679 if (null != throwable) 680 { 681 throwable = throwable.getCause (); 682 if (throwable instanceof FileNotFoundException ) 683 System.err.println ("broken link " + ((FileNotFoundException )throwable).getMessage () + " ignored"); 684 else 685 pe.printStackTrace (); 686 } 687 else 688 pe.printStackTrace (); 689 } 690 } 691 692 700 public static void main (String [] args) 701 throws 702 MalformedURLException , 703 IOException 704 { 705 SiteCapturer worker; 706 String url; 707 JFileChooser chooser; 708 URL source; 709 String path; 710 File target; 711 Boolean capture; 712 int ret; 713 714 worker = new SiteCapturer (); 715 if (0 >= args.length) 716 { 717 url = (String )JOptionPane.showInputDialog ( 718 null, 719 "Enter the URL to capture:", 720 "Web Site", 721 JOptionPane.PLAIN_MESSAGE, 722 null, 723 null, 724 "http://htmlparser.sourceforge.net/wiki"); 725 if (null != url) 726 worker.setSource (url); 727 else 728 System.exit (1); 729 } 730 else 731 worker.setSource (args[0]); 732 if (1 >= args.length) 733 { 734 url = worker.getSource (); 735 source = new URL (url); 736 path = new File (new File ("." + File.separator), source.getHost () + File.separator).getCanonicalPath (); 737 target = new File (path); 738 chooser = new JFileChooser (target); 739 chooser.setDialogType (JFileChooser.SAVE_DIALOG); 740 chooser.setFileSelectionMode (JFileChooser.DIRECTORIES_ONLY); 741 chooser.setSelectedFile (target); chooser.setMultiSelectionEnabled (false); 743 chooser.setDialogTitle ("Target Directory"); 744 ret = chooser.showSaveDialog (null); 745 if (ret == JFileChooser.APPROVE_OPTION) 746 worker.setTarget (chooser.getSelectedFile ().getAbsolutePath ()); 747 else 748 System.exit (1); 749 } 750 else 751 worker.setTarget (args[1]); 752 if (2 >= args.length) 753 { 754 capture = (Boolean )JOptionPane.showInputDialog ( 755 null, 756 "Should resources be captured:", 757 "Capture Resources", 758 JOptionPane.PLAIN_MESSAGE, 759 null, 760 new Object [] { Boolean.TRUE, Boolean.FALSE}, 761 Boolean.TRUE); 762 if (null != capture) 763 worker.setCaptureResources (capture.booleanValue ()); 764 else 765 System.exit (1); 766 } 767 else 768 worker.setCaptureResources ((Boolean.valueOf (args[2]).booleanValue ())); 769 worker.capture (); 770 771 System.exit (0); 772 } 773 } 774 | Popular Tags |