1 17 18 19 20 package org.apache.lenya.util; 21 22 import java.io.FileReader ; 23 import java.io.IOException ; 24 import java.io.InputStreamReader ; 25 import java.io.Reader ; 26 import java.net.URL ; 27 import java.net.URLConnection ; 28 import java.util.Iterator ; 29 import java.util.List ; 30 31 import javax.swing.text.html.parser.ParserDelegator ; 32 33 34 37 public class HTML { 38 HTMLHandler htmlHandler; 39 40 47 public HTML(String uri) throws IOException { 48 ParserDelegator pd = new ParserDelegator (); 49 htmlHandler = new HTMLHandler(); 50 pd.parse(getReader(uri), htmlHandler, true); 51 } 52 53 58 public static void main(String [] args) { 59 if (args.length != 1) { 60 System.err.println("Usage: HTML uri (file or url)"); 61 62 return; 63 } 64 65 try { 66 HTML html = new HTML(args[0]); 67 68 List img_src_list = html.getImageSrcs(false); 69 System.out.println("<im src"); 70 71 Iterator img_src_iterator = img_src_list.iterator(); 72 73 while (img_src_iterator.hasNext()) { 74 System.out.println((String ) img_src_iterator.next()); 75 } 76 77 List a_href_list = html.getAnchorHRefs(false); 78 System.out.println("<a href"); 79 80 Iterator a_href_iterator = a_href_list.iterator(); 81 82 while (a_href_iterator.hasNext()) { 83 System.out.println((String ) a_href_iterator.next()); 84 } 85 86 List link_href_list = html.getLinkHRefs(false); 87 System.out.println("<link href"); 88 89 Iterator link_href_iterator = link_href_list.iterator(); 90 91 while (link_href_iterator.hasNext()) { 92 System.out.println((String ) link_href_iterator.next()); 93 } 94 } catch (Exception e) { 95 System.err.println(".main(): " + e); 96 } 97 } 98 99 106 public List getAnchorHRefs(boolean duplicate) { 107 if (duplicate) { 108 return htmlHandler.getAllAHRefs(); 109 } else { 110 return htmlHandler.getAHRefs(); 111 } 112 } 113 114 121 public List getLinkHRefs(boolean duplicate) { 122 if (duplicate) { 123 return htmlHandler.getAllLinkHRefs(); 124 } else { 125 return htmlHandler.getLinkHRefs(); 126 } 127 } 128 129 136 public List getImageSrcs(boolean duplicate) { 137 if (duplicate) { 138 return htmlHandler.getAllImageSrcs(); 139 } else { 140 return htmlHandler.getImageSrcs(); 141 } 142 } 143 144 private Reader getReader(String uri) throws IOException { 145 if (uri.startsWith("http:")) { 146 URLConnection connection = new URL (uri).openConnection(); 148 149 return new InputStreamReader (connection.getInputStream()); 150 } else { 151 return new FileReader (uri); 153 } 154 } 155 } 156 | Popular Tags |