1 4 package net.sf.mybatchfwk.test.websitedownloader; 5 6 import java.io.BufferedReader ; 7 import java.io.BufferedWriter ; 8 import java.io.File ; 9 import java.io.FileReader ; 10 import java.io.FileWriter ; 11 import java.io.IOException ; 12 import java.io.InputStreamReader ; 13 import java.io.Reader ; 14 import java.net.URL ; 15 import java.util.LinkedList ; 16 import java.util.List ; 17 18 import javax.swing.text.BadLocationException ; 19 import javax.swing.text.Document ; 20 import javax.swing.text.EditorKit ; 21 import javax.swing.text.ElementIterator ; 22 import javax.swing.text.SimpleAttributeSet ; 23 import javax.swing.text.html.HTML ; 24 import javax.swing.text.html.HTMLEditorKit ; 25 26 import net.sf.mybatchfwk.ITask; 27 import net.sf.mybatchfwk.TaskExecutionException; 28 29 public class DownloadTask implements ITask { 30 31 private URL url; 32 private File downloadFolder; 33 private String [] links; 34 35 public DownloadTask(URL url, File downloadFolder) { 36 this.url = url; 37 this.downloadFolder = downloadFolder; 38 } 39 40 public String getId() { 41 return "[D]" + url.toString(); 42 } 43 44 public void run() { 45 String fileUrl = url.getFile(); 46 47 File parent = downloadFolder; 48 if (fileUrl.startsWith("/") && (fileUrl.length() > 1) && (fileUrl.indexOf('/', 1) != -1)) { 49 parent = new File (downloadFolder, fileUrl.substring(0, fileUrl.lastIndexOf('/'))); 50 parent.mkdirs(); 51 } 52 53 if ((fileUrl.lastIndexOf('/')) < (fileUrl.length()-1)) { 54 File destination = new File (parent, fileUrl.substring(fileUrl.lastIndexOf('/')+1, fileUrl.length())); 55 try { 56 downloadFile(url, destination); 57 if (isHTML(destination)) { 58 this.links = getRelativeLinks(destination); 59 } 60 } catch (IOException e) { 61 throw new TaskExecutionException(e); 62 } catch (BadLocationException e) { 63 throw new TaskExecutionException(e); 64 } 65 } 66 } 67 68 protected boolean isHTML(File file) { 69 String [] parts = file.getName().split("\\."); 70 if (parts.length > 1) { 71 String extension = parts[parts.length-1]; 72 if ("HTM".equalsIgnoreCase(extension) || "HTML".equalsIgnoreCase(extension)) { 73 return true; 74 } 75 } 76 return false; 77 } 78 79 protected void downloadFile(URL source, File destination) throws IOException { 80 BufferedReader in = null; 81 BufferedWriter out = null; 82 83 try { 84 in = new BufferedReader (new InputStreamReader (source.openStream())); 85 out = new BufferedWriter (new FileWriter (destination)); 86 87 String line = null; 88 while ((line = in.readLine()) != null) { 89 out.write(line + "\n"); 90 } 91 in.close(); 92 out.close(); 93 } finally { 94 if (out != null) { 95 try { 96 out.close(); 97 } catch (IOException e) {} 98 } 99 if (in != null) { 100 try { 101 in.close(); 102 } catch (IOException e) {} 103 } 104 } 105 } 106 107 protected String [] getRelativeLinks(File file) throws IOException , BadLocationException { 108 List <String > links = new LinkedList <String >(); 109 110 EditorKit kit = new HTMLEditorKit (); 111 Document doc = kit.createDefaultDocument(); 112 113 doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE); 116 Reader rd = new FileReader (file); 118 119 kit.read(rd, doc, 0); 121 122 ElementIterator it = new ElementIterator (doc); 125 javax.swing.text.Element elem; 126 while ((elem = it.next()) != null) { 127 SimpleAttributeSet s = (SimpleAttributeSet ) elem.getAttributes().getAttribute(HTML.Tag.A); 128 if (s != null) { 129 String href = (String ) s.getAttribute(HTML.Attribute.HREF); 130 if ((href != null) && (!href.startsWith("http"))) { 131 links.add(href.toString()); 132 } 133 } 134 } 135 136 return links.toArray(new String [links.size()]); 137 } 138 139 142 public String [] getLinks() { 143 return links; 144 } 145 } 146 | Popular Tags |