1 25 package org.archive.crawler.scope; 26 27 import java.io.BufferedReader ; 28 import java.io.IOException ; 29 import java.io.Writer ; 30 import java.util.logging.Level ; 31 import java.util.logging.Logger ; 32 33 import org.apache.commons.httpclient.URIException; 34 import org.archive.net.UURI; 35 import org.archive.net.UURIFactory; 36 import org.archive.util.iterator.LineReadingIterator; 37 import org.archive.util.iterator.RegexpLineIterator; 38 import org.archive.util.iterator.TransformingIteratorWrapper; 39 40 41 46 public class SeedFileIterator extends TransformingIteratorWrapper<String ,UURI> { 47 private static Logger logger = 48 Logger.getLogger(SeedFileIterator.class.getName()); 49 50 BufferedReader input; 51 Writer ignored; 52 53 58 public SeedFileIterator(BufferedReader br) { 59 this(br,null); 60 } 61 62 71 public SeedFileIterator(BufferedReader inputReader, Writer ignoredWriter) { 72 super(); 73 inner = new RegexpLineIterator( 74 new LineReadingIterator(inputReader), 75 RegexpLineIterator.COMMENT_LINE, 76 RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT, 77 RegexpLineIterator.ENTRY); 78 input = inputReader; 79 ignored = ignoredWriter; 80 } 81 82 protected UURI transform(String uri) { 83 if(! uri.matches("[a-zA-Z][\\w+\\-]+:.*")) { uri = "http://"+uri; 87 } 88 try { 89 return UURIFactory.getInstance(uri); 91 } catch (URIException e) { 92 logger.log(Level.INFO, "line in seed file ignored: " 93 + e.getMessage(), e); 94 if(ignored!=null) { 95 try { 96 ignored.write(uri+"\n"); 97 } catch (IOException e1) { 98 e1.printStackTrace(); 100 } 101 } 102 return null; 103 } 104 } 105 106 107 112 protected void noteExhausted() { 113 super.noteExhausted(); 114 close(); 115 } 116 117 public void close() { 118 try { 119 if(input!=null) { 120 input.close(); 121 } 122 if(ignored!=null) { 123 ignored.close(); 124 } 125 } catch (IOException e) { 126 e.printStackTrace(); 128 } 129 } 130 } | Popular Tags |