KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > scope > SeedFileIterator


/* SeedFileIterator
 *
 * $Id: SeedFileIterator.java,v 1.8.6.1 2007/01/13 01:31:25 stack-sf Exp $
 *
 * Created on Mar 28, 2005
 *
 * Copyright (C) 2005 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.archive.crawler.scope;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.IOException;
import java.io.Writer;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.URIException;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.iterator.LineReadingIterator;
import org.archive.util.iterator.RegexpLineIterator;
import org.archive.util.iterator.TransformingIteratorWrapper;

41 /**
42  * Iterator wrapper for seeds file on disk.
43  *
44  * @author gojomo
45  */

46 public class SeedFileIterator extends TransformingIteratorWrapper<String JavaDoc,UURI> {
47     private static Logger JavaDoc logger =
48         Logger.getLogger(SeedFileIterator.class.getName());
49     
50     BufferedReader JavaDoc input;
51     Writer JavaDoc ignored;
52     
53     /**
54      * Construct a SeedFileIterator over the input available
55      * from the supplied BufferedReader.
56      * @param br BufferedReader from which to get seeds
57      */

58     public SeedFileIterator(BufferedReader JavaDoc br) {
59         this(br,null);
60     }
61
62     /**
63      * Construct a SeedFileIterator over the input available
64      * from the supplied BufferedReader, reporting any nonblank
65      * noncomment entries which don't generate a valid seed to
66      * the supplied BufferedWriter.
67      *
68      * @param inputReader BufferedReader from which to get seeds
69      * @param ignoredWriter BufferedWriter to report any ignored input
70      */

71     public SeedFileIterator(BufferedReader JavaDoc inputReader, Writer JavaDoc ignoredWriter) {
72         super();
73         inner = new RegexpLineIterator(
74                     new LineReadingIterator(inputReader),
75                     RegexpLineIterator.COMMENT_LINE,
76                     RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
77                     RegexpLineIterator.ENTRY);
78         input = inputReader;
79         ignored = ignoredWriter;
80     }
81     
82     protected UURI transform(String JavaDoc uri) {
83         if(! uri.matches("[a-zA-Z][\\w+\\-]+:.*")) { // Rfc2396 s3.1 scheme,
84
// minus '.'
85
// Does not begin with scheme, so try http://
86
uri = "http://"+uri;
87         }
88         try {
89             // TODO: ignore lines beginning with non-word char
90
return UURIFactory.getInstance(uri);
91         } catch (URIException e) {
92             logger.log(Level.INFO, "line in seed file ignored: "
93                     + e.getMessage(), e);
94             if(ignored!=null) {
95                 try {
96                     ignored.write(uri+"\n");
97                 } catch (IOException JavaDoc e1) {
98                     // TODO Auto-generated catch block
99
e1.printStackTrace();
100                 }
101             }
102             return null;
103         }
104     }
105     
106     
107     /**
108      * Clean-up when hasNext() has returned null: close open files.
109      *
110      * @see org.archive.util.iterator.TransformingIteratorWrapper#noteExhausted()
111      */

112     protected void noteExhausted() {
113         super.noteExhausted();
114         close();
115     }
116     
117     public void close() {
118         try {
119             if(input!=null) {
120                 input.close();
121             }
122             if(ignored!=null) {
123                 ignored.close();
124             }
125         } catch (IOException JavaDoc e) {
126             // TODO Auto-generated catch block
127
e.printStackTrace();
128         }
129     }
130 }
Popular Tags