KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > extractor > CharSequenceLinkExtractor


/* CharSequenceLinkExtractor
 *
 * $Id: CharSequenceLinkExtractor.java,v 1.4.12.1 2007/01/13 01:31:31 stack-sf Exp $
 *
 * Created on Mar 17, 2005
 *
 * Copyright (C) 2005 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

25 package org.archive.extractor;
26
27 import java.io.InputStream JavaDoc;
28 import java.nio.charset.Charset JavaDoc;
29 import java.util.LinkedList JavaDoc;
30 import java.util.List JavaDoc;
31 import java.util.NoSuchElementException JavaDoc;
32
33 import org.archive.crawler.extractor.Link;
34 import org.archive.net.UURI;
35
36 /**
37  * Abstract superclass providing utility methods for LinkExtractors which
38  * would prefer to work on a CharSequence rather than a stream.
39  *
40  * ROUGH DRAFT IN PROGRESS / incomplete... untested...
41  *
42  * @author gojomo
43  */

44 public abstract class CharSequenceLinkExtractor implements LinkExtractor {
45
46     protected UURI source;
47     protected UURI base;
48     protected ExtractErrorListener extractErrorListener;
49
50     protected CharSequence JavaDoc sourceContent;
51     protected LinkedList JavaDoc<Link> next;
52
53     public void setup(UURI source, UURI base, InputStream JavaDoc content,
54             Charset JavaDoc charset, ExtractErrorListener listener) {
55         setup(source, base, charSequenceFrom(content,charset), listener);
56     }
57
58     /**
59      * @param source
60      * @param base
61      * @param content
62      * @param listener
63      */

64     public void setup(UURI source, UURI base, CharSequence JavaDoc content,
65             ExtractErrorListener listener) {
66         this.source = source;
67         this.base = base;
68         this.extractErrorListener = listener;
69         this.sourceContent = content;
70         this.next = new LinkedList JavaDoc<Link>();
71     }
72
73
74     /**
75      * Convenience method for when source and base are same.
76      *
77      * @param sourceandbase
78      * @param content
79      * @param listener
80      */

81     public void setup(UURI sourceandbase, CharSequence JavaDoc content,
82             ExtractErrorListener listener) {
83         setup(sourceandbase, sourceandbase, content, listener);
84     }
85
86     /* (non-Javadoc)
87      * @see org.archive.extractor.LinkExtractor#setup(org.archive.crawler.datamodel.UURI, java.io.InputStream, java.nio.charset.Charset)
88      */

89     public void setup(UURI sourceandbase, InputStream JavaDoc content, Charset JavaDoc charset,
90             ExtractErrorListener listener) {
91         setup(sourceandbase,sourceandbase,content,charset,listener);
92     }
93
94     /* (non-Javadoc)
95      * @see org.archive.extractor.LinkExtractor#nextLink()
96      */

97     public Link nextLink() {
98         if(!hasNext()) {
99             throw new NoSuchElementException JavaDoc();
100         }
101         // next will have been filled with at least one item
102
return (Link) next.removeFirst();
103     }
104
105     /**
106      * Discard all state. Another setup() is required to use again.
107      */

108     public void reset() {
109         base = null;
110         source = null;
111         sourceContent = null; // TODO: discard other resources
112
}
113
114     /* (non-Javadoc)
115      * @see java.util.Iterator#hasNext()
116      */

117     public boolean hasNext() {
118         if (!next.isEmpty()) {
119             return true;
120         }
121         return findNextLink();
122     }
123
124     /**
125      * Scan to the next link(s), if any, loading it into the next buffer.
126      *
127      * @return true if any links are found/available, false otherwise
128      */

129     abstract protected boolean findNextLink();
130
131     /* (non-Javadoc)
132      * @see java.util.Iterator#next()
133      */

134     public Object JavaDoc next() {
135         return nextLink();
136     }
137
138     /* (non-Javadoc)
139      * @see java.util.Iterator#remove()
140      */

141     public void remove() {
142         throw new UnsupportedOperationException JavaDoc();
143     }
144
145     /**
146      * @param content
147      * @param charset
148      * @return CharSequence obtained from stream in given charset
149      */

150     protected CharSequence JavaDoc charSequenceFrom(InputStream JavaDoc content, Charset JavaDoc charset) {
151         // See if content InputStream can provide
152
if(content instanceof CharSequenceProvider) {
153             return ((CharSequenceProvider)content).getCharSequence();
154         }
155         // otherwise, create one
156
return createCharSequenceFrom(content, charset);
157     }
158
159     /**
160      * @param content
161      * @param charset
162      * @return CharSequence built over given stream in given charset
163      */

164     protected CharSequence JavaDoc createCharSequenceFrom(InputStream JavaDoc content, Charset JavaDoc charset) {
165         // TODO: implement
166
return null;
167         // TODO: consider cleanup in reset()
168
}
169
170     /**
171      * Convenience method to do default extraction.
172      *
173      * @param content
174      * @param source
175      * @param base
176      * @param collector
177      * @param extractErrorListener
178      */

179     public static void extract(CharSequence JavaDoc content, UURI source, UURI base,
180             List JavaDoc<Link> collector, ExtractErrorListener extractErrorListener) {
181         // TODO: arrange for inheritance of prefs... eg when HTML includes JS
182
// includes HTML, have inner HTML follow robots, etc from outer
183
CharSequenceLinkExtractor extractor = newDefaultInstance();
184         extractor.setup(source, base, content, extractErrorListener);
185         while (extractor.hasNext()) {
186             collector.add(extractor.nextLink());
187         }
188         extractor.reset();
189     }
190
191     protected static CharSequenceLinkExtractor newDefaultInstance() {
192         // override in subclasses
193
return null;
194     }
195 }
196
Popular Tags