package org.archive.extractor;

import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;

import org.archive.crawler.extractor.Link;
import org.archive.net.UURI;

/**
 * Skeleton for link extractors that operate on a {@link CharSequence}.
 *
 * <p>The class keeps the source and base URIs, the content being scanned and
 * a queue of links that have been found but not yet handed out. Subclasses
 * supply the actual scanning logic through {@link #findNextLink()}.
 *
 * <p>Typical use: call one of the {@code setup} methods, iterate with
 * {@link #hasNext()} / {@link #nextLink()}, then call {@link #reset()}.
 */
public abstract class CharSequenceLinkExtractor implements LinkExtractor {

    /** URI of the document the links are extracted from. */
    protected UURI source;

    /** Base URI stored alongside the source; set equal to it by the single-URI setups. */
    protected UURI base;

    /** Listener supplied at setup time; this class only stores it. */
    protected ExtractErrorListener extractErrorListener;

    /** Content to scan; {@code null} before setup and after {@link #reset()}. */
    protected CharSequence sourceContent;

    /** Links found but not yet returned; {@code null} until a setup method has run. */
    protected LinkedList<Link> next;

    /**
     * Prepares the extractor to read content from a stream.
     *
     * <p>The stream is converted with
     * {@link #charSequenceFrom(InputStream, Charset)}; note that the default
     * conversion yields {@code null} for streams that are not
     * {@code CharSequenceProvider}s.
     *
     * @param source URI of the document
     * @param base base URI to record
     * @param content stream holding the document
     * @param charset character set of the stream
     * @param listener error listener to record
     */
    public void setup(UURI source, UURI base, InputStream content,
            Charset charset, ExtractErrorListener listener) {
        setup(source, base, charSequenceFrom(content, charset), listener);
    }

    /**
     * Prepares the extractor to scan the given character content and starts
     * with an empty queue of pending links.
     *
     * @param source URI of the document
     * @param base base URI to record
     * @param content characters to scan
     * @param listener error listener to record
     */
    public void setup(UURI source, UURI base, CharSequence content,
            ExtractErrorListener listener) {
        this.source = source;
        this.base = base;
        this.extractErrorListener = listener;
        this.sourceContent = content;
        this.next = new LinkedList<Link>();
    }

    /**
     * Convenience setup for the case where one URI serves as both source and
     * base.
     *
     * @param sourceandbase URI used for both roles
     * @param content characters to scan
     * @param listener error listener to record
     */
    public void setup(UURI sourceandbase, CharSequence content,
            ExtractErrorListener listener) {
        setup(sourceandbase, sourceandbase, content, listener);
    }

    /**
     * Convenience stream-based setup for the case where one URI serves as
     * both source and base.
     *
     * @param sourceandbase URI used for both roles
     * @param content stream holding the document
     * @param charset character set of the stream
     * @param listener error listener to record
     */
    public void setup(UURI sourceandbase, InputStream content, Charset charset,
            ExtractErrorListener listener) {
        setup(sourceandbase, sourceandbase, content, charset, listener);
    }

    /**
     * Returns the next pending link, removing it from the queue.
     *
     * @return the next link
     * @throws NoSuchElementException if {@link #hasNext()} reports no link
     */
    public Link nextLink() {
        if (!hasNext()) {
            throw new NoSuchElementException();
        }
        return next.removeFirst();
    }

    /**
     * Drops the references to the current document and discards any links
     * that were found but never retrieved, so they cannot be returned once
     * the document they belong to has been released.
     */
    public void reset() {
        base = null;
        source = null;
        sourceContent = null;
        if (next != null) {
            // Queued links belong to the document just released.
            next.clear();
        }
    }

    /**
     * Reports whether a link is available, asking
     * {@link #findNextLink()} for more when the queue is empty.
     *
     * @return {@code true} if {@link #nextLink()} can return a link;
     *         {@code false} if none is available or no setup method has been
     *         called yet
     */
    public boolean hasNext() {
        if (next == null) {
            // Not set up: there is nothing to scan and no queue to fill.
            return false;
        }
        if (!next.isEmpty()) {
            return true;
        }
        return findNextLink();
    }

    /**
     * Scans for further links.
     *
     * <p>Because {@link #nextLink()} removes the head of {@link #next} right
     * after this method returns {@code true}, an implementation must have
     * appended at least one link to {@link #next} whenever it returns
     * {@code true}.
     *
     * @return {@code true} if at least one link is now queued,
     *         {@code false} if no more links can be found
     */
    protected abstract boolean findNextLink();

    /**
     * Untyped iterator-style accessor; delegates to {@link #nextLink()}.
     *
     * @return the next link
     * @throws NoSuchElementException if no link is available
     */
    public Object next() {
        return nextLink();
    }

    /**
     * Removal is not supported.
     *
     * @throws UnsupportedOperationException always
     */
    public void remove() {
        throw new UnsupportedOperationException();
    }

    /**
     * Obtains a character view of the stream. A stream that is itself a
     * {@code CharSequenceProvider} supplies its own sequence; any other
     * stream is passed to
     * {@link #createCharSequenceFrom(InputStream, Charset)}.
     *
     * @param content stream holding the document
     * @param charset character set of the stream
     * @return the character content, possibly {@code null}
     */
    protected CharSequence charSequenceFrom(InputStream content, Charset charset) {
        if (content instanceof CharSequenceProvider) {
            return ((CharSequenceProvider) content).getCharSequence();
        }
        return createCharSequenceFrom(content, charset);
    }

    /**
     * Hook for decoding an arbitrary stream into characters.
     *
     * <p>This base implementation is a stub: it ignores both arguments and
     * returns {@code null}. NOTE(review): subclasses that need stream input
     * presumably have to override this; confirm against callers.
     *
     * @param content stream holding the document
     * @param charset character set of the stream
     * @return always {@code null} in this implementation
     */
    protected CharSequence createCharSequenceFrom(InputStream content, Charset charset) {
        return null;
    }

    /**
     * Extracts every link from {@code content} into {@code collector} using
     * the extractor returned by {@link #newDefaultInstance()}.
     *
     * <p>Static methods are bound at compile time, so the call below always
     * reaches the {@code newDefaultInstance()} declared in this class, which
     * returns {@code null}. A subclass that wants this convenience must
     * therefore declare its own {@code extract} and
     * {@code newDefaultInstance}.
     *
     * @param content characters to scan
     * @param source URI of the document
     * @param base base URI to record
     * @param collector list that receives the extracted links
     * @param extractErrorListener error listener handed to the extractor
     * @throws NullPointerException if no default extractor instance is
     *         available
     */
    public static void extract(CharSequence content, UURI source, UURI base,
            List<Link> collector, ExtractErrorListener extractErrorListener) {
        CharSequenceLinkExtractor extractor = newDefaultInstance();
        if (extractor == null) {
            // Same exception type as the bare dereference used to raise,
            // but with a message that explains the cause.
            throw new NullPointerException(
                    "newDefaultInstance() returned null; no default extractor is available");
        }
        extractor.setup(source, base, content, extractErrorListener);
        try {
            while (extractor.hasNext()) {
                collector.add(extractor.nextLink());
            }
        } finally {
            // Release the extractor's state even if extraction fails midway.
            extractor.reset();
        }
    }

    /**
     * Supplies the extractor used by
     * {@link #extract(CharSequence, UURI, UURI, List, ExtractErrorListener)}.
     *
     * @return always {@code null} in this class
     */
    protected static CharSequenceLinkExtractor newDefaultInstance() {
        return null;
    }
}