1 26 27 package org.archive.crawler.extractor; 28 29 import java.io.IOException ; 30 import java.io.InputStream ; 31 import java.util.logging.Logger ; 32 33 import org.archive.crawler.datamodel.CoreAttributeConstants; 34 import org.archive.crawler.datamodel.CrawlURI; 35 36 import com.anotherbigidea.flash.interfaces.SWFTagTypes; 37 import com.anotherbigidea.flash.readers.SWFReader; 38 import com.anotherbigidea.flash.readers.TagParser; 39 import com.anotherbigidea.io.InStream; 40 41 49 public class ExtractorSWF 50 extends Extractor 51 implements CoreAttributeConstants { 52 53 private static final long serialVersionUID = 3627359592408010589L; 54 55 private static Logger logger = 56 Logger.getLogger(ExtractorSWF.class.getName()); 57 protected long numberOfCURIsHandled = 0; 58 protected long numberOfLinksExtracted = 0; 59 private static final int MAX_READ_SIZE = 1024 * 1024; 63 66 public ExtractorSWF(String name) { 67 super(name, "Flash extractor. Extracts URIs from SWF " + 68 "(flash/shockwave) files."); 69 } 70 71 protected void extract(CrawlURI curi) { 72 if (!isHttpTransactionContentToProcess(curi)) { 73 return; 74 } 75 76 String contentType = curi.getContentType(); 77 if (contentType == null) { 78 return; 79 } 80 if ((contentType.toLowerCase().indexOf("x-shockwave-flash") < 0) 81 && (!curi.toString().toLowerCase().endsWith(".swf"))) { 82 return; 83 } 84 85 numberOfCURIsHandled++; 86 87 InputStream documentStream = null; 88 try { 90 documentStream = curi.getHttpRecorder().getRecordedInput(). 91 getContentReplayInputStream(); 92 if (documentStream == null) { 93 return; 94 } 95 96 CrawlUriSWFAction curiAction = new CrawlUriSWFAction(curi, 99 getController()); 100 CustomSWFTags customTags = new CustomSWFTags(curiAction); 102 SWFReader reader = 104 new SWFReader(getTagParser(customTags), documentStream) { 105 112 public int readOneTag() throws IOException { 113 int header = mIn.readUI16(); 114 int type = header >> 6; int length = header & 0x3F; boolean longTag = (length == 0x3F); 117 if(longTag) { 118 length = (int)mIn.readUI32(); 119 } 120 if (length > MAX_READ_SIZE) { 122 mIn.skipBytes(length); 125 logger.info("oversized SWF tag (type=" + type 126 + ";length=" + length + ") skipped"); 127 } else { 128 byte[] contents = mIn.read(length); 129 mConsumer.tag(type, longTag, contents); 130 } 131 return type; 132 } 133 }; 134 135 reader.readFile(); 136 numberOfLinksExtracted += curiAction.getLinkCount(); 137 } catch (IOException e) { 138 curi.addLocalizedError(getName(), e, "Fail reading."); 139 } finally { 140 try { 141 documentStream.close(); 142 } catch (IOException e) { 143 curi.addLocalizedError(getName(), e, "Fail on close."); 144 } 145 } 146 147 curi.linkExtractorFinished(); 149 logger.fine(curi + " has " + numberOfLinksExtracted + " links."); 150 } 151 152 public String report() { 153 StringBuffer ret = new StringBuffer (); 154 ret.append("Processor: org.archive.crawler.extractor.ExtractorSWF\n"); 155 ret.append(" Function: Link extraction on Shockwave Flash " + 156 "documents (.swf)\n"); 157 158 ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n"); 159 ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n"); 160 return ret.toString(); 161 } 162 163 164 175 private TagParser getTagParser(CustomSWFTags customTags) { 176 return new ExtractorTagParser(customTags); 177 } 178 179 183 protected class ExtractorTagParser extends TagParser { 184 185 protected ExtractorTagParser(SWFTagTypes tagtypes) { 186 super(tagtypes); 187 } 188 189 protected void parseDefineBits(InStream in) throws IOException { 190 } 192 193 protected void parseDefineBitsJPEG3(InStream in) throws IOException { 194 } 196 197 protected void parseDefineBitsLossless(InStream in, int length, boolean hasAlpha) throws IOException { 198 } 200 201 protected void parseDefineButtonSound(InStream in) throws IOException { 202 } 204 205 protected void parseDefineFont(InStream in) throws IOException { 206 } 208 209 protected void parseDefineJPEG2(InStream in, int length) throws IOException { 210 } 212 213 protected void parseDefineJPEGTables(InStream in) throws IOException { 214 } 216 217 protected void parseDefineShape(int type, InStream in) throws IOException { 218 } 220 221 protected void parseDefineSound(InStream in) throws IOException { 222 } 224 225 protected void parseFontInfo(InStream in, int length, boolean isFI2) throws IOException { 226 } 228 229 protected void parseDefineFont2(InStream in) throws IOException { 230 } 232 } 233 } 234 | Popular Tags |