KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > extractor > ExtractorSWF


1 /*
2  * Heritrix
3  *
4  * $Id: ExtractorSWF.java,v 1.22.6.1 2007/01/13 01:31:16 stack-sf Exp $
5  *
6  * Created on March 19, 2004
7  *
8  * Copyright (C) 2003 Internet Archive.
9  *
10  * This file is part of the Heritrix web crawler (crawler.archive.org).
11  *
12  * Heritrix is free software; you can redistribute it and/or modify
13  * it under the terms of the GNU Lesser Public License as published by
14  * the Free Software Foundation; either version 2.1 of the License, or
15  * any later version.
16  *
17  * Heritrix is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20  * GNU Lesser Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser Public License
23  * along with Heritrix; if not, write to the Free Software
24  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25  */

26
27 package org.archive.crawler.extractor;
28
29 import java.io.IOException JavaDoc;
30 import java.io.InputStream JavaDoc;
31 import java.util.logging.Logger JavaDoc;
32
33 import org.archive.crawler.datamodel.CoreAttributeConstants;
34 import org.archive.crawler.datamodel.CrawlURI;
35
36 import com.anotherbigidea.flash.interfaces.SWFTagTypes;
37 import com.anotherbigidea.flash.readers.SWFReader;
38 import com.anotherbigidea.flash.readers.TagParser;
39 import com.anotherbigidea.io.InStream;
40
41 /**
42  * Extracts URIs from SWF (flash/shockwave) files.
43  *
44  * To test, here is a link to an swf that has links
45  * embedded inside of it: http://www.hitspring.com/index.swf.
46  *
47  * @author Igor Ranitovic
48  */

49 public class ExtractorSWF
50 extends Extractor
51 implements CoreAttributeConstants {
52
53     private static final long serialVersionUID = 3627359592408010589L;
54
55     private static Logger JavaDoc logger =
56         Logger.getLogger(ExtractorSWF.class.getName());
57     protected long numberOfCURIsHandled = 0;
58     protected long numberOfLinksExtracted = 0;
59     // TODO: consider if this should be even smaller, because anything
60
// containing URLs wouldn't be this big
61
private static final int MAX_READ_SIZE = 1024 * 1024; // 1MB
62

63     /**
64      * @param name
65      */

66     public ExtractorSWF(String JavaDoc name) {
67         super(name, "Flash extractor. Extracts URIs from SWF " +
68             "(flash/shockwave) files.");
69     }
70
71     protected void extract(CrawlURI curi) {
72         if (!isHttpTransactionContentToProcess(curi)) {
73             return;
74         }
75
76         String JavaDoc contentType = curi.getContentType();
77         if (contentType == null) {
78             return;
79         }
80         if ((contentType.toLowerCase().indexOf("x-shockwave-flash") < 0)
81                 && (!curi.toString().toLowerCase().endsWith(".swf"))) {
82             return;
83         }
84
85         numberOfCURIsHandled++;
86
87         InputStream JavaDoc documentStream = null;
88         // Get the SWF file's content stream.
89
try {
90             documentStream = curi.getHttpRecorder().getRecordedInput().
91                 getContentReplayInputStream();
92             if (documentStream == null) {
93                 return;
94             }
95
96             // Create SWF action that will add discoved URIs to CrawlURI
97
// alist(s).
98
CrawlUriSWFAction curiAction = new CrawlUriSWFAction(curi,
99                     getController());
100             // Overwrite parsing of specific tags that might have URIs.
101
CustomSWFTags customTags = new CustomSWFTags(curiAction);
102             // Get a SWFReader instance.
103
SWFReader reader =
104                 new SWFReader(getTagParser(customTags), documentStream) {
105                 /**
106                  * Override because a corrupt SWF file can cause us to try
107                  * read lengths that are hundreds of megabytes in size
108                  * causing us to OOME.
109                  *
110                  * Below is copied from SWFReader parent class.
111                  */

112                 public int readOneTag() throws IOException JavaDoc {
113                     int header = mIn.readUI16();
114                     int type = header >> 6; //only want the top 10 bits
115
int length = header & 0x3F; //only want the bottom 6 bits
116
boolean longTag = (length == 0x3F);
117                     if(longTag) {
118                         length = (int)mIn.readUI32();
119                     }
120                     // Below test added for Heritrix use.
121
if (length > MAX_READ_SIZE) {
122                         // skip to next, rather than throw IOException ending
123
// processing
124
mIn.skipBytes(length);
125                         logger.info("oversized SWF tag (type=" + type
126                                 + ";length=" + length + ") skipped");
127                     } else {
128                         byte[] contents = mIn.read(length);
129                         mConsumer.tag(type, longTag, contents);
130                     }
131                     return type;
132                 }
133             };
134             
135             reader.readFile();
136             numberOfLinksExtracted += curiAction.getLinkCount();
137         } catch (IOException JavaDoc e) {
138             curi.addLocalizedError(getName(), e, "Fail reading.");
139         } finally {
140             try {
141                 documentStream.close();
142             } catch (IOException JavaDoc e) {
143                 curi.addLocalizedError(getName(), e, "Fail on close.");
144             }
145         }
146
147         // Set flag to indicate that link extraction is completed.
148
curi.linkExtractorFinished();
149         logger.fine(curi + " has " + numberOfLinksExtracted + " links.");
150     }
151     
152     public String JavaDoc report() {
153         StringBuffer JavaDoc ret = new StringBuffer JavaDoc();
154         ret.append("Processor: org.archive.crawler.extractor.ExtractorSWF\n");
155         ret.append(" Function: Link extraction on Shockwave Flash " +
156             "documents (.swf)\n");
157
158         ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
159         ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");
160         return ret.toString();
161     }
162     
163     
164     /**
165      * Get a TagParser
166      *
167      * A custom ExtractorTagParser which ignores all the big binary image/
168      * sound/font types which don't carry URLs is used, to avoid the
169      * occasionally fatal (OutOfMemoryError) memory bloat caused by the
170      * all-in-memory SWF library handling.
171      *
172      * @param customTags A custom tag parser.
173      * @return An SWFReader.
174      */

175     private TagParser getTagParser(CustomSWFTags customTags) {
176         return new ExtractorTagParser(customTags);
177     }
178     
179     /**
180      * TagParser customized to ignore SWFTags that
181      * will never contain extractable URIs.
182      */

183     protected class ExtractorTagParser extends TagParser {
184
185         protected ExtractorTagParser(SWFTagTypes tagtypes) {
186             super(tagtypes);
187         }
188
189         protected void parseDefineBits(InStream in) throws IOException JavaDoc {
190             // DO NOTHING - no URLs to be found in bits
191
}
192
193         protected void parseDefineBitsJPEG3(InStream in) throws IOException JavaDoc {
194             // DO NOTHING - no URLs to be found in bits
195
}
196
197         protected void parseDefineBitsLossless(InStream in, int length, boolean hasAlpha) throws IOException JavaDoc {
198             // DO NOTHING - no URLs to be found in bits
199
}
200
201         protected void parseDefineButtonSound(InStream in) throws IOException JavaDoc {
202             // DO NOTHING - no URLs to be found in sound
203
}
204
205         protected void parseDefineFont(InStream in) throws IOException JavaDoc {
206             // DO NOTHING - no URLs to be found in font
207
}
208
209         protected void parseDefineJPEG2(InStream in, int length) throws IOException JavaDoc {
210             // DO NOTHING - no URLs to be found in jpeg
211
}
212
213         protected void parseDefineJPEGTables(InStream in) throws IOException JavaDoc {
214             // DO NOTHING - no URLs to be found in jpeg
215
}
216
217         protected void parseDefineShape(int type, InStream in) throws IOException JavaDoc {
218             // DO NOTHING - no URLs to be found in shape
219
}
220
221         protected void parseDefineSound(InStream in) throws IOException JavaDoc {
222             // DO NOTHING - no URLs to be found in sound
223
}
224
225         protected void parseFontInfo(InStream in, int length, boolean isFI2) throws IOException JavaDoc {
226             // DO NOTHING - no URLs to be found in font info
227
}
228
229         protected void parseDefineFont2(InStream in) throws IOException JavaDoc {
230             // DO NOTHING - no URLs to be found in bits
231
}
232     }
233 }
234
Popular Tags