/* Source: Heritrix web crawler (crawler.archive.org) — org.archive.crawler.extractor.ExtractorXML */
/*
 * ExtractorXML
 *
 * $Id: ExtractorXML.java,v 1.3.4.1 2007/01/13 01:31:16 stack-sf Exp $
 *
 * Created on Sep 27, 2005
 *
 * Copyright (C) 2005 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

26
package org.archive.crawler.extractor;

import java.io.IOException;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.CrawlController;
import org.archive.io.ReplayCharSequence;
import org.archive.util.TextUtils;
39
40 /**
41  * A simple extractor which finds HTTP URIs inside XML/RSS files,
42  * inside attribute values and simple elements (those with only
43  * whitespace + HTTP URI + whitespace as contents)
44  *
45  * @author gojomo
46  *
47  **/

48
49 public class ExtractorXML extends Extractor implements CoreAttributeConstants {
50
51     private static final long serialVersionUID = 3101230586822401584L;
52
53     private static Logger JavaDoc logger =
54         Logger.getLogger(ExtractorXML.class.getName());
55
56     private static String JavaDoc ESCAPED_AMP = "&amp";
57
58     static final String JavaDoc XML_URI_EXTRACTOR =
59     "(?i)[\"\'>]\\s*(http:[^\\s\"\'<>]+)\\s*[\"\'<]";
60     // GROUPS:
61
// (G1) URI
62

63     private long numberOfCURIsHandled = 0;
64     private long numberOfLinksExtracted = 0;
65
66     /**
67      * @param name
68      */

69     public ExtractorXML(String JavaDoc name) {
70         super(name, "XML Extractor. Extracts links from XML/RSS.");
71     }
72
73     /**
74      * @param curi Crawl URI to process.
75      */

76     public void extract(CrawlURI curi) {
77         if (!isHttpTransactionContentToProcess(curi)) {
78             return;
79         }
80         String JavaDoc mimeType = curi.getContentType();
81         if (mimeType == null) {
82             return;
83         }
84         if ((mimeType.toLowerCase().indexOf("xml") < 0)
85                 && (!curi.toString().toLowerCase().endsWith(".rss"))
86                 && (!curi.toString().toLowerCase().endsWith(".xml"))) {
87             return;
88         }
89         this.numberOfCURIsHandled++;
90
91         ReplayCharSequence cs = null;
92         try {
93             cs = curi.getHttpRecorder().getReplayCharSequence();
94         } catch (IOException JavaDoc e) {
95             logger.severe("Failed getting ReplayCharSequence: " + e.getMessage());
96         }
97         if (cs == null) {
98             logger.severe("Failed getting ReplayCharSequence: " +
99                 curi.toString());
100             return;
101         }
102         try {
103             this.numberOfLinksExtracted += processXml(curi, cs,
104                 getController());
105             // Set flag to indicate that link extraction is completed.
106
curi.linkExtractorFinished();
107         } finally {
108             if (cs != null) {
109                 try {
110                     cs.close();
111                 } catch (IOException JavaDoc ioe) {
112                     logger.warning(TextUtils.exceptionToString(
113                             "Failed close of ReplayCharSequence.", ioe));
114                 }
115             }
116         }
117     }
118
119     public static long processXml(CrawlURI curi, CharSequence JavaDoc cs,
120             CrawlController controller) {
121         long foundLinks = 0;
122         Matcher JavaDoc uris = null;
123         String JavaDoc xmlUri;
124         uris = TextUtils.getMatcher(XML_URI_EXTRACTOR, cs);
125         while (uris.find()) {
126             xmlUri = uris.group(1);
127             // TODO: Escape more HTML Entities.
128
xmlUri = TextUtils.replaceAll(ESCAPED_AMP, xmlUri, "&");
129             foundLinks++;
130             try {
131                 // treat as speculative, as whether context really
132
// intends to create a followable/fetchable URI is
133
// unknown
134
curi.createAndAddLink(xmlUri,Link.SPECULATIVE_MISC,
135                         Link.SPECULATIVE_HOP);
136             } catch (URIException e) {
137                 // There may not be a controller (e.g. If we're being run
138
// by the extractor tool).
139
if (controller != null) {
140                     controller.logUriError(e, curi.getUURI(), xmlUri);
141                 } else {
142                     logger.info(curi + ", " + xmlUri + ": " +
143                         e.getMessage());
144                 }
145             }
146         }
147         TextUtils.recycleMatcher(uris);
148         return foundLinks;
149     }
150
151     public String JavaDoc report() {
152         StringBuffer JavaDoc ret = new StringBuffer JavaDoc();
153         ret.append("Processor: org.archive.crawler.extractor.ExtractorXML\n");
154         ret.append(" Function: Link extraction on XML/RSS\n");
155         ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
156         ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");
157
158         return ret.toString();
159     }
160 }
// End of file. (Web-scraper navigation text removed.)