

/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Created on Jul 7, 2003
 *
 */

package org.archive.crawler.extractor;

import java.io.IOException;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.io.ReplayInputStream;
import org.archive.io.SeekReader;
import org.archive.io.SeekReaderCharSequence;
import org.archive.util.ms.Doc;

/**
 * This class allows the caller to extract href-style links from
 * Word97-format Word documents.
 *
 * @author Parker Thompson
 *
 */

public class ExtractorDOC extends Extractor implements CoreAttributeConstants {

    private static final long serialVersionUID = 1896822554981116303L;

    private static Pattern PATTERN = Pattern.compile("HYPERLINK.*?\"(.*?)\"");

    private static Logger logger =
        Logger.getLogger("org.archive.crawler.extractor.ExtractorDOC");
    private long numberOfCURIsHandled = 0;
    private long numberOfLinksExtracted = 0;

    /**
     * @param name Extractor name.
     */
    public ExtractorDOC(String name) {
        super(name, "MS-Word document Extractor. Extracts links from MS-Word" +
                " '.doc' documents.");
    }

    /**
     * Processes a Word document and extracts any hyperlinks from it.
     * This only extracts href-style links, and does not examine the actual
     * text for valid URIs.
     * @param curi CrawlURI to process.
     */
    protected void extract(CrawlURI curi) {
        // Assumes docs will be coming in through http.
        // TODO: make this more general (currently we're only fetching via
        // http so it doesn't matter).
        if (!isHttpTransactionContentToProcess(curi) ||
                !isExpectedMimeType(curi.getContentType(),
                    "application/msword")) {
            return;
        }

        int links = 0;
        ReplayInputStream documentStream = null;
        SeekReader docReader = null;

        numberOfCURIsHandled++;

        // Get the doc as a repositionable reader.
        try {
            documentStream = curi.getHttpRecorder().getRecordedInput().
                getContentReplayInputStream();

            if (documentStream == null) {
                // TODO: note problem
                return;
            }

            docReader = Doc.getText(documentStream);
        } catch (Exception e) {
            curi.addLocalizedError(getName(), e, "ExtractorDOC Exception");
            return;
        } finally {
            try {
                // documentStream may still be null if the recorder lookup
                // above threw before assignment.
                if (documentStream != null) {
                    documentStream.close();
                }
            } catch (IOException ignored) {
            }
        }

        // Scan the extracted text for HYPERLINK fields; group(1) holds the
        // quoted link target.
        CharSequence cs = new SeekReaderCharSequence(docReader, 0);
        Matcher m = PATTERN.matcher(cs);
        while (m.find()) {
            links++;
            addLink(curi, m.group(1));
        }

        curi.linkExtractorFinished(); // Set flag to indicate that link extraction is completed.
        logger.fine(curi + " has " + links + " links.");
    }

    private void addLink(CrawlURI curi, String hyperlink) {
        try {
            curi.createAndAddLink(hyperlink, Link.NAVLINK_MISC, Link.NAVLINK_HOP);
        } catch (URIException e1) {
            if (getController() != null) {
                // Controller can be null: e.g. when running
                // ExtractorTool.
                getController().logUriError(e1, curi.getUURI(), hyperlink);
            } else {
                logger.info(curi + ", " + hyperlink + ": "
                        + e1.getMessage());
            }
        }
        numberOfLinksExtracted++;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Processor#report()
     */
    public String report() {
        StringBuffer ret = new StringBuffer();
        ret.append("Processor: org.archive.crawler.extractor.ExtractorDOC\n");
        ret.append(" Function: Link extraction on MS Word documents (.doc)\n");
        ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
        ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");

        return ret.toString();
    }
}
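
Below is a minimal, standalone sketch (not part of Heritrix) of the core extraction step above: the same HYPERLINK-field regular expression used for ExtractorDOC's PATTERN, run against a plain String standing in for the text that Doc.getText() and SeekReaderCharSequence would yield from a real .doc response. The class name HyperlinkRegexDemo and the sample text are hypothetical, for illustration only.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class HyperlinkRegexDemo {

    // Same expression as ExtractorDOC's PATTERN: match a HYPERLINK field and
    // capture the quoted link target in group(1).
    private static final Pattern PATTERN =
            Pattern.compile("HYPERLINK.*?\"(.*?)\"");

    public static void main(String[] args) {
        // Stand-in for the text extracted from a Word97 binary; a real run
        // would obtain this via org.archive.util.ms.Doc.getText(...).
        String sample = "Intro text HYPERLINK \"http://crawler.archive.org/\" "
                + "more text HYPERLINK \"mailto:info@example.org\" end";

        Matcher m = PATTERN.matcher(sample);
        int links = 0;
        while (m.find()) {
            links++;
            System.out.println("link " + links + ": " + m.group(1));
        }
        // Expected output:
        // link 1: http://crawler.archive.org/
        // link 2: mailto:info@example.org
    }
}

Because the pattern only looks for HYPERLINK fields, URIs that appear solely in the document's visible text are not reported, which matches the behavior described in the extract() javadoc above.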