package org.archive.crawler.extractor;

import java.io.IOException;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.io.ReplayInputStream;
import org.archive.io.SeekReader;
import org.archive.io.SeekReaderCharSequence;
import org.archive.util.ms.Doc;

/**
 * Extracts links from MS-Word '.doc' documents.
 *
 * <p>The text of the recorded response body is obtained through
 * {@link Doc#getText} and scanned with {@link #PATTERN}; every match is
 * added to the {@link CrawlURI} as an outlink.
 */
public class ExtractorDOC extends Extractor implements CoreAttributeConstants {

    private static final long serialVersionUID = 1896822554981116303L;

    /**
     * Matches the literal text {@code HYPERLINK} followed (reluctantly) by
     * the first double-quoted string; group 1 is the text between the quotes.
     */
    private static final Pattern PATTERN =
        Pattern.compile("HYPERLINK.*?\"(.*?)\"");

    private static final Logger logger =
        Logger.getLogger("org.archive.crawler.extractor.ExtractorDOC");

    /** Number of CrawlURIs that passed the initial checks in {@link #extract}. */
    private long numberOfCURIsHandled = 0;

    /** Number of times {@link #addLink} was invoked (see note on that method). */
    private long numberOfLinksExtracted = 0;

    /**
     * @param name Name of this processor, passed through to the superclass.
     */
    public ExtractorDOC(String name) {
        super(name, "MS-Word document Extractor. Extracts links from MS-Word" +
            " '.doc' documents.");
    }

    /**
     * Processes a word document and extracts any hyperlinks from it.
     *
     * <p>Does nothing unless {@code isHttpTransactionContentToProcess(curi)}
     * holds and the content type is accepted by {@code isExpectedMimeType}
     * for {@code "application/msword"}. Any exception raised while obtaining
     * the document text is recorded on the URI as a localized error and
     * extraction is abandoned.
     *
     * @param curi The CrawlURI whose recorded content is to be processed.
     */
    protected void extract(CrawlURI curi) {
        if (!isHttpTransactionContentToProcess(curi) ||
                !isExpectedMimeType(curi.getContentType(),
                    "application/msword")) {
            return;
        }

        int links = 0;
        ReplayInputStream documentStream = null;
        SeekReader docReader = null;

        numberOfCURIsHandled++;

        try {
            documentStream = curi.getHttpRecorder().getRecordedInput().
                getContentReplayInputStream();

            if (documentStream == null) {
                // Nothing recorded to read from.
                return;
            }

            docReader = Doc.getText(documentStream);
        } catch (Exception e) {
            curi.addLocalizedError(getName(), e, "ExtractorDOC Exception");
            return;
        } finally {
            // The stream is still null if it was never obtained (null result
            // or an exception before assignment). Guard the close so the
            // early returns above do not end in a NullPointerException that
            // would also mask the error just recorded.
            if (documentStream != null) {
                try {
                    documentStream.close();
                } catch (IOException ignored) {
                    // Best-effort close; there is nothing useful to do here.
                }
            }
        }

        // NOTE(review): documentStream is closed before docReader is consumed
        // below. This ordering is preserved from the original -- confirm that
        // the reader returned by Doc.getText does not depend on the stream
        // remaining open.
        CharSequence cs = new SeekReaderCharSequence(docReader, 0);
        Matcher m = PATTERN.matcher(cs);
        while (m.find()) {
            links++;
            addLink(curi, m.group(1));
        }

        curi.linkExtractorFinished();
        logger.fine(curi + " has " + links + " links.");
    }

    /**
     * Adds a discovered hyperlink to the given CrawlURI as a
     * {@code NAVLINK_MISC} / {@code NAVLINK_HOP} link.
     *
     * <p>If the hyperlink cannot be turned into a URI, the failure is
     * reported through the controller when one is available, otherwise
     * through this class's logger.
     *
     * <p>NOTE(review): the extracted-links counter is incremented even when
     * the link was rejected with a {@link URIException}; this matches the
     * original behavior -- confirm whether failed links should be counted.
     *
     * @param curi      The CrawlURI the link was found in.
     * @param hyperlink The raw link text captured from the document.
     */
    private void addLink(CrawlURI curi, String hyperlink) {
        try {
            curi.createAndAddLink(hyperlink, Link.NAVLINK_MISC, Link.NAVLINK_HOP);
        } catch (URIException e1) {
            // The controller may be absent, so it must be checked before use;
            // log exactly once through whichever channel is available.
            if (getController() != null) {
                getController().logUriError(e1, curi.getUURI(), hyperlink);
            } else {
                logger.info(curi + ", " + hyperlink + ": "
                    + e1.getMessage());
            }
        }
        numberOfLinksExtracted++;
    }

    /**
     * Builds a short plain-text report for this processor.
     *
     * @return A multi-line string naming the processor and giving the number
     *         of CrawlURIs handled and links extracted so far.
     */
    public String report() {
        StringBuffer ret = new StringBuffer();
        ret.append("Processor: org.archive.crawler.extractor.ExtractorDOC\n");
        ret.append("  Function:          Link extraction on MS Word documents (.doc)\n");
        ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
        ret.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");

        return ret.toString();
    }
}