1 2 3 4 package net.nutch.parse.ext; 5 6 import net.nutch.protocol.Content; 7 import net.nutch.parse.Parser; 8 import net.nutch.parse.Parse; 9 import net.nutch.parse.ParseData; 10 import net.nutch.parse.ParseImpl; 11 import net.nutch.parse.Outlink; 12 import net.nutch.parse.ParseException; 13 14 import net.nutch.util.LogFormatter; 15 import net.nutch.util.NutchConf; 16 import net.nutch.util.CommandRunner; 17 18 import net.nutch.plugin.Extension; 19 import net.nutch.plugin.PluginRepository; 20 21 import java.util.Hashtable ; 22 import java.util.Properties ; 23 import java.util.logging.Level ; 24 import java.util.logging.Logger ; 25 26 import java.io.ByteArrayInputStream ; 27 import java.io.ByteArrayOutputStream ; 28 29 34 35 public class ExtParser implements Parser { 36 public static final Logger LOG = 37 LogFormatter.getLogger("net.nutch.parse.ext"); 38 39 static final int BUFFER_SIZE = 4096; 40 41 static final int TIMEOUT_DEFAULT = 30; 43 static Hashtable TYPE_PARAMS_MAP = new Hashtable (); 45 46 static { 48 Extension[] extensions = PluginRepository.getInstance() 49 .getExtensionPoint("net.nutch.parse.Parser").getExtentens(); 50 51 String contentType, command, timeoutString; 52 53 for (int i = 0; i < extensions.length; i++) { 54 Extension extension = extensions[i]; 55 56 if (!extension.getDiscriptor().getPluginId().equals("parse-ext")) 58 continue; 59 60 contentType = extension.getAttribute("contentType"); 61 if (contentType == null || contentType.equals("")) 62 continue; 63 64 command = extension.getAttribute("command"); 65 if (command == null || command.equals("")) 66 continue; 67 68 timeoutString = extension.getAttribute("timeout"); 69 if (timeoutString == null || timeoutString.equals("")) 70 timeoutString = "" + TIMEOUT_DEFAULT; 71 72 TYPE_PARAMS_MAP.put(contentType, new String []{command, timeoutString}); 73 } 74 } 75 76 public ExtParser () {} 77 78 public Parse getParse(Content content) throws ParseException { 79 80 String contentType = content.getContentType(); 81 82 String [] params = (String []) TYPE_PARAMS_MAP.get(contentType); 83 if (params == null) 84 throw new ParseException( 85 "No external command defined for contentType: " + contentType); 86 87 String command = params[0]; 88 int timeout = Integer.parseInt(params[1]); 89 90 if (LOG.isLoggable(Level.FINE)) 91 LOG.fine("Use "+command+ " with timeout="+timeout+"secs"); 92 93 String text = null; 94 String title = null; 95 96 try { 97 98 byte[] raw = content.getContent(); 99 100 String contentLength = 101 (String )content.getMetadata().get("Content-Length"); 102 if (contentLength != null 103 && raw.length != Integer.parseInt(contentLength)) { 104 throw new ParseException("Content truncated at "+raw.length 105 +" bytes. Parser can't handle incomplete "+contentType+" file."); 106 } 107 108 ByteArrayOutputStream os = new ByteArrayOutputStream (BUFFER_SIZE); 109 ByteArrayOutputStream es = new ByteArrayOutputStream (BUFFER_SIZE/4); 110 111 CommandRunner cr = new CommandRunner(); 112 113 cr.setCommand(command+ " " +contentType); 114 cr.setInputStream(new ByteArrayInputStream (raw)); 115 cr.setStdOutputStream(os); 116 cr.setStdErrorStream(es); 117 118 cr.setTimeout(timeout); 119 120 cr.evaluate(); 121 122 if (cr.getExitValue() != 0) 123 throw new ParseException("External command "+command 124 +" failed with error: "+es.toString()); 125 126 text = os.toString(); 127 128 } catch (ParseException e) { 129 throw e; 130 } catch (Exception e) { throw new ParseException("ExtParser failed. "+e); 132 } 133 134 if (text == null) 135 text = ""; 136 137 if (title == null) 138 title = ""; 139 140 Outlink[] outlinks = new Outlink[0]; 142 143 Properties metaData = new Properties (); 145 metaData.putAll(content.getMetadata()); 147 ParseData parseData = new ParseData(title, outlinks, metaData); 148 return new ParseImpl(text, parseData); 149 } 150 151 } 152 | Popular Tags |