KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > parse > ext > ExtParser


1 /* Copyright (c) 2004 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.parse.ext;
5
6 import net.nutch.protocol.Content;
7 import net.nutch.parse.Parser;
8 import net.nutch.parse.Parse;
9 import net.nutch.parse.ParseData;
10 import net.nutch.parse.ParseImpl;
11 import net.nutch.parse.Outlink;
12 import net.nutch.parse.ParseException;
13
14 import net.nutch.util.LogFormatter;
15 import net.nutch.util.NutchConf;
16 import net.nutch.util.CommandRunner;
17
18 import net.nutch.plugin.Extension;
19 import net.nutch.plugin.PluginRepository;
20
21 import java.util.Hashtable JavaDoc;
22 import java.util.Properties JavaDoc;
23 import java.util.logging.Level JavaDoc;
24 import java.util.logging.Logger JavaDoc;
25
26 import java.io.ByteArrayInputStream JavaDoc;
27 import java.io.ByteArrayOutputStream JavaDoc;
28
29 /**
30  * A wrapper that invokes external command to do real parsing job.
31  *
32  * @author John Xing
33  */

34
35 public class ExtParser implements Parser {
36   public static final Logger JavaDoc LOG =
37     LogFormatter.getLogger("net.nutch.parse.ext");
38
39   static final int BUFFER_SIZE = 4096;
40
41   static final int TIMEOUT_DEFAULT = 30; // in seconds
42

43   // handy map from String contentType to String[] {command, timeoutString}
44
static Hashtable JavaDoc TYPE_PARAMS_MAP = new Hashtable JavaDoc();
45
46   // set TYPE_PARAMS_MAP using plugin.xml of this plugin
47
static {
48     Extension[] extensions = PluginRepository.getInstance()
49       .getExtensionPoint("net.nutch.parse.Parser").getExtentens();
50
51     String JavaDoc contentType, command, timeoutString;
52
53     for (int i = 0; i < extensions.length; i++) {
54       Extension extension = extensions[i];
55
56       // only look for extensions defined by plugin parse-ext
57
if (!extension.getDiscriptor().getPluginId().equals("parse-ext"))
58         continue;
59
60       contentType = extension.getAttribute("contentType");
61       if (contentType == null || contentType.equals(""))
62         continue;
63
64       command = extension.getAttribute("command");
65       if (command == null || command.equals(""))
66         continue;
67
68       timeoutString = extension.getAttribute("timeout");
69       if (timeoutString == null || timeoutString.equals(""))
70         timeoutString = "" + TIMEOUT_DEFAULT;
71
72       TYPE_PARAMS_MAP.put(contentType, new String JavaDoc[]{command, timeoutString});
73     }
74   }
75
76   public ExtParser () {}
77
78   public Parse getParse(Content content) throws ParseException {
79
80     String JavaDoc contentType = content.getContentType();
81
82     String JavaDoc[] params = (String JavaDoc[]) TYPE_PARAMS_MAP.get(contentType);
83     if (params == null)
84       throw new ParseException(
85         "No external command defined for contentType: " + contentType);
86
87     String JavaDoc command = params[0];
88     int timeout = Integer.parseInt(params[1]);
89
90     if (LOG.isLoggable(Level.FINE))
91       LOG.fine("Use "+command+ " with timeout="+timeout+"secs");
92
93     String JavaDoc text = null;
94     String JavaDoc title = null;
95
96     try {
97
98       byte[] raw = content.getContent();
99
100       String JavaDoc contentLength =
101         (String JavaDoc)content.getMetadata().get("Content-Length");
102       if (contentLength != null
103             && raw.length != Integer.parseInt(contentLength)) {
104           throw new ParseException("Content truncated at "+raw.length
105             +" bytes. Parser can't handle incomplete "+contentType+" file.");
106       }
107
108       ByteArrayOutputStream JavaDoc os = new ByteArrayOutputStream JavaDoc(BUFFER_SIZE);
109       ByteArrayOutputStream JavaDoc es = new ByteArrayOutputStream JavaDoc(BUFFER_SIZE/4);
110
111       CommandRunner cr = new CommandRunner();
112
113       cr.setCommand(command+ " " +contentType);
114       cr.setInputStream(new ByteArrayInputStream JavaDoc(raw));
115       cr.setStdOutputStream(os);
116       cr.setStdErrorStream(es);
117
118       cr.setTimeout(timeout);
119
120       cr.evaluate();
121
122       if (cr.getExitValue() != 0)
123         throw new ParseException("External command "+command
124           +" failed with error: "+es.toString());
125
126       text = os.toString();
127
128     } catch (ParseException e) {
129       throw e;
130     } catch (Exception JavaDoc e) { // run time exception
131
throw new ParseException("ExtParser failed. "+e);
132     }
133
134     if (text == null)
135       text = "";
136
137     if (title == null)
138       title = "";
139
140     // collect outlink
141
Outlink[] outlinks = new Outlink[0];
142
143     // collect meta data
144
Properties JavaDoc metaData = new Properties JavaDoc();
145     metaData.putAll(content.getMetadata()); // copy through
146

147     ParseData parseData = new ParseData(title, outlinks, metaData);
148     return new ParseImpl(text, parseData);
149   }
150
151 }
152
Popular Tags