KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > indexer > more > MoreIndexingFilter


1 /* Copyright (c) 2004 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.indexer.more;
5
6 import org.apache.oro.text.regex.Perl5Compiler;
7 import org.apache.oro.text.regex.Perl5Matcher;
8 import org.apache.oro.text.regex.Perl5Pattern;
9 import org.apache.oro.text.regex.PatternMatcher;
10 import org.apache.oro.text.regex.MatchResult;
11 import org.apache.oro.text.regex.MalformedPatternException;
12
13 import javax.activation.MimetypesFileTypeMap JavaDoc;
14 import javax.activation.MimeType JavaDoc;
15 import javax.activation.MimeTypeParseException JavaDoc;
16
17 import org.apache.lucene.document.Document;
18 import org.apache.lucene.document.Field;
19
20 import net.nutch.net.protocols.HttpDateFormat;
21
22 import java.text.DateFormat JavaDoc;
23 import java.text.ParseException JavaDoc;
24 import java.text.SimpleDateFormat JavaDoc;
25
26 import net.nutch.parse.Parse;
27
28 import net.nutch.indexer.IndexingFilter;
29 import net.nutch.indexer.IndexingException;
30
31 import net.nutch.fetcher.FetcherOutput;
32
33 import net.nutch.util.NutchConf;
34
35 import net.nutch.util.LogFormatter;
36 import java.util.logging.Level JavaDoc;
37 import java.util.logging.Logger JavaDoc;
38
39 import java.util.Date JavaDoc;
40 import java.util.Enumeration JavaDoc;
41 import java.util.Properties JavaDoc;
42
43 import java.io.InputStream JavaDoc;
44 import java.io.IOException JavaDoc;
45
46 /************************************
47  * Add (or reset) a few metaData properties as respective fields
48  * (if they are available), * so that they can be displayed by more.jsp
49  * (called by search.jsp).
50  * In future, need to make some of them searchable!
51  *
52  * @author John Xing
53  ***********************************/

54
55 public class MoreIndexingFilter implements IndexingFilter {
56   public static final Logger JavaDoc LOG
57     = LogFormatter.getLogger(MoreIndexingFilter.class.getName());
58
59   // Filename extension to mime-type map.
60
// Used by addType().
61
static MimetypesFileTypeMap JavaDoc TYPE_MAP = null;
62   static {
63     try {
64       // read mime types from config file
65
InputStream JavaDoc is =
66         NutchConf.getConfResourceAsInputStream
67         (NutchConf.get("mime.types.file"));
68       if (is == null) {
69         LOG.warning
70           ("no mime.types.file: content-type won't be indexed.");
71         TYPE_MAP = null;
72       } else {
73         TYPE_MAP = new MimetypesFileTypeMap JavaDoc(is);
74       }
75
76       if (is != null)
77         is.close();
78     } catch (IOException JavaDoc e) {
79       LOG.log(Level.SEVERE, "Unexpected error", e);
80     }
81   }
82
83   public Document filter(Document doc, Parse parse, FetcherOutput fo)
84     throws IndexingException {
85
86     String JavaDoc url = fo.getUrl().toString();
87
88     // normalize metaData (see note in the method below).
89
Properties JavaDoc metaData = normalizeMeta(parse.getData().getMetadata());
90
91     addTime(doc, metaData, url);
92
93     addLength(doc, metaData, url);
94
95     if (TYPE_MAP != null)
96       addType(doc, metaData, url);
97
98     resetTitle(doc, metaData, url);
99
100     return doc;
101   }
102     
103   // Add time related meta info, now Last-Modified only
104
// Others for consideration: Date, Expires
105
private Document addTime(Document doc, Properties JavaDoc metaData, String JavaDoc url) {
106
107     String JavaDoc lastModified = metaData.getProperty("last-modified");
108     if (lastModified == null)
109       return doc;
110
111     // index/store it as long value
112
DateFormat df = new SimpleDateFormat JavaDoc("EEE MMM dd HH:mm:ss yyyy zzz");
113     try {
114       lastModified = new Long JavaDoc(HttpDateFormat.toLong(lastModified)).toString();
115     } catch (ParseException JavaDoc e) {
116       // try to parse it as date in alternative format
117
try {
118         Date JavaDoc d = df.parse(lastModified);
119         lastModified = new Long JavaDoc(d.getTime()).toString();
120       } catch (Exception JavaDoc e1) {
121         LOG.fine(url+": can't parse erroneous last-modified: "+lastModified);
122         lastModified = null;
123       }
124     }
125
126     if (lastModified != null)
127       doc.add(Field.UnIndexed("lastModified", lastModified));
128
129     return doc;
130   }
131
132   // Add Content-Length
133
private Document addLength(Document doc, Properties JavaDoc metaData, String JavaDoc url) {
134     String JavaDoc contentLength = metaData.getProperty("content-length");
135
136     if (contentLength != null)
137       doc.add(Field.UnIndexed("contentLength", contentLength));
138
139     return doc;
140   }
141
142   // Add Content-Type as primaryType and subType
143
private Document addType(Document doc, Properties JavaDoc metaData, String JavaDoc url) {
144     String JavaDoc contentType = metaData.getProperty("content-type");
145     if (contentType == null)
146       return doc;
147
148     MimeType JavaDoc mimeType;
149     try {
150       mimeType = new MimeType JavaDoc(contentType);
151     } catch (MimeTypeParseException JavaDoc e) {
152       LOG.warning(url+": can't parse erroneous content-type: "+contentType);
153       return doc;
154     }
155
156     String JavaDoc primaryType = mimeType.getPrimaryType();
157     String JavaDoc subType = mimeType.getSubType();
158     // leave this for future improvement
159
//MimeTypeParameterList parameterList = mimeType.getParameters()
160

161     // primaryType and subType are stored
162
doc.add(Field.UnIndexed("primaryType", primaryType));
163     doc.add(Field.UnIndexed("subType", subType));
164
165     return doc;
166   }
167
168   // Reset title if we see non-standard HTTP header "Content-Disposition".
169
// It's a good indication that content provider wants filename therein
170
// be used as the title of this url.
171

172   // Patterns used to extract filename from possible non-standard
173
// HTTP header "Content-Disposition". Typically it looks like:
174
// Content-Disposition: inline; filename="foo.ppt"
175
private PatternMatcher matcher = new Perl5Matcher();
176   static Perl5Pattern patterns[] = {null, null};
177   static {
178     Perl5Compiler compiler = new Perl5Compiler();
179     try {
180       // order here is important
181
patterns[0] =
182         (Perl5Pattern) compiler.compile("\\bfilename=['\"](.+)['\"]");
183       patterns[1] =
184         (Perl5Pattern) compiler.compile("\\bfilename=(\\S+)\\b");
185     } catch (MalformedPatternException e) {
186       // just ignore
187
}
188   }
189
190   private Document resetTitle(Document doc, Properties JavaDoc metaData, String JavaDoc url) {
191     String JavaDoc contentDisposition = metaData.getProperty("content-disposition");
192     if (contentDisposition == null)
193       return doc;
194
195     MatchResult result;
196     for (int i=0; i<patterns.length; i++) {
197       if (matcher.contains(contentDisposition,patterns[i])) {
198         result = matcher.getMatch();
199         doc.add(Field.UnIndexed("title", result.group(1)));
200         break;
201       }
202     }
203
204     return doc;
205   }
206
207   // Meta info in nutch metaData are saved in raw form, i.e.,
208
// whatever the fetcher sees. To facilitate further processing,
209
// a "normalization" is necessary.
210
// This includes fixing http server oddities, such as:
211
// (*) non-uniform casing of header names
212
// (*) empty header value
213
// Note: the original metaData should be kept intact,
214
// because there is a benefit to preserve whatever comes from server.
215
private Properties JavaDoc normalizeMeta(Properties JavaDoc old) {
216     Properties JavaDoc normalized = new Properties JavaDoc();
217
218     for (Enumeration JavaDoc e = old.propertyNames(); e.hasMoreElements();) {
219       String JavaDoc key = (String JavaDoc) e.nextElement();
220       String JavaDoc value = old.getProperty(key).trim();
221       // some http server sends out header with empty value! if so, skip it
222
if (value == null || value.equals(""))
223         continue;
224       // convert key (but, not value) to lower-case
225
normalized.setProperty(key.toLowerCase(),value);
226     }
227
228     return normalized;
229   }
230
231 }
232
Popular Tags