package net.nutch.indexer.more;

import org.apache.oro.text.regex.Perl5Compiler;
import org.apache.oro.text.regex.Perl5Matcher;
import org.apache.oro.text.regex.Perl5Pattern;
import org.apache.oro.text.regex.PatternMatcher;
import org.apache.oro.text.regex.MatchResult;
import org.apache.oro.text.regex.MalformedPatternException;

import javax.activation.MimetypesFileTypeMap;
import javax.activation.MimeType;
import javax.activation.MimeTypeParseException;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import net.nutch.net.protocols.HttpDateFormat;

import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;

import net.nutch.parse.Parse;

import net.nutch.indexer.IndexingFilter;
import net.nutch.indexer.IndexingException;

import net.nutch.fetcher.FetcherOutput;

import net.nutch.util.NutchConf;

import net.nutch.util.LogFormatter;
import java.util.logging.Level;
import java.util.logging.Logger;

import java.util.Date;
import java.util.Enumeration;
import java.util.Locale;
import java.util.Properties;

import java.io.InputStream;
import java.io.IOException;

/**
 * Indexing filter that copies selected parse metadata into the Lucene
 * document as stored, un-indexed fields:
 * <ul>
 *   <li><code>lastModified</code> - from the <code>last-modified</code>
 *       metadata entry, as milliseconds rendered in decimal;</li>
 *   <li><code>contentLength</code> - from <code>content-length</code>;</li>
 *   <li><code>primaryType</code> / <code>subType</code> - from
 *       <code>content-type</code> (only when a mime types file was loaded);</li>
 *   <li><code>title</code> - the file name found in
 *       <code>content-disposition</code>, if any.</li>
 * </ul>
 * Metadata keys are matched case-insensitively (see
 * {@link #normalizeMeta(Properties)}).
 */
public class MoreIndexingFilter implements IndexingFilter {
  public static final Logger LOG
    = LogFormatter.getLogger(MoreIndexingFilter.class.getName());

  /**
   * Mime type map loaded from the resource named by the
   * <code>mime.types.file</code> configuration property, or
   * <code>null</code> when that resource is unavailable. A <code>null</code>
   * value disables content-type indexing in {@link #filter}.
   */
  static MimetypesFileTypeMap TYPE_MAP = null;

  static {
    InputStream is = null;
    try {
      try {
        is = NutchConf.getConfResourceAsInputStream
          (NutchConf.get("mime.types.file"));
        if (is == null) {
          LOG.warning
            ("no mime.types.file: content-type won't be indexed.");
          TYPE_MAP = null;
        } else {
          TYPE_MAP = new MimetypesFileTypeMap(is);
        }
      } finally {
        // Close the stream even if building the map fails.
        if (is != null) {
          is.close();
        }
      }
    } catch (IOException e) {
      LOG.log(Level.SEVERE, "Unexpected error", e);
    }
  }

  /**
   * Adds the extra fields described in the class comment to <code>doc</code>.
   *
   * @param doc   the document being built; modified in place
   * @param parse the parse whose metadata supplies the field values
   * @param fo    the fetcher output; only its URL is used, for log messages
   * @return the same <code>doc</code> instance that was passed in
   * @throws IndexingException declared by the {@link IndexingFilter}
   *         contract; not thrown explicitly by this implementation
   */
  public Document filter(Document doc, Parse parse, FetcherOutput fo)
    throws IndexingException {

    String url = fo.getUrl().toString();

    // Work on a copy with lower-cased keys and trimmed, non-empty values.
    Properties metaData = normalizeMeta(parse.getData().getMetadata());

    addTime(doc, metaData, url);

    addLength(doc, metaData, url);

    if (TYPE_MAP != null) {
      addType(doc, metaData, url);
    }

    resetTitle(doc, metaData, url);

    return doc;
  }

  /**
   * Adds a <code>lastModified</code> field holding the
   * <code>last-modified</code> metadata value converted to milliseconds.
   * The value is first parsed with {@link HttpDateFormat}; if that fails, the
   * pattern <code>EEE MMM dd HH:mm:ss yyyy zzz</code> is tried. Nothing is
   * added when the entry is absent or cannot be parsed by either.
   *
   * @param doc      the document to add the field to
   * @param metaData normalized metadata (lower-case keys)
   * @param url      the page URL, used only in the log message
   * @return <code>doc</code>
   */
  private Document addTime(Document doc, Properties metaData, String url) {
    String lastModified = metaData.getProperty("last-modified");
    if (lastModified == null) {
      return doc;
    }

    try {
      lastModified = Long.toString(HttpDateFormat.toLong(lastModified));
    } catch (ParseException e) {
      try {
        // Day and month names in these headers are English, so the parser is
        // pinned to an English locale rather than the JVM default.
        DateFormat df =
          new SimpleDateFormat("EEE MMM dd HH:mm:ss yyyy zzz", Locale.US);
        Date d = df.parse(lastModified);
        lastModified = Long.toString(d.getTime());
      } catch (Exception e1) {
        // Best effort: an unparseable date is logged and skipped.
        LOG.fine(url+": can't parse erroneous last-modified: "+lastModified);
        lastModified = null;
      }
    }

    if (lastModified != null) {
      doc.add(Field.UnIndexed("lastModified", lastModified));
    }

    return doc;
  }

  /**
   * Adds a <code>contentLength</code> field holding the
   * <code>content-length</code> metadata value verbatim, when present.
   *
   * @param doc      the document to add the field to
   * @param metaData normalized metadata (lower-case keys)
   * @param url      the page URL (currently unused)
   * @return <code>doc</code>
   */
  private Document addLength(Document doc, Properties metaData, String url) {
    String contentLength = metaData.getProperty("content-length");

    if (contentLength != null) {
      doc.add(Field.UnIndexed("contentLength", contentLength));
    }

    return doc;
  }

  /**
   * Adds <code>primaryType</code> and <code>subType</code> fields taken from
   * the <code>content-type</code> metadata value. Nothing is added when the
   * entry is absent or is not a parseable mime type.
   *
   * @param doc      the document to add the fields to
   * @param metaData normalized metadata (lower-case keys)
   * @param url      the page URL, used only in the log message
   * @return <code>doc</code>
   */
  private Document addType(Document doc, Properties metaData, String url) {
    String contentType = metaData.getProperty("content-type");
    if (contentType == null) {
      return doc;
    }

    MimeType mimeType;
    try {
      mimeType = new MimeType(contentType);
    } catch (MimeTypeParseException e) {
      LOG.warning(url+": can't parse erroneous content-type: "+contentType);
      return doc;
    }

    String primaryType = mimeType.getPrimaryType();
    String subType = mimeType.getSubType();

    doc.add(Field.UnIndexed("primaryType", primaryType));
    doc.add(Field.UnIndexed("subType", subType));

    return doc;
  }

  /** Matcher used by {@link #resetTitle}; one per filter instance. */
  private PatternMatcher matcher = new Perl5Matcher();

  /**
   * Patterns tried, in order, against the <code>content-disposition</code>
   * value: first a quoted <code>filename=</code>, then an unquoted one.
   * Group 1 is the file name. An entry stays <code>null</code> if its
   * pattern failed to compile.
   */
  // NOTE(review): the quoted pattern's (.+) is greedy, so a header carrying
  // further quoted parameters after filename would be captured up to the
  // last quote -- confirm whether that is acceptable.
  static Perl5Pattern patterns[] = {null, null};

  static {
    Perl5Compiler compiler = new Perl5Compiler();
    try {
      patterns[0] =
        (Perl5Pattern) compiler.compile("\\bfilename=['\"](.+)['\"]");
      patterns[1] =
        (Perl5Pattern) compiler.compile("\\bfilename=(\\S+)\\b");
    } catch (MalformedPatternException e) {
      // Do not swallow silently: resetTitle skips any pattern left null.
      LOG.log(Level.SEVERE,
              "Can't compile content-disposition filename patterns", e);
    }
  }

  /**
   * Adds a <code>title</code> field holding the file name found in the
   * <code>content-disposition</code> metadata value. The first pattern in
   * {@link #patterns} that matches wins; nothing is added when the entry is
   * absent or no pattern matches.
   *
   * @param doc      the document to add the field to
   * @param metaData normalized metadata (lower-case keys)
   * @param url      the page URL (currently unused)
   * @return <code>doc</code>
   */
  private Document resetTitle(Document doc, Properties metaData, String url) {
    String contentDisposition = metaData.getProperty("content-disposition");
    if (contentDisposition == null) {
      return doc;
    }

    for (int i = 0; i < patterns.length; i++) {
      if (patterns[i] == null) {
        // Pattern failed to compile at class-load time; already logged.
        continue;
      }
      if (matcher.contains(contentDisposition, patterns[i])) {
        MatchResult result = matcher.getMatch();
        doc.add(Field.UnIndexed("title", result.group(1)));
        break;
      }
    }

    return doc;
  }

  /**
   * Returns a copy of <code>old</code> in which every key is lower-cased and
   * every value is trimmed. Entries whose value is missing, not a string, or
   * empty after trimming are dropped.
   *
   * @param old the metadata to normalize; not modified
   * @return a new <code>Properties</code> with normalized entries
   */
  private Properties normalizeMeta(Properties old) {
    Properties normalized = new Properties();

    for (Enumeration e = old.propertyNames(); e.hasMoreElements();) {
      String key = (String) e.nextElement();
      String value = old.getProperty(key);
      // getProperty returns null for non-String values; check before trim.
      if (value == null) {
        continue;
      }
      value = value.trim();
      if (value.equals("")) {
        continue;
      }
      // Locale-independent lower-casing so the fixed ASCII keys looked up
      // by the add* methods match under any default locale.
      normalized.setProperty(key.toLowerCase(Locale.US), value);
    }

    return normalized;
  }

}