package net.nutch.analysis.lang;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.Properties;
import java.util.Vector;
import java.util.logging.Logger;

import net.nutch.fetcher.FetcherOutput;
import net.nutch.indexer.IndexingException;
import net.nutch.indexer.IndexingFilter;
import net.nutch.parse.Parse;
import net.nutch.parse.ParseException;
import net.nutch.parse.Parser;
import net.nutch.parse.ParserFactory;
import net.nutch.parse.ParserNotFound;
import net.nutch.protocol.Content;
import net.nutch.protocol.Protocol;
import net.nutch.protocol.ProtocolException;
import net.nutch.protocol.ProtocolFactory;
import net.nutch.protocol.ProtocolNotFound;
import net.nutch.util.LogFormatter;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

/**
 * Identifies the language of a piece of text by building an
 * {@link NGramProfile} for it and comparing that profile against the language
 * profiles found on the classpath. Also acts as an {@link IndexingFilter} that
 * adds a <code>lang</code> keyword field to indexed documents.
 *
 * <p>Language profiles are loaded once, by the shared instance returned from
 * {@link #getInstance()}. Instances created through the public no-argument
 * constructor hold no profiles of their own; {@link #filter} always delegates
 * to the shared instance.
 */
public class LanguageIdentifier implements IndexingFilter {

  public static final Logger LOG =
      LogFormatter.getLogger("net.nutch.analysis.lang.LanguageIdentifier");

  /** Scores must be strictly greater than this value for a language to be reported. */
  private static final float SCORE_THRESHOLD = 0.00F;

  /** Loaded language profiles ({@link NGramProfile} elements). */
  private Vector languages = new Vector();

  /** Names (String elements) of the languages for which a profile was loaded. */
  private Vector supportedLanguages = new Vector();

  // Must stay below LOG: the private constructor logs while this field is initialised.
  private static LanguageIdentifier identifier = new LanguageIdentifier(true);

  /**
   * Creates an identifier without loading any language profiles. Intended for
   * use as an indexing filter; see the class comment.
   */
  public LanguageIdentifier() {}

  /**
   * Creates the shared instance and loads every language profile that is
   * listed in <code>langmappings.properties</code> and has a matching n-gram
   * resource on the classpath. Failures are logged, never thrown.
   *
   * @param fake unused; only distinguishes this constructor from the public one
   */
  private LanguageIdentifier(boolean fake) {
    Properties mappings = new Properties();
    InputStream mappingStream = null;
    try {
      mappingStream = this.getClass().getResourceAsStream("langmappings.properties");
      if (mappingStream == null) {
        LOG.severe("Resource langmappings.properties not found; no language profiles loaded");
        return;
      }
      mappings.load(mappingStream);

      StringBuffer list = new StringBuffer("Language identifier plugin supports:");
      Enumeration allLanguages = mappings.keys();
      while (allLanguages.hasMoreElements()) {
        String lang = (String) allLanguages.nextElement();
        if (loadProfile(lang)) {
          list.append(" " + lang);
        }
      }
      LOG.info(list.toString());
    } catch (Exception e) {
      LOG.severe(e.toString());
    } finally {
      close(mappingStream);
    }
  }

  /**
   * Loads the n-gram profile for one language from the classpath and registers it.
   *
   * @param lang language name, used as the base name of the profile resource
   * @return <code>true</code> if the profile was found and loaded
   */
  private boolean loadProfile(String lang) {
    InputStream is = this.getClass().getClassLoader().getResourceAsStream(
        "net/nutch/analysis/lang/" + lang + "." + NGramProfile.NGRAM_FILE_EXTENSION);
    if (is == null) {
      return false;
    }
    try {
      NGramProfile profile = new NGramProfile(lang);
      profile.load(is);
      languages.add(profile);
      supportedLanguages.add(lang);
      return true;
    } catch (IOException e) {
      LOG.severe(e.toString());
      return false;
    } finally {
      close(is);
    }
  }

  /** Closes a stream, logging (rather than propagating) any failure; null is ignored. */
  private static void close(InputStream in) {
    if (in == null) {
      return;
    }
    try {
      in.close();
    } catch (IOException e) {
      LOG.warning(e.toString());
    }
  }

  /**
   * Returns the shared identifier instance that holds the loaded language profiles.
   *
   * @return the shared instance
   */
  public static LanguageIdentifier getInstance() {
    return identifier;
  }

  /**
   * Command-line entry point. Exactly one of the options listed in the usage
   * message selects what is identified; if several are given the last one wins.
   *
   * @param args command-line arguments
   */
  public static void main(String args[]) {

    String usage = "Usage: LanguageIdentifier [-identifyrows filename maxlines] [-identifyfile filename] [-identifyfileset files] [-identifytext text] [-identifyurl url]";
    int command = 0;

    final int IDFILE = 1;
    final int IDTEXT = 2;
    final int IDURL = 3;
    final int IDFILESET = 4;
    final int IDROWS = 5;

    Vector fileset = new Vector();
    String filename = "";
    String text = "";
    int max = 0;

    if (args.length == 0) {
      System.err.println(usage);
      System.exit(-1);
    }

    // else-if chain: an option's value must never be re-examined as an option.
    for (int i = 0; i < args.length; i++) {
      if (args[i].equals("-identifyfile")) {
        command = IDFILE;
        filename = nextArg(args, ++i, usage);
      } else if (args[i].equals("-identifyurl")) {
        command = IDURL;
        filename = nextArg(args, ++i, usage);
      } else if (args[i].equals("-identifyrows")) {
        command = IDROWS;
        filename = nextArg(args, ++i, usage);
        try {
          max = Integer.parseInt(nextArg(args, ++i, usage));
        } catch (NumberFormatException e) {
          System.err.println(usage);
          System.exit(-1);
        }
      } else if (args[i].equals("-identifytext")) {
        command = IDTEXT;
        // All remaining arguments, including the last one, form the text.
        StringBuffer words = new StringBuffer();
        for (i++; i < args.length; i++) {
          words.append(args[i]).append(" ");
        }
        text = words.toString();
      } else if (args[i].equals("-identifyfileset")) {
        command = IDFILESET;
        for (i++; i < args.length; i++) {
          fileset.add(args[i]);
          System.out.println(args[i]);
        }
      }
    }

    String lang = null;
    LanguageIdentifier idfr = LanguageIdentifier.getInstance();
    try {
      switch (command) {

        case IDTEXT:
          lang = idfr.identify(text);
          break;

        case IDFILE:
          lang = identifyFile(idfr, filename);
          break;

        case IDURL:
          // identify(String) yields null when the content could not be fetched.
          lang = idfr.identify(getUrlContent(filename));
          break;

        case IDROWS:
          identifyRows(idfr, filename, max);
          System.exit(0);
          break;

        case IDFILESET:
          identifyFileset(idfr, fileset);
          System.exit(0);
          break;

        default:
          // No recognised option was given.
          System.err.println(usage);
          System.exit(-1);
          break;
      }
    } catch (Exception e) {
      System.out.println(e);
    }
    System.out.println("text was identified as " + lang);
  }

  /** Returns args[index], or prints the usage and exits when the value is missing. */
  private static String nextArg(String[] args, int index, String usage) {
    if (index >= args.length) {
      System.err.println(usage);
      System.exit(-1);
    }
    return args[index];
  }

  /** Identifies the language of one file, closing it even when identification fails. */
  private static String identifyFile(LanguageIdentifier idfr, String filename)
      throws IOException {
    FileInputStream fis = new FileInputStream(new File(filename));
    try {
      return idfr.identify(fis);
    } finally {
      close(fis);
    }
  }

  /** Identifies and prints up to max lines (longer than two characters after trimming) of a file. */
  private static void identifyRows(LanguageIdentifier idfr, String filename, int max)
      throws IOException {
    BufferedReader br = new BufferedReader(
        new InputStreamReader(new FileInputStream(new File(filename))));
    try {
      String line;
      while (max > 0 && (line = br.readLine()) != null) {
        line = line.trim();
        if (line.length() > 2) {
          max--;
          System.out.println("R=" + idfr.identify(line) + ":" + line);
        }
      }
    } finally {
      br.close();
    }
  }

  /** Identifies and prints the language of every file name in the given set. */
  private static void identifyFileset(LanguageIdentifier idfr, Vector fileset) {
    System.out.println("FILESET");
    Iterator files = fileset.iterator();
    while (files.hasNext()) {
      String filename = (String) files.next();
      // Start from null so a failing file is not reported with the previous file's language.
      String lang = null;
      try {
        lang = identifyFile(idfr, filename);
      } catch (Exception e) {
        System.out.println(e);
      }
      System.out.println(filename + " was identified as " + lang);
    }
  }

  /**
   * Fetches a URL, parses the content and returns the parsed text, which is
   * also echoed to standard output.
   *
   * @param url the URL to fetch
   * @return the parsed text, or <code>null</code> if no protocol or parser was
   *         found or fetching/parsing failed (the failure is logged)
   */
  private static String getUrlContent(String url) {
    try {
      Protocol protocol = ProtocolFactory.getProtocol(url);
      Content content = protocol.getContent(url);
      String contentType = content.getContentType();
      Parser parser = ParserFactory.getParser(contentType, url);
      Parse parse = parser.getParse(content);
      String parsedText = parse.getText();
      System.out.println("text:" + parsedText);
      return parsedText;
    } catch (ProtocolNotFound e) {
      LOG.severe(e.toString());
    } catch (ProtocolException e) {
      LOG.severe(e.toString());
    } catch (ParserNotFound e) {
      LOG.severe(e.toString());
    } catch (ParseException e) {
      LOG.severe(e.toString());
    }
    return null;
  }

  /**
   * Identifies the language of the given text.
   *
   * @param text the text to analyse; may be <code>null</code>
   * @return the name of the best matching language profile, or
   *         <code>null</code> if the text is <code>null</code> or no language
   *         could be determined
   */
  public String identify(String text) {
    if (text == null) {
      return null;
    }
    return identify(new StringBuffer(text));
  }

  /**
   * Identifies the language of the given text. The profile whose
   * {@link NGramProfile#getSimilarity} value against the text's profile is
   * lowest is selected.
   *
   * @param text the text to analyse; may be <code>null</code>
   * @return the name of the selected language profile, or <code>null</code>
   *         if the text is <code>null</code>, no profiles are loaded, or the
   *         best score does not exceed the score threshold
   */
  public String identify(StringBuffer text) {
    if (text == null) {
      return null;
    }

    NGramProfile suspect = new NGramProfile("suspect");
    suspect.analyze(text);

    float topscore = Float.MAX_VALUE;
    String lang = null;

    Iterator profiles = languages.iterator();
    while (profiles.hasNext()) {
      NGramProfile profile = (NGramProfile) profiles.next();
      float score = profile.getSimilarity(suspect);
      // Lower is better: keep the profile with the smallest score seen so far.
      if (score < topscore) {
        topscore = score;
        lang = profile.getName();
      }
    }

    suspect.ngrams.clear();

    LOG.finest("TOPSCORE: " + lang + " with " + topscore);

    // lang stays null when no profiles are loaded; report that as "undetermined".
    if (lang != null && topscore > SCORE_THRESHOLD) {
      return lang;
    }
    return null;
  }

  /**
   * Identifies the language of the text read from a stream. The stream is
   * read to its end using the platform default charset and is not closed.
   *
   * @param is the stream to read
   * @return the name of the best matching language profile, or
   *         <code>null</code> if no language could be determined
   * @throws IOException if reading from the stream fails
   */
  public String identify(InputStream is) throws IOException {
    StringBuffer text = new StringBuffer();
    // Decode through a reader so multi-byte characters that straddle a buffer
    // boundary are not corrupted.
    InputStreamReader reader = new InputStreamReader(is);
    char[] buffer = new char[2000];
    int len;
    while ((len = reader.read(buffer)) != -1) {
      text.append(buffer, 0, len);
    }
    return identify(text);
  }

  /**
   * Adds a <code>lang</code> keyword field to the document. The language is
   * taken from the parse metadata ({@link HTMLLanguageParser#META_LANG_NAME},
   * then <code>Content-Language</code>); if neither is present it is identified
   * from the title and text, and if that fails it is set to
   * <code>"unknown"</code>.
   *
   * @param doc the document being indexed
   * @param parse the parse result for the page
   * @param fo the fetcher output for the page (not used here)
   * @return the same document, with the <code>lang</code> field added
   * @throws IndexingException declared by {@link IndexingFilter}
   */
  public Document filter(Document doc, Parse parse, FetcherOutput fo)
      throws IndexingException {

    String lang = parse.getData().get(HTMLLanguageParser.META_LANG_NAME);

    if (lang == null) {
      lang = parse.getData().get("Content-Language");
    }

    if (lang == null) {
      StringBuffer text = new StringBuffer();
      text.append(parse.getData().getTitle()).append(" ");
      text.append(parse.getText());
      lang = LanguageIdentifier.getInstance().identify(text);
    }

    if (lang == null) {
      lang = "unknown";
    }

    doc.add(Field.Keyword("lang", lang));

    return doc;
  }

}