KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > analysis > lang > LanguageIdentifier


/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.analysis.lang;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.Properties;
import java.util.Vector;
import java.util.logging.Logger;

import net.nutch.fetcher.FetcherOutput;
import net.nutch.indexer.IndexingException;
import net.nutch.indexer.IndexingFilter;
import net.nutch.parse.Parse;
import net.nutch.parse.ParseException;
import net.nutch.parse.Parser;
import net.nutch.parse.ParserFactory;
import net.nutch.parse.ParserNotFound;
import net.nutch.protocol.Content;
import net.nutch.protocol.Protocol;
import net.nutch.protocol.ProtocolException;
import net.nutch.protocol.ProtocolFactory;
import net.nutch.protocol.ProtocolNotFound;
import net.nutch.util.LogFormatter;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

35 /**
36  *
37  * @author Sami Siren
38  *
39  */

40 public class LanguageIdentifier implements IndexingFilter {
41   public static final Logger JavaDoc LOG = LogFormatter.getLogger("net.nutch.analysis.lang.LanguageIdentifier");
42
43   private Vector JavaDoc languages = new Vector JavaDoc();
44
45   private Vector JavaDoc supportedLanguages = new Vector JavaDoc();
46
47   private static LanguageIdentifier identifier = new LanguageIdentifier(true);
48
49   private static float SCORE_THRESOLD = 0.00F;
50
51   //public constructor needed for extension mechanism
52
public LanguageIdentifier() {}
53
54   private LanguageIdentifier(boolean fake) {
55     Properties JavaDoc p = new Properties JavaDoc();
56     try {
57       p.load(this.getClass().getResourceAsStream("langmappings.properties"));
58
59       Enumeration JavaDoc alllanguages = p.keys();
60
61       StringBuffer JavaDoc list = new StringBuffer JavaDoc("Language identifier plugin supports:");
62       while (alllanguages.hasMoreElements()) {
63         String JavaDoc lang = (String JavaDoc) (alllanguages.nextElement());
64
65         InputStream JavaDoc is = this.getClass().getClassLoader().getResourceAsStream(
66                 "net/nutch/analysis/lang/" + lang + "." + NGramProfile.NGRAM_FILE_EXTENSION);
67
68         if (is != null) {
69           NGramProfile profile = new NGramProfile(lang);
70           try {
71             profile.load(is);
72             languages.add(profile);
73             supportedLanguages.add(lang);
74             list.append(" " + lang);
75             is.close();
76           } catch (IOException JavaDoc e1) {
77             LOG.severe(e1.toString());
78           }
79         }
80       }
81       LOG.info(list.toString());
82     } catch (Exception JavaDoc e) {
83       LOG.severe(e.toString());
84     }
85   }
86
87   /**
88    * return handle to singleton instance
89    */

90   public static LanguageIdentifier getInstance() {
91     return identifier;
92   }
93
94   /**
95    * main method used for testing
96    *
97    * @param args
98    */

99   public static void main(String JavaDoc args[]) {
100
101     String JavaDoc usage = "Usage: LanguageIdentifier [-identifyrows filename maxlines] [-identifyfile filename] [-identifyfileset files] [-identifytext text] [-identifyurl url]";
102     int command = 0;
103
104     final int IDFILE = 1;
105     final int IDTEXT = 2;
106     final int IDURL = 3;
107     final int IDFILESET = 4;
108     final int IDROWS = 5;
109
110     Vector JavaDoc fileset = new Vector JavaDoc();
111     String JavaDoc filename = "";
112     String JavaDoc url = "";
113     String JavaDoc text = "";
114     int max = 0;
115
116     if (args.length == 0) {
117       System.err.println(usage);
118       System.exit(-1);
119     }
120
121     for (int i = 0; i < args.length; i++) { // parse command line
122
if (args[i].equals("-identifyfile")) {
123         command = IDFILE;
124         filename = args[++i];
125       }
126
127       if (args[i].equals("-identifyurl")) {
128         command = IDURL;
129         filename = args[++i];
130       }
131
132       if (args[i].equals("-identifyrows")) {
133         command = IDROWS;
134         filename = args[++i];
135         max = Integer.parseInt(args[++i]);
136       }
137
138       if (args[i].equals("-identifytext")) {
139         command = IDTEXT;
140         for (i++; i < args.length - 1; i++)
141           text += args[i] + " ";
142       }
143
144       if (args[i].equals("-identifyfileset")) {
145         command = IDFILESET;
146         for (i++; i < args.length; i++) {
147           fileset.add(args[i]);
148           System.out.println(args[i]);
149         }
150       }
151
152     }
153
154     String JavaDoc lang = null;
155     LanguageIdentifier idfr = LanguageIdentifier.getInstance();
156     File JavaDoc f;
157     FileInputStream JavaDoc fis;
158     try {
159       switch (command) {
160
161         case IDTEXT:
162           lang = idfr.identify(text);
163           break;
164
165         case IDFILE:
166           f = new File JavaDoc(filename);
167           fis = new FileInputStream JavaDoc(f);
168           lang = idfr.identify(fis);
169           fis.close();
170           break;
171
172         case IDURL:
173           text = getUrlContent(filename);
174           lang = idfr.identify(text);
175           break;
176
177         case IDROWS:
178           f = new File JavaDoc(filename);
179           BufferedReader JavaDoc br = new BufferedReader JavaDoc(new InputStreamReader JavaDoc(new FileInputStream JavaDoc(f)));
180           String JavaDoc line;
181           while (max > 0 && (line = br.readLine()) != null) {
182             line = line.trim();
183             if (line.length() > 2) {
184               max--;
185               lang = idfr.identify(line);
186               System.out.println("R=" + lang + ":" + line);
187             }
188           }
189
190           br.close();
191           System.exit(0);
192           break;
193
194         case IDFILESET:
195           System.out.println("FILESET");
196           Iterator JavaDoc i = fileset.iterator();
197
198           while (i.hasNext()) {
199             try {
200               filename = (String JavaDoc) i.next();
201               f = new File JavaDoc(filename);
202               fis = new FileInputStream JavaDoc(f);
203               lang = idfr.identify(fis);
204               fis.close();
205             } catch (Exception JavaDoc e) {
206               System.out.println(e);
207             }
208
209             System.out.println(filename + " was identified as " + lang);
210           }
211           System.exit(0);
212           break;
213
214       }
215     } catch (Exception JavaDoc e) {
216       System.out.println(e);
217     }
218     System.out.println("text was identified as " + lang);
219   }
220
221   /**
222    * @param url
223    * @return contents of url
224    */

225   private static String JavaDoc getUrlContent(String JavaDoc url) {
226     Protocol protocol;
227     try {
228       protocol = ProtocolFactory.getProtocol(url);
229       Content content = protocol.getContent(url);
230       String JavaDoc contentType = content.getContentType();
231       Parser parser = ParserFactory.getParser(contentType, url);
232       Parse parse = parser.getParse(content);
233       System.out.println("text:" + parse.getText());
234       return parse.getText();
235
236     } catch (ProtocolNotFound e) {
237       e.printStackTrace();
238     } catch (ProtocolException e) {
239       e.printStackTrace();
240     } catch (ParserNotFound e) {
241       e.printStackTrace();
242     } catch (ParseException e) {
243       e.printStackTrace();
244     }
245     return null;
246   }
247
248   /**
249    * Identify language based on submitted content
250    *
251    * @param text text of doc
252    * @return 2 letter ISO639 code of language (en, fi, sv...) , or null if
253    * unknown
254    */

255   public String JavaDoc identify(String JavaDoc text) {
256
257     return identify(new StringBuffer JavaDoc(text));
258   }
259
260   public String JavaDoc identify(StringBuffer JavaDoc text) {
261
262     NGramProfile p = new NGramProfile("suspect");
263     p.analyze(text);
264
265     float topscore = Float.MAX_VALUE;
266     String JavaDoc lang = "";
267
268     Iterator JavaDoc i = languages.iterator();
269     while (i.hasNext()) {
270
271       NGramProfile profile = (NGramProfile) i.next();
272       float score = profile.getSimilarity(p);
273
274       //LOG.fine(profile.getName() + ":" + score);
275

276       if (score < topscore) {
277         topscore = score;
278         lang = profile.getName();
279       }
280     }
281
282     p.ngrams.clear();
283     p = null;
284
285     LOG.finest("TOPSCORE: " + lang + " with " + topscore);
286
287     if (topscore > SCORE_THRESOLD)
288       return lang;
289
290     else return null;
291   }
292
293   /**
294    * Identify language from inputstream
295    *
296    * @param is
297    * @return language code
298    * @throws IOException
299    */

300   public String JavaDoc identify(InputStream JavaDoc is) throws IOException JavaDoc {
301
302     StringBuffer JavaDoc text = new StringBuffer JavaDoc();
303     byte buffer[] = new byte[2000];
304     int len = 0;
305
306     while ((len = is.read(buffer)) != -1) {
307       text.append(new String JavaDoc(buffer, 0, len));
308     }
309
310     return identify(text.toString());
311   }
312
313   public Document filter(Document doc, Parse parse, FetcherOutput fo) throws IndexingException {
314
315     //check if X-meta-lang found, possibly put there by HTMLLanguageParser
316
String JavaDoc lang = parse.getData().get(HTMLLanguageParser.META_LANG_NAME);
317
318     //check if HTTP-header tels us the language
319
if (lang == null) lang = parse.getData().get("Content-Language");
320
321     if (lang == null) {
322       StringBuffer JavaDoc text = new StringBuffer JavaDoc();
323       /*
324        * String[] anchors = fo.getAnchors(); for (int i = 0; i < anchors.length;
325        * i++) { text+=anchors[i] + " "; }
326        */

327       text.append(parse.getData().getTitle()).append(" ");
328       text.append(parse.getText());
329       lang = LanguageIdentifier.getInstance().identify(text);
330     }
331
332     if (lang == null) {
333       lang = "unknown";
334     }
335
336     doc.add(Field.Keyword("lang", lang));
337
338     return doc;
339   }
340
341 }
Popular Tags