KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > analysis > lang > NGramProfile


1 /* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.analysis.lang;
5
6 import java.io.BufferedInputStream JavaDoc;
7 import java.io.BufferedReader JavaDoc;
8 import java.io.File JavaDoc;
9 import java.io.FileInputStream JavaDoc;
10 import java.io.FileOutputStream JavaDoc;
11 import java.io.IOException JavaDoc;
12 import java.io.InputStream JavaDoc;
13 import java.io.InputStreamReader JavaDoc;
14 import java.io.OutputStream JavaDoc;
15 import java.util.Date JavaDoc;
16 import java.util.Collections JavaDoc;
17 import java.util.Hashtable JavaDoc;
18 import java.util.Iterator JavaDoc;
19 import java.util.Vector JavaDoc;
20 import java.util.logging.Logger JavaDoc;
21
22 import net.nutch.util.LogFormatter;
23
24 import org.apache.lucene.analysis.Token;
25
26 /**
27  * This class runs a ngram analysis over submitted text, results might be used
28  * for automatic language identification.
29  *
30  * The similarity calculation is at experimental level. You have been warned.
31  *
32  * Methods are provided to build new NGramProfiles profiles.
33  *
34  * @author Sami Siren
35  */

36 public class NGramProfile {
37
38   public static final Logger JavaDoc LOG = LogFormatter
39       .getLogger("net.nutch.analysis.lang.NGramProfile");
40
41   private String JavaDoc name;
42
43   private Vector JavaDoc sorted = null;
44
45   private StringBuffer JavaDoc tokensb = new StringBuffer JavaDoc();
46
47   private int min_ngram_length = DEFAULT_MIN_NGRAM_LENGTH;
48
49   private int max_ngram_length = DEFAULT_MAX_NGRAM_LENGTH;
50
51   private int ngramcount = 0;
52
53   static final String JavaDoc NGRAM_FILE_EXTENSION = "ngp";
54
55   static final int NGRAM_LENGTH = 1000;
56
57   //separator char
58
static final char SEPARATOR = '_';
59
60   //default min length of ngram
61
static final int DEFAULT_MIN_NGRAM_LENGTH = 1;
62
63   //default max length of ngram
64
static final int DEFAULT_MAX_NGRAM_LENGTH = 4;
65
66   //table to store ngrams
67
Hashtable JavaDoc ngrams = null;
68
69   /**
70    * private class used to store NGramEntry
71    */

72   class NGramEntry implements Comparable JavaDoc {
73     private CharSequence JavaDoc seq;
74
75     private int count;
76
77     private float normalized_count;
78
79     public NGramEntry(CharSequence JavaDoc seq) {
80       this.seq = seq;
81     }
82
83     /**
84      * @param ngramsequence
85      * @param ngramcount
86      */

87     public NGramEntry(String JavaDoc ngramsequence, int ngramcount) {
88       seq = new StringBuffer JavaDoc(ngramsequence).subSequence(0, ngramsequence
89           .length());
90       this.count = ngramcount;
91     }
92
93     public int getCount() {
94       return count;
95     }
96
97     public CharSequence JavaDoc getSeq() {
98       return seq;
99     }
100
101     public int compareTo(Object JavaDoc o) {
102       if (((NGramEntry) o).count - count != 0)
103         return ((NGramEntry) o).count - count;
104       else
105         return (seq.toString().compareTo(((NGramEntry) o).seq.toString()));
106     }
107
108     public void inc() {
109       count++;
110     }
111   }
112
113   /**
114    * Construct a new ngram profile
115    *
116    * @param name
117    * Name of profile
118    */

119   public NGramProfile(String JavaDoc name) {
120     this(name, DEFAULT_MIN_NGRAM_LENGTH, DEFAULT_MAX_NGRAM_LENGTH);
121   }
122
123   /**
124    * Construct a new ngram profile
125    *
126    * @param name
127    * Name of profile
128    * @param minlen
129    * min length of ngram sequences
130    * @param maxlen
131    * max length of ngram sequences
132    */

133   public NGramProfile(String JavaDoc name, int minlen, int maxlen) {
134     ngrams = new Hashtable JavaDoc();
135     this.max_ngram_length = maxlen;
136     this.min_ngram_length = minlen;
137     this.name = name;
138   }
139
140   /**
141    * Add ngrams from a token to this profile
142    *
143    * @param t
144    * Token to be added
145    */

146   public void addFromToken(Token t) {
147     tokensb.setLength(0);
148     tokensb.append(SEPARATOR).append(t.termText()).append(SEPARATOR);
149     addNGrams(tokensb);
150   }
151
152   /**
153    * Analyze a piece of text
154    *
155    * @param text
156    * the text to be analyzed
157    */

158   public void analyze(StringBuffer JavaDoc text) {
159     StringBuffer JavaDoc word;
160     int i;
161
162     if (ngrams != null) {
163       ngrams.clear();
164     }
165
166     word = new StringBuffer JavaDoc().append(SEPARATOR);
167     for (i = 0; i < text.length(); i++) {
168       char c = Character.toLowerCase(text.charAt(i));
169
170       if (Character.isLetter(c)) {
171         word.append(c);
172       } else {
173         //found word boundary
174
if (word.length() > 1) {
175           //we have a word!
176
word.append(SEPARATOR);
177           addNGrams(word);
178           word.delete(0, word.length());
179         }
180       }
181     }
182
183     if (word.length() > 1) {
184       //we have a last word
185
word.append(SEPARATOR);
186       addNGrams(word);
187     }
188     normalize();
189   }
190
191   /**
192    * Normalize profile
193    */

194   protected void normalize() {
195     Vector JavaDoc sorted = getSorted();
196     int sum = 0;
197
198     //only calculate ngramcount if it was not available in profile
199
if (ngramcount == 0) {
200       for (int i = 0; i < sorted.size(); i++) {
201         ngramcount += ((NGramEntry) sorted.get(i)).count;
202       }
203     }
204
205     if (sorted.size() > 0) {
206       Iterator JavaDoc i = sorted.iterator();
207
208       while (i.hasNext()) {
209         NGramEntry e = (NGramEntry) i.next();
210         e.normalized_count = e.count / (float)ngramcount;
211       }
212     }
213   }
214
215   /**
216    * Add ngrams from a single word to this profile
217    *
218    * @param word
219    */

220   public void addNGrams(StringBuffer JavaDoc word) {
221     int i;
222
223     for (i = min_ngram_length; i <= max_ngram_length && i < word.length(); i++) {
224       addNGrams(word, i);
225     }
226   }
227
228   /**
229    * @param word
230    * @param n
231    * sequence length
232    */

233   private void addNGrams(StringBuffer JavaDoc word, int n) {
234     NGramEntry nge;
235     StringBuffer JavaDoc sb;
236     int i;
237
238     for (i = 0; i <= word.length() - n; i++) {
239
240       CharSequence JavaDoc cs = word.subSequence(i, i + n);
241
242       if (ngrams.containsKey(cs)) {
243         nge = (NGramEntry) ngrams.get(cs);
244       } else {
245         nge = new NGramEntry(cs);
246       }
247       nge.inc();
248       ngrams.put(cs, nge);
249     }
250   }
251
252   /**
253    * Return sorted vector of ngrams (sort done by 1. count 2. sequence)
254    *
255    * @return sorted vector of ngrams
256    */

257   public Vector JavaDoc getSorted() {
258     //make sure srting is done only once
259
if (sorted == null) {
260       sorted = new Vector JavaDoc(ngrams.values());
261       Collections.sort(sorted);
262
263       //trim at NGRAM_LENGTH entries
264
if (sorted.size() > NGRAM_LENGTH)
265         sorted.setSize(NGRAM_LENGTH);
266     }
267
268     return sorted;
269   }
270
271   /**
272    * Return ngramprofile as text
273    *
274    * @return ngramprofile as text
275    */

276   public String JavaDoc toString() {
277     StringBuffer JavaDoc s = new StringBuffer JavaDoc();
278
279     Iterator JavaDoc i = getSorted().iterator();
280
281     s.append("NGramProfile: ").append(name).append("\n");
282     while (i.hasNext()) {
283       NGramEntry entry = (NGramEntry) i.next();
284       s.append(entry.count).append(':').append(entry.seq).append(" ").append(
285           entry.normalized_count).append("\n");
286     }
287     return s.toString();
288   }
289
290   /**
291    * Calculate a score how well NGramProfiles match each other
292    *
293    * @param another
294    * ngram profile to compare against
295    * @return similarity 0=exact match
296    */

297   public float getSimilarity(NGramProfile another) {
298     float sum = 0;
299
300     try {
301       Iterator JavaDoc i = another.getSorted().iterator();
302       while (i.hasNext()) {
303         NGramEntry other = (NGramEntry) i.next();
304         if (ngrams.containsKey(other.seq)) {
305           sum += Math.abs((other.normalized_count - ((NGramEntry) ngrams
306               .get(other.seq)).normalized_count)) / 2;
307         } else {
308           sum += other.normalized_count;
309         }
310       }
311       i = getSorted().iterator();
312       while (i.hasNext()) {
313         NGramEntry other = (NGramEntry) i.next();
314         if (another.ngrams.containsKey(other.seq)) {
315           sum += Math
316               .abs((other.normalized_count - ((NGramEntry) another.ngrams
317                   .get(other.seq)).normalized_count)) / 2;
318         } else {
319           sum += other.normalized_count;
320         }
321       }
322     } catch (Exception JavaDoc e) {
323       LOG.severe(e.toString());
324     }
325     return sum;
326   }
327
328   /**
329    * Loads a ngram profile from InputStream (assumes UTF-8 encoded content)
330    */

331   public void load(InputStream JavaDoc is) throws IOException JavaDoc {
332     BufferedReader JavaDoc bis = new BufferedReader JavaDoc(new InputStreamReader JavaDoc(is, "UTF-8"));
333     String JavaDoc line;
334
335     ngrams.clear();
336
337     while ((line = bis.readLine()) != null) {
338
339       // # starts a comment line
340
if (line.charAt(0) != '#') {
341         int spacepos = line.indexOf(' ');
342         String JavaDoc ngramsequence = line.substring(0, spacepos).trim();
343         int ngramcount = Integer.parseInt(line.substring(spacepos + 1));
344
345         if (!line.startsWith("ngram_count")) {
346           NGramEntry en = new NGramEntry(ngramsequence, ngramcount);
347           ngrams.put(en.getSeq(), en);
348         } else {
349           this.ngramcount = ngramcount;
350         }
351       }
352     }
353     normalize();
354   }
355
356   /**
357    * Create a new Language profile from (preferably quite large) text file
358    *
359    * @param name
360    * name of profile
361    * @param is
362    * @param encoding
363    * encoding of stream
364    */

365   public static NGramProfile createNgramProfile(String JavaDoc name, InputStream JavaDoc is,
366       String JavaDoc encoding) {
367
368     NGramProfile newProfile = new NGramProfile(name);
369     BufferedInputStream JavaDoc bis = new BufferedInputStream JavaDoc(is);
370
371     byte buffer[] = new byte[4096];
372     StringBuffer JavaDoc text = new StringBuffer JavaDoc();
373     int len;
374
375     try {
376       while ((len = bis.read(buffer)) != -1) {
377         text.append(new String JavaDoc(buffer, 0, len, encoding));
378       }
379     } catch (IOException JavaDoc e) {
380       e.printStackTrace();
381     }
382
383     newProfile.analyze(text);
384
385     return newProfile;
386   }
387
388   /**
389    * Writes NGramProfile content into OutputStream, content is outputted with
390    * UTF-8 encoding
391    *
392    * @param os
393    * Stream to output to
394    * @throws IOException
395    */

396
397   public void save(OutputStream JavaDoc os) throws IOException JavaDoc {
398     Vector JavaDoc v = getSorted();
399     Iterator JavaDoc i = v.iterator();
400     os
401         .write(("# NgramProfile generated at " + new Date JavaDoc() + " for Nutch Language Identification\n")
402             .getBytes());
403     os.write(("ngram_count " + ngramcount + "\n").getBytes());
404
405     while (i.hasNext()) {
406       NGramEntry e = (NGramEntry) i.next();
407       String JavaDoc line = e.getSeq().toString() + " " + e.getCount() + "\n";
408       os.write(line.getBytes("UTF-8"));
409     }
410
411     os.flush();
412   }
413
414   /**
415    * main method used for testing only
416    *
417    * @param args
418    */

419   public static void main(String JavaDoc args[]) {
420
421     String JavaDoc usage = "Usage: NGramProfile [-create profilename filename encoding] [-similarity file1 file2] [-score profile-name filename encoding]";
422     int command = 0;
423
424     final int CREATE = 1;
425     final int SIMILARITY = 2;
426     final int SCORE = 3;
427
428     String JavaDoc profilename = "";
429     String JavaDoc filename = "";
430     String JavaDoc filename2 = "";
431     String JavaDoc encoding = "";
432
433     if (args.length == 0) {
434       System.err.println(usage);
435       System.exit(-1);
436     }
437
438     for (int i = 0; i < args.length; i++) { // parse command line
439
if (args[i].equals("-create")) { // found -create option
440
command = CREATE;
441         profilename = args[++i];
442         filename = args[++i];
443         encoding = args[++i];
444       }
445
446       if (args[i].equals("-similarity")) { // found -similarity option
447
command = SIMILARITY;
448         filename = args[++i];
449         filename2 = args[++i];
450         encoding = args[++i];
451       }
452
453       if (args[i].equals("-score")) { // found -Score option
454
command = SCORE;
455         profilename = args[++i];
456         filename = args[++i];
457         encoding = args[++i];
458       }
459     }
460
461     try {
462
463       switch (command) {
464
465       case CREATE:
466
467         File JavaDoc f = new File JavaDoc(filename);
468         FileInputStream JavaDoc fis = new FileInputStream JavaDoc(f);
469         NGramProfile newProfile = NGramProfile.createNgramProfile(profilename,
470             fis, encoding);
471         fis.close();
472         f = new File JavaDoc(profilename + "." + NGRAM_FILE_EXTENSION);
473         FileOutputStream JavaDoc fos = new FileOutputStream JavaDoc(f);
474         newProfile.save(fos);
475         System.out.println("new profile " + profilename + "."
476             + NGRAM_FILE_EXTENSION + " was created.");
477         break;
478
479       case SIMILARITY:
480
481         f = new File JavaDoc(filename);
482         fis = new FileInputStream JavaDoc(f);
483         newProfile = NGramProfile.createNgramProfile(filename, fis, encoding);
484         newProfile.normalize();
485
486         f = new File JavaDoc(filename2);
487         fis = new FileInputStream JavaDoc(f);
488         NGramProfile newProfile2 = NGramProfile.createNgramProfile(filename2,
489             fis, encoding);
490         newProfile2.normalize();
491         System.out.println("Similarity is "
492             + newProfile.getSimilarity(newProfile2));
493         break;
494
495       case SCORE:
496         f = new File JavaDoc(filename);
497         fis = new FileInputStream JavaDoc(f);
498         newProfile = NGramProfile.createNgramProfile(filename, fis, encoding);
499
500         f = new File JavaDoc(profilename + "." + NGRAM_FILE_EXTENSION);
501         fis = new FileInputStream JavaDoc(f);
502         NGramProfile compare = new NGramProfile(profilename);
503         compare.load(fis);
504         System.out.println("Score is " + compare.getSimilarity(newProfile));
505
506         break;
507
508       }
509
510     } catch (Exception JavaDoc e) {
511       LOG.severe("Caught an exception:" + e);
512     }
513   }
514
515   /**
516    * @return Returns the name.
517    */

518   public String JavaDoc getName() {
519     return name;
520   }
521
522   /**
523    * @param name
524    * The name to set.
525    */

526   public void setName(String JavaDoc name) {
527     this.name = name;
528   }
529 }
Popular Tags