NGramProfile


1   /* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
2   /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3   
4   package net.nutch.analysis.lang;
5   
6   import java.io.BufferedInputStream  ;
7   import java.io.BufferedReader  ;
8   import java.io.File  ;
9   import java.io.FileInputStream  ;
10  import java.io.FileOutputStream  ;
11  import java.io.IOException  ;
12  import java.io.InputStream  ;
13  import java.io.InputStreamReader  ;
14  import java.io.OutputStream  ;
15  import java.util.Date  ;
16  import java.util.Collections  ;
17  import java.util.Hashtable  ;
18  import java.util.Iterator  ;
19  import java.util.Vector  ;
20  import java.util.logging.Logger  ;
21  
22  import net.nutch.util.LogFormatter;
23  
24  import org.apache.lucene.analysis.Token;
25  
26  /**
27   * This class runs a ngram analysis over submitted text, results might be used
28   * for automatic language identifiaction.
29   * 
30   * The similarity calculation is at experimental level. You have been warned.
31   * 
32   * Methods are provided to build new NGramProfiles profiles.
33   * 
34   * @author Sami Siren
35   */
36  public class NGramProfile {
37  
38    public static final Logger   LOG = LogFormatter
39        .getLogger("net.nutch.analysis.lang.NGramProfile");
40  
41    private String   name;
42  
43    private Vector   sorted = null;
44  
45    private StringBuffer   tokensb = new StringBuffer  ();
46  
47    private int min_ngram_length = DEFAULT_MIN_NGRAM_LENGTH;
48  
49    private int max_ngram_length = DEFAULT_MAX_NGRAM_LENGTH;
50  
51    private int ngramcount = 0;
52  
53    static final String   NGRAM_FILE_EXTENSION = "ngp";
54  
55    static final int NGRAM_LENGTH = 1000;
56  
57    //separator char
58    static final char SEPARATOR = '_';
59  
60    //default min length of ngram
61    static final int DEFAULT_MIN_NGRAM_LENGTH = 1;
62  
63    //default max length of ngram
64    static final int DEFAULT_MAX_NGRAM_LENGTH = 4;
65  
66    //table to store ngrams
67    Hashtable   ngrams = null;
68  
69    /**
70     * private class used to store NGramEntry
71     */
72    class NGramEntry implements Comparable   {
73      private CharSequence   seq;
74  
75      private int count;
76  
77      private float normalized_count;
78  
79      public NGramEntry(CharSequence   seq) {
80        this.seq = seq;
81      }
82  
83      /**
84       * @param ngramsequence
85       * @param ngramcount
86       */
87      public NGramEntry(String   ngramsequence, int ngramcount) {
88        seq = new StringBuffer  (ngramsequence).subSequence(0, ngramsequence
89            .length());
90        this.count = ngramcount;
91      }
92  
93      public int getCount() {
94        return count;
95      }
96  
97      public CharSequence   getSeq() {
98        return seq;
99      }
100 
101     public int compareTo(Object   o) {
102       if (((NGramEntry) o).count - count != 0)
103         return ((NGramEntry) o).count - count;
104       else
105         return (seq.toString().compareTo(((NGramEntry) o).seq.toString()));
106     }
107 
108     public void inc() {
109       count++;
110     }
111   }
112 
113   /**
114    * Construct a new ngram profile
115    * 
116    * @param name
117    *          Name of profile
118    */
119   public NGramProfile(String   name) {
120     this(name, DEFAULT_MIN_NGRAM_LENGTH, DEFAULT_MAX_NGRAM_LENGTH);
121   }
122 
123   /**
124    * Construct a new ngram profile
125    * 
126    * @param name
127    *          Name of profile
128    * @param minlen
129    *          min length of ngram sequences
130    * @param maxlen
131    *          max length of ngram sequences
132    */
133   public NGramProfile(String   name, int minlen, int maxlen) {
134     ngrams = new Hashtable  ();
135     this.max_ngram_length = maxlen;
136     this.min_ngram_length = minlen;
137     this.name = name;
138   }
139 
140   /**
141    * Add ngrams from a token to this profile
142    * 
143    * @param t
144    *          Token to be added
145    */
146   public void addFromToken(Token t) {
147     tokensb.setLength(0);
148     tokensb.append(SEPARATOR).append(t.termText()).append(SEPARATOR);
149     addNGrams(tokensb);
150   }
151 
152   /**
153    * Analyze a piece of text
154    * 
155    * @param text
156    *          the text to be analyzed
157    */
158   public void analyze(StringBuffer   text) {
159     StringBuffer   word;
160     int i;
161 
162     if (ngrams != null) {
163       ngrams.clear();
164     }
165 
166     word = new StringBuffer  ().append(SEPARATOR);
167     for (i = 0; i < text.length(); i++) {
168       char c = Character.toLowerCase(text.charAt(i));
169 
170       if (Character.isLetter(c)) {
171         word.append(c);
172       } else {
173         //found word boundary
174         if (word.length() > 1) {
175           //we have a word!
176           word.append(SEPARATOR);
177           addNGrams(word);
178           word.delete(0, word.length());
179         }
180       }
181     }
182 
183     if (word.length() > 1) {
184       //we have a last word
185       word.append(SEPARATOR);
186       addNGrams(word);
187     }
188     normalize();
189   }
190 
191   /**
192    * Normalize profile
193    */
194   protected void normalize() {
195     Vector   sorted = getSorted();
196     int sum = 0;
197 
198     //only calculate ngramcount if it was not available in profile
199     if (ngramcount == 0) {
200       for (int i = 0; i < sorted.size(); i++) {
201         ngramcount += ((NGramEntry) sorted.get(i)).count;
202       }
203     }
204 
205     if (sorted.size() > 0) {
206       Iterator   i = sorted.iterator();
207 
208       while (i.hasNext()) {
209         NGramEntry e = (NGramEntry) i.next();
210         e.normalized_count = e.count / (float)ngramcount;
211       }
212     }
213   }
214 
215   /**
216    * Add ngrams from a single word to this profile
217    * 
218    * @param word
219    */
220   public void addNGrams(StringBuffer   word) {
221     int i;
222 
223     for (i = min_ngram_length; i <= max_ngram_length && i < word.length(); i++) {
224       addNGrams(word, i);
225     }
226   }
227 
228   /**
229    * @param word
230    * @param n
231    *          sequence length
232    */
233   private void addNGrams(StringBuffer   word, int n) {
234     NGramEntry nge;
235     StringBuffer   sb;
236     int i;
237 
238     for (i = 0; i <= word.length() - n; i++) {
239 
240       CharSequence   cs = word.subSequence(i, i + n);
241 
242       if (ngrams.containsKey(cs)) {
243         nge = (NGramEntry) ngrams.get(cs);
244       } else {
245         nge = new NGramEntry(cs);
246       }
247       nge.inc();
248       ngrams.put(cs, nge);
249     }
250   }
251 
252   /**
253    * Return sorted vector of ngrams (sort done by 1. count 2. sequence)
254    * 
255    * @return sorted vector of ngrams
256    */
257   public Vector   getSorted() {
258     //make sure srting is done only once
259     if (sorted == null) {
260       sorted = new Vector  (ngrams.values());
261       Collections.sort(sorted);
262 
263       //trim at NGRAM_LENGTH entries
264       if (sorted.size() > NGRAM_LENGTH)
265         sorted.setSize(NGRAM_LENGTH);
266     }
267 
268     return sorted;
269   }
270 
271   /**
272    * Return ngramprofile as text
273    * 
274    * @return ngramprofile as text
275    */
276   public String   toString() {
277     StringBuffer   s = new StringBuffer  ();
278 
279     Iterator   i = getSorted().iterator();
280 
281     s.append("NGramProfile: ").append(name).append("\n");
282     while (i.hasNext()) {
283       NGramEntry entry = (NGramEntry) i.next();
284       s.append(entry.count).append(':').append(entry.seq).append(" ").append(
285           entry.normalized_count).append("\n");
286     }
287     return s.toString();
288   }
289 
290   /**
291    * Calculate a score how well NGramProfiles match each other
292    * 
293    * @param another
294    *          ngram profile to compare against
295    * @return similarity 0=exact match
296    */
297   public float getSimilarity(NGramProfile another) {
298     float sum = 0;
299 
300     try {
301       Iterator   i = another.getSorted().iterator();
302       while (i.hasNext()) {
303         NGramEntry other = (NGramEntry) i.next();
304         if (ngrams.containsKey(other.seq)) {
305           sum += Math.abs((other.normalized_count - ((NGramEntry) ngrams
306               .get(other.seq)).normalized_count)) / 2;
307         } else {
308           sum += other.normalized_count;
309         }
310       }
311       i = getSorted().iterator();
312       while (i.hasNext()) {
313         NGramEntry other = (NGramEntry) i.next();
314         if (another.ngrams.containsKey(other.seq)) {
315           sum += Math
316               .abs((other.normalized_count - ((NGramEntry) another.ngrams
317                   .get(other.seq)).normalized_count)) / 2;
318         } else {
319           sum += other.normalized_count;
320         }
321       }
322     } catch (Exception   e) {
323       LOG.severe(e.toString());
324     }
325     return sum;
326   }
327 
328   /**
329    * Loads a ngram profile from InputStream (assumes UTF-8 encoded content)
330    */
331   public void load(InputStream   is) throws IOException   {
332     BufferedReader   bis = new BufferedReader  (new InputStreamReader  (is, "UTF-8"));
333     String   line;
334 
335     ngrams.clear();
336 
337     while ((line = bis.readLine()) != null) {
338 
339       // # starts a comment line
340       if (line.charAt(0) != '#') {
341         int spacepos = line.indexOf(' ');
342         String   ngramsequence = line.substring(0, spacepos).trim();
343         int ngramcount = Integer.parseInt(line.substring(spacepos + 1));
344 
345         if (!line.startsWith("ngram_count")) {
346           NGramEntry en = new NGramEntry(ngramsequence, ngramcount);
347           ngrams.put(en.getSeq(), en);
348         } else {
349           this.ngramcount = ngramcount;
350         }
351       }
352     }
353     normalize();
354   }
355 
356   /**
357    * Create a new Language profile from (preferably quite large) text file
358    * 
359    * @param name
360    *          name of profile
361    * @param is
362    * @param encoding
363    *          encoding of stream
364    */
365   public static NGramProfile createNgramProfile(String   name, InputStream   is,
366       String   encoding) {
367 
368     NGramProfile newProfile = new NGramProfile(name);
369     BufferedInputStream   bis = new BufferedInputStream  (is);
370 
371     byte buffer[] = new byte[4096];
372     StringBuffer   text = new StringBuffer  ();
373     int len;
374 
375     try {
376       while ((len = bis.read(buffer)) != -1) {
377         text.append(new String  (buffer, 0, len, encoding));
378       }
379     } catch (IOException   e) {
380       e.printStackTrace();
381     }
382 
383     newProfile.analyze(text);
384 
385     return newProfile;
386   }
387 
388   /**
389    * Writes NGramProfile content into OutputStream, content is outputted with
390    * UTF-8 encoding
391    * 
392    * @param os
393    *          Stream to output to
394    * @throws IOException
395    */
396 
397   public void save(OutputStream   os) throws IOException   {
398     Vector   v = getSorted();
399     Iterator   i = v.iterator();
400     os
401         .write(("# NgramProfile generated at " + new Date  () + " for Nutch Language Identification\n")
402             .getBytes());
403     os.write(("ngram_count " + ngramcount + "\n").getBytes());
404 
405     while (i.hasNext()) {
406       NGramEntry e = (NGramEntry) i.next();
407       String   line = e.getSeq().toString() + " " + e.getCount() + "\n";
408       os.write(line.getBytes("UTF-8"));
409     }
410 
411     os.flush();
412   }
413 
414   /**
415    * main method used for testing only
416    * 
417    * @param args
418    */
419   public static void main(String   args[]) {
420 
421     String   usage = "Usage: NGramProfile [-create profilename filename encoding] [-similarity file1 file2] [-score profile-name filename encoding]";
422     int command = 0;
423 
424     final int CREATE = 1;
425     final int SIMILARITY = 2;
426     final int SCORE = 3;
427 
428     String   profilename = "";
429     String   filename = "";
430     String   filename2 = "";
431     String   encoding = "";
432 
433     if (args.length == 0) {
434       System.err.println(usage);
435       System.exit(-1);
436     }
437 
438     for (int i = 0; i < args.length; i++) { // parse command line
439       if (args[i].equals("-create")) { // found -create option
440         command = CREATE;
441         profilename = args[++i];
442         filename = args[++i];
443         encoding = args[++i];
444       }
445 
446       if (args[i].equals("-similarity")) { // found -similarity option
447         command = SIMILARITY;
448         filename = args[++i];
449         filename2 = args[++i];
450         encoding = args[++i];
451       }
452 
453       if (args[i].equals("-score")) { // found -Score option
454         command = SCORE;
455         profilename = args[++i];
456         filename = args[++i];
457         encoding = args[++i];
458       }
459     }
460 
461     try {
462 
463       switch (command) {
464 
465       case CREATE:
466 
467         File   f = new File  (filename);
468         FileInputStream   fis = new FileInputStream  (f);
469         NGramProfile newProfile = NGramProfile.createNgramProfile(profilename,
470             fis, encoding);
471         fis.close();
472         f = new File  (profilename + "." + NGRAM_FILE_EXTENSION);
473         FileOutputStream   fos = new FileOutputStream  (f);
474         newProfile.save(fos);
475         System.out.println("new profile " + profilename + "."
476             + NGRAM_FILE_EXTENSION + " was created.");
477         break;
478 
479       case SIMILARITY:
480 
481         f = new File  (filename);
482         fis = new FileInputStream  (f);
483         newProfile = NGramProfile.createNgramProfile(filename, fis, encoding);
484         newProfile.normalize();
485 
486         f = new File  (filename2);
487         fis = new FileInputStream  (f);
488         NGramProfile newProfile2 = NGramProfile.createNgramProfile(filename2,
489             fis, encoding);
490         newProfile2.normalize();
491         System.out.println("Similarity is "
492             + newProfile.getSimilarity(newProfile2));
493         break;
494 
495       case SCORE:
496         f = new File  (filename);
497         fis = new FileInputStream  (f);
498         newProfile = NGramProfile.createNgramProfile(filename, fis, encoding);
499 
500         f = new File  (profilename + "." + NGRAM_FILE_EXTENSION);
501         fis = new FileInputStream  (f);
502         NGramProfile compare = new NGramProfile(profilename);
503         compare.load(fis);
504         System.out.println("Score is " + compare.getSimilarity(newProfile));
505 
506         break;
507 
508       }
509 
510     } catch (Exception   e) {
511       LOG.severe("Caught an exception:" + e);
512     }
513   }
514 
515   /**
516    * @return Returns the name.
517    */
518   public String   getName() {
519     return name;
520   }
521 
522   /**
523    * @param name
524    *          The name to set.
525    */
526   public void setName(String   name) {
527     this.name = name;
528   }
529 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags