package net.nutch.analysis.lang;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.util.Date;
import java.util.Collections;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Vector;
import java.util.logging.Logger;

import net.nutch.util.LogFormatter;

import org.apache.lucene.analysis.Token;

/**
 * A profile of character n-gram counts collected from text.
 *
 * <p>A profile can be built from raw text ({@link #analyze(StringBuffer)}),
 * from tokens ({@link #addFromToken(Token)}), or read back from a stream
 * written by {@link #save(OutputStream)} ({@link #load(InputStream)}). Two
 * profiles are compared with {@link #getSimilarity(NGramProfile)}.
 *
 * <p>Words are wrapped in {@link #SEPARATOR} characters before n-grams are
 * extracted, so n-grams at the start and end of a word are distinguishable
 * from those in the middle.
 */
public class NGramProfile {

  public static final Logger LOG = LogFormatter
      .getLogger("net.nutch.analysis.lang.NGramProfile");

  /** File extension used by {@link #main(String[])} for saved profiles. */
  static final String NGRAM_FILE_EXTENSION = "ngp";

  /** Maximum number of entries kept in the sorted list (see {@link #getSorted()}). */
  static final int NGRAM_LENGTH = 1000;

  /** Character placed before and after every word. */
  static final char SEPARATOR = '_';

  /** Smallest n-gram length used by the single-argument constructor. */
  static final int DEFAULT_MIN_NGRAM_LENGTH = 1;

  /** Largest n-gram length used by the single-argument constructor. */
  static final int DEFAULT_MAX_NGRAM_LENGTH = 4;

  private String name;

  /** Cached result of {@link #getSorted()}; null means "recompute". */
  private Vector sorted = null;

  /** Scratch buffer reused by {@link #addFromToken(Token)}. */
  private StringBuffer tokensb = new StringBuffer();

  private int min_ngram_length = DEFAULT_MIN_NGRAM_LENGTH;

  private int max_ngram_length = DEFAULT_MAX_NGRAM_LENGTH;

  /** Total used as the divisor in {@link #normalize()}; 0 means "recompute". */
  private int ngramcount = 0;

  /** Maps an n-gram (a String) to its {@link NGramEntry}. */
  Hashtable ngrams = null;

  /**
   * One n-gram together with its raw and normalized counts. Entries order by
   * descending count, ties broken by the n-gram text.
   */
  class NGramEntry implements Comparable {
    private CharSequence seq;

    private int count;

    private float normalized_count;

    public NGramEntry(CharSequence seq) {
      this.seq = seq;
    }

    /**
     * @param ngramsequence the n-gram text
     * @param ngramcount    the initial count for this n-gram
     */
    public NGramEntry(String ngramsequence, int ngramcount) {
      // A String is stored so the entry matches the String keys used in the
      // ngrams table.
      this.seq = ngramsequence;
      this.count = ngramcount;
    }

    public int getCount() {
      return count;
    }

    public CharSequence getSeq() {
      return seq;
    }

    public int compareTo(Object o) {
      NGramEntry other = (NGramEntry) o;
      if (other.count != count) {
        // Explicit comparison instead of subtraction, which can overflow.
        return other.count > count ? 1 : -1;
      }
      return seq.toString().compareTo(other.seq.toString());
    }

    public void inc() {
      count++;
    }
  }

  /**
   * Creates an empty profile using the default n-gram length range.
   *
   * @param name name of the profile
   */
  public NGramProfile(String name) {
    this(name, DEFAULT_MIN_NGRAM_LENGTH, DEFAULT_MAX_NGRAM_LENGTH);
  }

  /**
   * Creates an empty profile.
   *
   * @param name   name of the profile
   * @param minlen smallest n-gram length to collect
   * @param maxlen largest n-gram length to collect
   */
  public NGramProfile(String name, int minlen, int maxlen) {
    ngrams = new Hashtable();
    this.max_ngram_length = maxlen;
    this.min_ngram_length = minlen;
    this.name = name;
  }

  /**
   * Adds the n-grams of one token's term text, wrapped in separators.
   *
   * @param t the token whose term text is added
   */
  public void addFromToken(Token t) {
    tokensb.setLength(0);
    tokensb.append(SEPARATOR).append(t.termText()).append(SEPARATOR);
    addNGrams(tokensb);
  }

  /**
   * Rebuilds this profile from the given text. Existing n-grams are
   * discarded; the text is lower-cased, split into runs of letters, and each
   * run is added wrapped in separators. The profile is normalized afterwards.
   *
   * @param text the text to analyze
   */
  public void analyze(StringBuffer text) {
    if (ngrams != null) {
      ngrams.clear();
    }
    invalidateDerivedState();

    StringBuffer word = new StringBuffer().append(SEPARATOR);
    for (int i = 0; i < text.length(); i++) {
      char c = Character.toLowerCase(text.charAt(i));

      if (Character.isLetter(c)) {
        word.append(c);
      } else if (word.length() > 1) {
        // Word boundary reached with at least one letter collected.
        word.append(SEPARATOR);
        addNGrams(word);
        // Start the next word with its leading separator; emptying the
        // buffer completely would drop that separator for every word but
        // the first.
        word.setLength(0);
        word.append(SEPARATOR);
      }
    }

    // Flush a word that runs up to the end of the text.
    if (word.length() > 1) {
      word.append(SEPARATOR);
      addNGrams(word);
    }
    normalize();
  }

  /**
   * Computes the normalized count of every entry in the sorted list as
   * count / total. The total is summed from the sorted list unless it is
   * already non-zero (for example, read by {@link #load(InputStream)}).
   */
  protected void normalize() {
    Vector entries = getSorted();

    if (ngramcount == 0) {
      for (int i = 0; i < entries.size(); i++) {
        ngramcount += ((NGramEntry) entries.get(i)).count;
      }
    }

    if (ngramcount == 0) {
      // Nothing counted; avoid dividing by zero.
      return;
    }

    for (Iterator i = entries.iterator(); i.hasNext();) {
      NGramEntry e = (NGramEntry) i.next();
      e.normalized_count = e.count / (float) ngramcount;
    }
  }

  /**
   * Adds the n-grams of the given word for every length from the minimum up
   * to the maximum n-gram length, limited to lengths shorter than the word.
   *
   * @param word the word, including any separators
   */
  public void addNGrams(StringBuffer word) {
    // The counts are about to change, so cached ranking and total are stale.
    invalidateDerivedState();

    for (int n = min_ngram_length; n <= max_ngram_length && n < word.length(); n++) {
      addNGrams(word, n);
    }
  }

  /**
   * Adds every n-gram of length n found in the word.
   *
   * @param word the word, including any separators
   * @param n    the n-gram length
   */
  private void addNGrams(StringBuffer word, int n) {
    for (int i = 0; i <= word.length() - n; i++) {
      String ngram = word.substring(i, i + n);

      NGramEntry entry = (NGramEntry) ngrams.get(ngram);
      if (entry == null) {
        entry = new NGramEntry(ngram);
        ngrams.put(ngram, entry);
      }
      entry.inc();
    }
  }

  /** Drops the cached sorted list and total so they are recomputed on demand. */
  private void invalidateDerivedState() {
    sorted = null;
    ngramcount = 0;
  }

  /**
   * Returns the entries sorted by descending count, truncated to at most
   * {@link #NGRAM_LENGTH} entries. The list is cached until the profile
   * changes.
   *
   * @return the sorted vector of {@link NGramEntry} objects
   */
  public Vector getSorted() {
    if (sorted == null) {
      sorted = new Vector(ngrams.values());
      Collections.sort(sorted);

      if (sorted.size() > NGRAM_LENGTH) {
        sorted.setSize(NGRAM_LENGTH);
      }
    }

    return sorted;
  }

  /**
   * Returns the profile name followed by one line per sorted entry in the
   * form {@code count:ngram normalized_count}.
   */
  public String toString() {
    StringBuffer s = new StringBuffer();

    s.append("NGramProfile: ").append(name).append("\n");
    for (Iterator i = getSorted().iterator(); i.hasNext();) {
      NGramEntry entry = (NGramEntry) i.next();
      s.append(entry.count).append(':').append(entry.seq).append(" ").append(
          entry.normalized_count).append("\n");
    }
    return s.toString();
  }

  /**
   * Computes a distance between this profile and another one. For each entry
   * in either sorted list, half the absolute difference of the normalized
   * counts is added when the other profile contains the n-gram, and the full
   * normalized count otherwise. Identical normalized profiles give 0; larger
   * values mean less alike.
   *
   * <p>Any exception is logged and the sum accumulated so far is returned.
   *
   * @param another the profile to compare with
   * @return the accumulated distance
   */
  public float getSimilarity(NGramProfile another) {
    float sum = 0;

    try {
      // Entries of the other profile measured against this one.
      for (Iterator i = another.getSorted().iterator(); i.hasNext();) {
        NGramEntry other = (NGramEntry) i.next();
        NGramEntry mine = (NGramEntry) ngrams.get(other.seq);
        if (mine != null) {
          sum += Math.abs(other.normalized_count - mine.normalized_count) / 2;
        } else {
          sum += other.normalized_count;
        }
      }
      // Entries of this profile measured against the other one.
      for (Iterator i = getSorted().iterator(); i.hasNext();) {
        NGramEntry mine = (NGramEntry) i.next();
        NGramEntry other = (NGramEntry) another.ngrams.get(mine.seq);
        if (other != null) {
          sum += Math.abs(mine.normalized_count - other.normalized_count) / 2;
        } else {
          sum += mine.normalized_count;
        }
      }
    } catch (Exception e) {
      LOG.severe(e.toString());
    }
    return sum;
  }

  /**
   * Replaces the contents of this profile with entries read from a UTF-8
   * stream in the format written by {@link #save(OutputStream)}: lines
   * starting with {@code #} are comments, a line starting with
   * {@code ngram_count} carries the total, and every other line is
   * {@code ngram count}. Blank lines are ignored; malformed lines are logged
   * and skipped. The stream is not closed.
   *
   * @param is the stream to read from
   * @throws IOException if reading fails
   */
  public void load(InputStream is) throws IOException {
    BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
    String line;

    ngrams.clear();
    invalidateDerivedState();

    while ((line = reader.readLine()) != null) {
      if (line.length() == 0 || line.charAt(0) == '#') {
        continue;
      }

      int spacepos = line.indexOf(' ');
      if (spacepos < 0) {
        LOG.warning("Skipping malformed n-gram line: " + line);
        continue;
      }

      String ngramsequence = line.substring(0, spacepos).trim();
      int count;
      try {
        count = Integer.parseInt(line.substring(spacepos + 1).trim());
      } catch (NumberFormatException e) {
        LOG.warning("Skipping n-gram line with invalid count: " + line);
        continue;
      }

      if (!line.startsWith("ngram_count")) {
        NGramEntry en = new NGramEntry(ngramsequence, count);
        ngrams.put(en.getSeq(), en);
      } else {
        this.ngramcount = count;
      }
    }
    normalize();
  }

  /**
   * Creates a profile by analyzing all text read from a stream. If reading
   * fails, the error is logged and the text read so far is analyzed. The
   * stream is not closed.
   *
   * @param name     name of the new profile
   * @param is       the stream to read text from
   * @param encoding charset name used to decode the stream
   * @return the new profile
   */
  public static NGramProfile createNgramProfile(String name, InputStream is,
      String encoding) {

    NGramProfile newProfile = new NGramProfile(name);
    StringBuffer text = new StringBuffer();
    char buffer[] = new char[4096];
    int len;

    try {
      // Decode through a single reader: decoding each byte chunk on its own
      // corrupts multi-byte characters that straddle a chunk boundary.
      BufferedReader reader = new BufferedReader(new InputStreamReader(is, encoding));
      while ((len = reader.read(buffer)) != -1) {
        text.append(buffer, 0, len);
      }
    } catch (IOException e) {
      LOG.severe("Error reading text for profile " + name + ": " + e);
    }

    newProfile.analyze(text);

    return newProfile;
  }

  /**
   * Writes this profile as UTF-8 text: a comment header, an
   * {@code ngram_count} line, then one {@code ngram count} line per sorted
   * entry. The stream is flushed but not closed.
   *
   * @param os the stream to write to
   * @throws IOException if writing fails
   */
  public void save(OutputStream os) throws IOException {
    // Every line is written as UTF-8 because load() reads UTF-8.
    os.write(("# NgramProfile generated at " + new Date() + " for Nutch Language Identification\n")
        .getBytes("UTF-8"));
    os.write(("ngram_count " + ngramcount + "\n").getBytes("UTF-8"));

    for (Iterator i = getSorted().iterator(); i.hasNext();) {
      NGramEntry e = (NGramEntry) i.next();
      String line = e.getSeq().toString() + " " + e.getCount() + "\n";
      os.write(line.getBytes("UTF-8"));
    }

    os.flush();
  }

  /**
   * Reads a file and builds a profile from its text, closing the file
   * afterwards.
   */
  private static NGramProfile profileFromFile(String name, String filename,
      String encoding) throws IOException {
    FileInputStream fis = new FileInputStream(new File(filename));
    try {
      return createNgramProfile(name, fis, encoding);
    } finally {
      fis.close();
    }
  }

  /**
   * Command-line entry point: creates a profile file from a text file,
   * prints the similarity of two text files, or prints the score of a text
   * file against a saved profile.
   *
   * @param args command-line arguments, see the usage text
   */
  public static void main(String args[]) {

    // -similarity reads an encoding argument too, so the usage text lists it.
    String usage = "Usage: NGramProfile [-create profilename filename encoding] [-similarity file1 file2 encoding] [-score profile-name filename encoding]";
    int command = 0;

    final int CREATE = 1;
    final int SIMILARITY = 2;
    final int SCORE = 3;

    String profilename = "";
    String filename = "";
    String filename2 = "";
    String encoding = "";

    if (args.length == 0) {
      System.err.println(usage);
      System.exit(-1);
    }

    for (int i = 0; i < args.length; i++) {
      boolean create = args[i].equals("-create");
      boolean similarity = args[i].equals("-similarity");
      boolean score = args[i].equals("-score");
      if (!create && !similarity && !score) {
        continue;
      }

      // Every option takes exactly three values.
      if (i + 3 >= args.length) {
        System.err.println(usage);
        System.exit(-1);
      }

      if (create) {
        command = CREATE;
        profilename = args[++i];
        filename = args[++i];
      } else if (similarity) {
        command = SIMILARITY;
        filename = args[++i];
        filename2 = args[++i];
      } else {
        command = SCORE;
        profilename = args[++i];
        filename = args[++i];
      }
      encoding = args[++i];
    }

    if (command == 0) {
      // No recognised option was given.
      System.err.println(usage);
      System.exit(-1);
    }

    try {

      switch (command) {

      case CREATE: {
        NGramProfile newProfile = profileFromFile(profilename, filename, encoding);
        File f = new File(profilename + "." + NGRAM_FILE_EXTENSION);
        FileOutputStream fos = new FileOutputStream(f);
        try {
          newProfile.save(fos);
        } finally {
          fos.close();
        }
        System.out.println("new profile " + profilename + "."
            + NGRAM_FILE_EXTENSION + " was created.");
        break;
      }

      case SIMILARITY: {
        // createNgramProfile() already normalizes via analyze().
        NGramProfile newProfile = profileFromFile(filename, filename, encoding);
        NGramProfile newProfile2 = profileFromFile(filename2, filename2, encoding);
        System.out.println("Similarity is "
            + newProfile.getSimilarity(newProfile2));
        break;
      }

      case SCORE: {
        NGramProfile newProfile = profileFromFile(filename, filename, encoding);

        NGramProfile compare = new NGramProfile(profilename);
        FileInputStream fis = new FileInputStream(new File(profilename + "."
            + NGRAM_FILE_EXTENSION));
        try {
          compare.load(fis);
        } finally {
          fis.close();
        }
        System.out.println("Score is " + compare.getSimilarity(newProfile));
        break;
      }

      }

    } catch (Exception e) {
      LOG.severe("Caught an exception:" + e);
    }
  }

  /**
   * @return the name of this profile
   */
  public String getName() {
    return name;
  }

  /**
   * @param name the new name of this profile
   */
  public void setName(String name) {
    this.name = name;
  }
}