1 17 18 19 20 package org.apache.fop.hyphenation; 21 22 import java.io.BufferedReader ; 23 import java.io.File ; 24 import java.io.FileInputStream ; 25 import java.io.FileOutputStream ; 26 import java.io.FileReader ; 27 import java.io.IOException ; 28 import java.io.ObjectInputStream ; 29 import java.io.ObjectOutputStream ; 30 import java.io.Serializable ; 31 import java.net.MalformedURLException ; 32 import java.util.ArrayList ; 33 import java.util.HashMap ; 34 35 import org.xml.sax.InputSource ; 36 37 44 public class HyphenationTree extends TernaryTree 45 implements PatternConsumer, Serializable { 46 47 private static final long serialVersionUID = -7842107987915665573L; 48 49 52 protected ByteVector vspace; 53 54 57 protected HashMap stoplist; 58 59 62 protected TernaryTree classmap; 63 64 67 private transient TernaryTree ivalues; 68 69 public HyphenationTree() { 70 stoplist = new HashMap (23); classmap = new TernaryTree(); 72 vspace = new ByteVector(); 73 vspace.alloc(1); } 75 76 85 protected int packValues(String values) { 86 int i, n = values.length(); 87 int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1; 88 int offset = vspace.alloc(m); 89 byte[] va = vspace.getArray(); 90 for (i = 0; i < n; i++) { 91 int j = i >> 1; 92 byte v = (byte)((values.charAt(i) - '0' + 1) & 0x0f); 93 if ((i & 1) == 1) { 94 va[j + offset] = (byte)(va[j + offset] | v); 95 } else { 96 va[j + offset] = (byte)(v << 4); } 98 } 99 va[m - 1 + offset] = 0; return offset; 101 } 102 103 protected String unpackValues(int k) { 104 StringBuffer buf = new StringBuffer (); 105 byte v = vspace.get(k++); 106 while (v != 0) { 107 char c = (char)((v >>> 4) - 1 + '0'); 108 buf.append(c); 109 c = (char)(v & 0x0f); 110 if (c == 0) { 111 break; 112 } 113 c = (char)(c - 1 + '0'); 114 buf.append(c); 115 v = vspace.get(k++); 116 } 117 return buf.toString(); 118 } 119 120 125 public void loadPatterns(String filename) throws HyphenationException { 126 File f = new File (filename); 127 try { 128 InputSource src = new InputSource (f.toURL().toExternalForm()); 129 loadPatterns(src); 130 } catch (MalformedURLException e) { 131 throw new HyphenationException("Error converting the File '" + f + "' to a URL: " 132 + e.getMessage()); 133 } 134 } 135 136 141 public void loadPatterns(InputSource source) throws HyphenationException { 142 PatternParser pp = new PatternParser(this); 143 ivalues = new TernaryTree(); 144 145 pp.parse(source); 146 147 trimToSize(); 150 vspace.trimToSize(); 151 classmap.trimToSize(); 152 153 ivalues = null; 155 } 156 157 public String findPattern(String pat) { 158 int k = super.find(pat); 159 if (k >= 0) { 160 return unpackValues(k); 161 } 162 return ""; 163 } 164 165 169 protected int hstrcmp(char[] s, int si, char[] t, int ti) { 170 for (; s[si] == t[ti]; si++, ti++) { 171 if (s[si] == 0) { 172 return 0; 173 } 174 } 175 if (t[ti] == 0) { 176 return 0; 177 } 178 return s[si] - t[ti]; 179 } 180 181 protected byte[] getValues(int k) { 182 StringBuffer buf = new StringBuffer (); 183 byte v = vspace.get(k++); 184 while (v != 0) { 185 char c = (char)((v >>> 4) - 1); 186 buf.append(c); 187 c = (char)(v & 0x0f); 188 if (c == 0) { 189 break; 190 } 191 c = (char)(c - 1); 192 buf.append(c); 193 v = vspace.get(k++); 194 } 195 byte[] res = new byte[buf.length()]; 196 for (int i = 0; i < res.length; i++) { 197 res[i] = (byte)buf.charAt(i); 198 } 199 return res; 200 } 201 202 226 protected void searchPatterns(char[] word, int index, byte[] il) { 227 byte[] values; 228 int i = index; 229 char p, q; 230 char sp = word[i]; 231 p = root; 232 233 while (p > 0 && p < sc.length) { 234 if (sc[p] == 0xFFFF) { 235 if (hstrcmp(word, i, kv.getArray(), lo[p]) == 0) { 236 values = getValues(eq[p]); int j = index; 238 for (int k = 0; k < values.length; k++) { 239 if (j < il.length && values[k] > il[j]) { 240 il[j] = values[k]; 241 } 242 j++; 243 } 244 } 245 return; 246 } 247 int d = sp - sc[p]; 248 if (d == 0) { 249 if (sp == 0) { 250 break; 251 } 252 sp = word[++i]; 253 p = eq[p]; 254 q = p; 255 256 while (q > 0 && q < sc.length) { 259 if (sc[q] == 0xFFFF) { break; 261 } 262 if (sc[q] == 0) { 263 values = getValues(eq[q]); 264 int j = index; 265 for (int k = 0; k < values.length; k++) { 266 if (j < il.length && values[k] > il[j]) { 267 il[j] = values[k]; 268 } 269 j++; 270 } 271 break; 272 } else { 273 q = lo[q]; 274 275 280 } 281 } 282 } else { 283 p = d < 0 ? lo[p] : hi[p]; 284 } 285 } 286 } 287 288 298 public Hyphenation hyphenate(String word, int remainCharCount, 299 int pushCharCount) { 300 char[] w = word.toCharArray(); 301 return hyphenate(w, 0, w.length, remainCharCount, pushCharCount); 302 } 303 304 326 327 339 public Hyphenation hyphenate(char[] w, int offset, int len, 340 int remainCharCount, int pushCharCount) { 341 int i; 342 char[] word = new char[len + 3]; 343 344 char[] c = new char[2]; 346 int iIgnoreAtBeginning = 0; 347 int iLength = len; 348 boolean bEndOfLetters = false; 349 for (i = 1; i <= len; i++) { 350 c[0] = w[offset + i - 1]; 351 int nc = classmap.find(c, 0); 352 if (nc < 0) { if (i == (1 + iIgnoreAtBeginning)) { 354 iIgnoreAtBeginning ++; 356 } else { 357 bEndOfLetters = true; 359 } 360 iLength --; 361 } else { 362 if (!bEndOfLetters) { 363 word[i - iIgnoreAtBeginning] = (char)nc; 364 } else { 365 return null; 366 } 367 } 368 } 369 len = iLength; 370 if (len < (remainCharCount + pushCharCount)) { 371 return null; 373 } 374 int[] result = new int[len + 1]; 375 int k = 0; 376 377 String sw = new String (word, 1, len); 379 if (stoplist.containsKey(sw)) { 380 ArrayList hw = (ArrayList )stoplist.get(sw); 382 int j = 0; 383 for (i = 0; i < hw.size(); i++) { 384 Object o = hw.get(i); 385 if (o instanceof String ) { 388 j += ((String )o).length(); 389 if (j >= remainCharCount && j < (len - pushCharCount)) { 390 result[k++] = j + iIgnoreAtBeginning; 391 } 392 } 393 } 394 } else { 395 word[0] = '.'; word[len + 1] = '.'; word[len + 2] = 0; byte[] il = new byte[len + 3]; for (i = 0; i < len + 1; i++) { 401 searchPatterns(word, i, il); 402 } 403 404 for (i = 0; i < len; i++) { 409 if (((il[i + 1] & 1) == 1) && i >= remainCharCount 410 && i <= (len - pushCharCount)) { 411 result[k++] = i + iIgnoreAtBeginning; 412 } 413 } 414 } 415 416 417 if (k > 0) { 418 int[] res = new int[k]; 420 System.arraycopy(result, 0, res, 0, k); 421 return new Hyphenation(new String (w, offset, len), res); 422 } else { 423 return null; 424 } 425 } 426 427 439 public void addClass(String chargroup) { 440 if (chargroup.length() > 0) { 441 char equivChar = chargroup.charAt(0); 442 char[] key = new char[2]; 443 key[1] = 0; 444 for (int i = 0; i < chargroup.length(); i++) { 445 key[0] = chargroup.charAt(i); 446 classmap.insert(key, 0, equivChar); 447 } 448 } 449 } 450 451 459 public void addException(String word, ArrayList hyphenatedword) { 460 stoplist.put(word, hyphenatedword); 461 } 462 463 473 public void addPattern(String pattern, String ivalue) { 474 int k = ivalues.find(ivalue); 475 if (k <= 0) { 476 k = packValues(ivalue); 477 ivalues.insert(ivalue, (char)k); 478 } 479 insert(pattern, (char)k); 480 } 481 482 public void printStats() { 483 System.out.println("Value space size = " 484 + Integer.toString(vspace.length())); 485 super.printStats(); 486 487 } 488 489 public static void main(String [] argv) throws Exception { 490 HyphenationTree ht = null; 491 int minCharCount = 2; 492 BufferedReader in = 493 new BufferedReader (new java.io.InputStreamReader (System.in)); 494 while (true) { 495 System.out.print("l:\tload patterns from XML\n" 496 + "L:\tload patterns from serialized object\n" 497 + "s:\tset minimum character count\n" 498 + "w:\twrite hyphenation tree to object file\n" 499 + "h:\thyphenate\n" 500 + "f:\tfind pattern\n" 501 + "b:\tbenchmark\n" 502 + "q:\tquit\n\n" 503 + "Command:"); 504 String token = in.readLine().trim(); 505 if (token.equals("f")) { 506 System.out.print("Pattern: "); 507 token = in.readLine().trim(); 508 System.out.println("Values: " + ht.findPattern(token)); 509 } else if (token.equals("s")) { 510 System.out.print("Minimun value: "); 511 token = in.readLine().trim(); 512 minCharCount = Integer.parseInt(token); 513 } else if (token.equals("l")) { 514 ht = new HyphenationTree(); 515 System.out.print("XML file name: "); 516 token = in.readLine().trim(); 517 ht.loadPatterns(token); 518 } else if (token.equals("L")) { 519 ObjectInputStream ois = null; 520 System.out.print("Object file name: "); 521 token = in.readLine().trim(); 522 try { 523 ois = new ObjectInputStream (new FileInputStream (token)); 524 ht = (HyphenationTree)ois.readObject(); 525 } catch (Exception e) { 526 e.printStackTrace(); 527 } finally { 528 if (ois != null) { 529 try { 530 ois.close(); 531 } catch (IOException e) { 532 } 534 } 535 } 536 } else if (token.equals("w")) { 537 System.out.print("Object file name: "); 538 token = in.readLine().trim(); 539 ObjectOutputStream oos = null; 540 try { 541 oos = new ObjectOutputStream (new FileOutputStream (token)); 542 oos.writeObject(ht); 543 } catch (Exception e) { 544 e.printStackTrace(); 545 } finally { 546 if (oos != null) { 547 try { 548 oos.flush(); 549 } catch (IOException e) { 550 } 552 try { 553 oos.close(); 554 } catch (IOException e) { 555 } 557 } 558 } 559 } else if (token.equals("h")) { 560 System.out.print("Word: "); 561 token = in.readLine().trim(); 562 System.out.print("Hyphenation points: "); 563 System.out.println(ht.hyphenate(token, minCharCount, 564 minCharCount)); 565 } else if (token.equals("b")) { 566 if (ht == null) { 567 System.out.println("No patterns have been loaded."); 568 break; 569 } 570 System.out.print("Word list filename: "); 571 token = in.readLine().trim(); 572 long starttime = 0; 573 int counter = 0; 574 try { 575 BufferedReader reader = 576 new BufferedReader (new FileReader (token)); 577 String line; 578 579 starttime = System.currentTimeMillis(); 580 while ((line = reader.readLine()) != null) { 581 Hyphenation hyp = ht.hyphenate(line, minCharCount, 583 minCharCount); 584 if (hyp != null) { 585 String hword = hyp.toString(); 586 } else { 589 } 591 counter++; 592 } 593 } catch (Exception ioe) { 594 System.out.println("Exception " + ioe); 595 ioe.printStackTrace(); 596 } 597 long endtime = System.currentTimeMillis(); 598 long result = endtime - starttime; 599 System.out.println(counter + " words in " + result 600 + " Milliseconds hyphenated"); 601 602 } else if (token.equals("q")) { 603 break; 604 } 605 } 606 607 } 608 609 } 610 | Popular Tags |