1 19 20 37 package org.htmlparser.util; 38 39 import org.htmlparser.Node; 40 import org.htmlparser.Parser; 41 import org.htmlparser.RemarkNode; 42 import org.htmlparser.StringNode; 43 import org.htmlparser.tags.EndTag; 44 import org.htmlparser.tags.LinkTag; 45 import org.htmlparser.tags.Tag; 46 47 62 public class Generate 63 { 64 67 protected Parser parser; 68 69 72 protected static final String nl = 73 System.getProperty("line.separator", "\n"); 74 75 81 public Generate() throws ParserException 82 { 83 parser = 84 new Parser("http://www.w3.org/TR/REC-html40/sgml/entities.html"); 85 parser.registerScanners(); 86 } 87 88 98 public String translate(String string) 99 { 100 int index; 101 int amp; 102 StringBuffer ret; 103 104 ret = new StringBuffer (4096); 105 106 index = 0; 107 while ((index < string.length()) 108 && (-1 != (amp = string.indexOf('&', index)))) 109 { 110 ret.append(string.substring(index, amp)); 112 if (string.startsWith(" ", amp)) 113 { 114 ret.append(" "); 115 index = amp + 6; 116 } 117 else if (string.startsWith("<", amp)) 118 { 119 ret.append("<"); 120 index = amp + 4; 121 } 122 else if (string.startsWith(">", amp)) 123 { 124 ret.append(">"); 125 index = amp + 4; 126 } 127 else if (string.startsWith("&", amp)) 128 { 129 ret.append("&"); 130 index = amp + 5; 131 } 132 else if (string.startsWith(""e;", amp)) 133 { 134 ret.append("\""); 135 index = amp + 7; 136 } 137 else if (string.startsWith("÷", amp)) 138 { 139 ret.append('\u00F7'); 140 index = amp + 8; 141 } 142 else if (string.startsWith("©", amp)) 143 { 144 ret.append('\u00A9'); 145 index = amp + 6; 146 } 147 else 148 { 149 System.out.println( 150 "unknown special character starting with " 151 + string.substring(amp, amp + 7)); 152 ret.append("&"); 153 index = amp + 1; 154 } 155 } 156 ret.append(string.substring(index)); 157 158 return (ret.toString()); 159 } 160 161 164 public void parse() throws ParserException 165 { 166 Node node; 167 StringBuffer buffer = new StringBuffer (4096); 168 169 for (NodeIterator e = parser.elements(); e.hasMoreNodes();) 172 { 173 node = e.nextNode(); 174 175 if (node instanceof StringNode) 176 { 177 StringNode stringNode = (StringNode) node; 180 buffer.append(stringNode.getText()); 182 } 183 else if (node instanceof LinkTag) 184 { 185 LinkTag linkNode = (LinkTag) node; 188 buffer.append(linkNode.getLinkText()); 190 } 191 else if (node instanceof Tag) 192 { 193 String contents = ((Tag) node).getText(); 194 if (contents.equals("BR") || contents.equals("P")) 195 buffer.append(nl); 196 } 197 else if (node instanceof EndTag) 198 { 199 String contents = ((EndTag) node).getText(); 200 if (contents.equals("BR") || contents.equals("P")) 201 buffer.append(nl); 202 } 203 else if (node instanceof RemarkNode) 204 { 205 } 206 else 207 { 208 System.out.println(); 209 System.out.println(node.toString()); 210 } 211 } 212 213 String text = translate(buffer.toString()); 214 sgml(text); 215 } 216 217 223 public int indexOfWhitespace(String string, int index) 224 { 225 int space; 226 int cr; 227 int ret; 228 229 space = string.indexOf(" ", index); 230 cr = string.indexOf(nl, index); 231 if (-1 == space) 232 ret = cr; 233 else if (-1 == cr) 234 ret = space; 235 else 236 ret = Math.min(space, cr); 237 238 return (ret); 239 } 240 241 252 public String pack(String string) 253 { 254 int index; 255 int spaces; 256 StringBuffer ret; 257 258 ret = new StringBuffer (string.length()); 259 260 if (string.startsWith("-- ")) 261 string = string.substring(3); 262 index = 0; 264 while ((index < string.length()) 265 && (-1 != (spaces = indexOfWhitespace(string, index)))) 266 { 267 ret.append(string.substring(index, spaces)); 268 ret.append(" "); 269 while ((spaces < string.length()) 270 && Character.isWhitespace(string.charAt(spaces))) 271 spaces++; 272 index = spaces; 273 } 274 if (index < string.length()) 275 ret.append(string.substring(index)); 276 277 return (ret.toString()); 278 } 279 280 285 public String pretty(String string) 286 { 287 int index; 288 int spaces; 289 StringBuffer ret; 290 291 ret = new StringBuffer (string.length()); 292 293 index = 0; 295 while ((index < string.length()) 296 && (-1 != (spaces = string.indexOf(" ", index)))) 297 { 298 ret.append(" // " + string.substring(index, spaces)); 299 if (!string.substring(index, spaces).endsWith(nl)) 300 ret.append(nl); 301 while ((spaces < string.length()) 302 && Character.isWhitespace(string.charAt(spaces))) 303 spaces++; 304 index = spaces; 305 } 306 if (index < string.length()) 307 ret.append(" // " + string.substring(index)); 308 309 return (ret.toString()); 310 } 311 312 319 public String pad(String string, char character, int length) 320 { 321 StringBuffer ret; 322 323 ret = new StringBuffer (length); 324 ret.append(string); 325 while (length > ret.length()) 326 ret.insert(0, character); 327 328 return (ret.toString()); 329 } 330 331 337 public String unicode(String string) 338 { 339 int code; 340 341 if (string.startsWith("\"&#") && string.endsWith(";\"")) 342 { 343 string = string.substring(3, string.length() - 2); 344 try 345 { 346 code = Integer.parseInt(string); 347 string = 348 "new Character ('\\u" 349 + pad(Integer.toHexString(code), '0', 4) 350 + "')"; 351 } 352 catch (Exception e) 353 { 354 e.printStackTrace(); 355 } 356 return (string); 357 } 358 else 359 return (string); 360 } 361 362 370 public void extract(String string) 371 { 372 int space; 373 String token; 374 String code; 375 int comment; 376 String description; 377 378 if (string.startsWith("<!--")) 379 System.out.println( 380 pretty(string.substring(4, string.length() - 3).trim())); 381 else if (string.startsWith("<!ENTITY")) 382 { 383 string = string.substring(8, string.length() - 3).trim(); 384 if (-1 != (space = string.indexOf(" "))) 385 { 386 token = string.substring(0, space); 387 string = string.substring(space).trim(); 388 if (string.startsWith("CDATA")) 389 { 390 string = string.substring(5).trim(); 391 if (-1 != (space = string.indexOf(" "))) 392 { 393 code = string.substring(0, space).trim(); 394 code = unicode(code); 395 string = string.substring(space).trim(); 396 System 397 .out 398 .println(" mRefChar.put (\"" + token + "\"," 399 +pad(code, ' ', code.length() + 9 - token.length()) 401 + ");" 402 + " // " 403 + pack(string)); 404 } 405 else 406 System.out.println(string); 407 } 408 else 409 System.out.println(string); 410 } 411 else 412 System.out.println(string); 413 } 414 else 415 System.out.println(string); 416 } 417 418 427 public void sgml(String string) 428 { 429 int index; 430 int begin; 431 int end; 432 StringBuffer ret; 433 434 ret = new StringBuffer (4096); 435 436 index = 0; 437 while (-1 != (begin = string.indexOf("<", index))) 438 { 439 if (-1 != (end = string.indexOf("-->", begin))) 440 { 441 extract(string.substring(begin, end + 3)); 442 index = end + 3; 443 } 444 else 445 index = begin + 1; 446 } 447 } 448 449 456 public static void main(String [] args) throws ParserException 457 { 458 Generate filter = new Generate(); 459 System.out.println("import java.util.Hashtable;"); 460 System.out.println("import java.util.Iterator;"); 461 System.out.println(); 462 System.out.println("/**"); 463 System.out.println( 464 " * Translate numeric character references and character entity references to unicode characters."); 465 System.out.println( 466 " * Based on tables found at <a HREF=\"http://www.w3.org/TR/REC-html40/sgml/entities.html\">"); 467 System.out.println( 468 " * http://www.w3.org/TR/REC-html40/sgml/entities.html</a>"); 469 System.out.println( 470 " * <p><b>Note: Do not edit! This class is created by the Generate class.</b>"); 471 System.out.println(" * <p>Typical usage:"); 472 System.out.println(" * <pre>"); 473 System.out.println( 474 " * String s = Translate.decode (getTextFromHtmlPage ());"); 475 System.out.println(" * </pre>"); 476 System.out.println( 477 " * @author <a HREF='mailto:DerrickOswald@users.sourceforge.net?subject=Character Reference Translation class'>Derrick Oswald</a>"); 478 System.out.println(" */"); 479 System.out.println("public class Translate"); 480 System.out.println("{"); 481 System.out.println(" /**"); 482 System.out.println( 483 " * Table mapping entity reference kernel to character."); 484 System.out.println( 485 " * <p><code>String</code>-><code>Character</code>"); 486 System.out.println(" */"); 487 System.out.println(" protected static Hashtable mRefChar;"); 488 System.out.println(" static"); 489 System.out.println(" {"); 490 System.out.println(" mRefChar = new Hashtable (1000);"); 491 System.out.println(); 492 filter.parse(); 493 System.out.println(" }"); 494 System.out.println(); 495 System.out.println(" /**"); 496 System.out.println( 497 " * Table mapping character to entity reference kernel."); 498 System.out.println( 499 " * <p><code>Character</code>-><code>String</code>"); 500 System.out.println(" */"); 501 System.out.println(" protected static Hashtable mCharRef;"); 502 System.out.println(" static"); 503 System.out.println(" {"); 504 System.out.println( 505 " mCharRef = new Hashtable (mRefChar.size ());"); 506 System.out.println(); 507 System.out.println( 508 " Iterator iterator = mRefChar.keySet ().iterator ();"); 509 System.out.println(" while (iterator.hasNext ())"); 510 System.out.println(" {"); 511 System.out.println( 512 " String key = (String)iterator.next ();"); 513 System.out.println( 514 " Character character = (Character)mRefChar.get (key);"); 515 System.out.println(" mCharRef.put (character, key);"); 516 System.out.println(" }"); 517 System.out.println(" }"); 518 System.out.println(); 519 System.out.println(" /**"); 520 System.out.println(" * Private constructor."); 521 System.out.println( 522 " * This class is fully static and thread safe."); 523 System.out.println(" */"); 524 System.out.println(" private Translate ()"); 525 System.out.println(" {"); 526 System.out.println(" }"); 527 System.out.println(); 528 System.out.println(" /**"); 529 System.out.println( 530 " * Convert a reference to a unicode character."); 531 System.out.println( 532 " * Convert a single numeric character reference or character entity reference"); 533 System.out.println(" * to a unicode character."); 534 System.out.println( 535 " * @param string The string to convert. Of the form &xxxx; or &#xxxx; with"); 536 System.out.println( 537 " * or without the leading ampersand or trailing semi-colon."); 538 System.out.println( 539 " * @return The converted character or '\\0' (zero) if the string is an"); 540 System.out.println(" * invalid reference."); 541 System.out.println(" */"); 542 System.out.println( 543 " public static char convertToChar (String string)"); 544 System.out.println(" {"); 545 System.out.println(" int length;"); 546 System.out.println(" Character item;"); 547 System.out.println(" char ret;"); 548 System.out.println(); 549 System.out.println(" ret = 0;"); 550 System.out.println(); 551 System.out.println(" length = string.length ();"); 552 System.out.println(" if (0 < length)"); 553 System.out.println(" {"); 554 System.out.println(" if ('&' == string.charAt (0))"); 555 System.out.println(" {"); 556 System.out.println(" string = string.substring (1);"); 557 System.out.println(" length--;"); 558 System.out.println(" }"); 559 System.out.println(" if (0 < length)"); 560 System.out.println(" {"); 561 System.out.println( 562 " if (';' == string.charAt (length - 1))"); 563 System.out.println( 564 " string = string.substring (0, --length);"); 565 System.out.println(" if (0 < length)"); 566 System.out.println(" {"); 567 System.out.println(" if ('#' == string.charAt (0))"); 568 System.out.println(" try"); 569 System.out.println(" {"); 570 System.out.println( 571 " ret = (char)Integer.parseInt (string.substring (1));"); 572 System.out.println(" }"); 573 System.out.println( 574 " catch (NumberFormatException nfe)"); 575 System.out.println(" {"); 576 System.out.println( 577 " /* failed conversion, return 0 */"); 578 System.out.println(" }"); 579 System.out.println(" else"); 580 System.out.println(" {"); 581 System.out.println( 582 " item = (Character)refChar.get (string);"); 583 System.out.println(" if (null != item)"); 584 System.out.println( 585 " ret = item.charValue ();"); 586 System.out.println(" }"); 587 System.out.println(" }"); 588 System.out.println(" }"); 589 System.out.println(" }"); 590 System.out.println(); 591 System.out.println(" return (ret);"); 592 System.out.println(" }"); 593 System.out.println(); 594 System.out.println(" /**"); 595 System.out.println(" * Decode a string containing references."); 596 System.out.println( 597 " * Change all numeric character reference and character entity references"); 598 System.out.println(" * to unicode characters."); 599 System.out.println(" * @param string The string to translate."); 600 System.out.println(" */"); 601 System.out.println(" public static String decode (String string)"); 602 System.out.println(" {"); 603 System.out.println(" int index;"); 604 System.out.println(" int length;"); 605 System.out.println(" int amp;"); 606 System.out.println(" int semi;"); 607 System.out.println(" String code;"); 608 System.out.println(" char character;"); 609 System.out.println(" StringBuffer ret;"); 610 System.out.println(); 611 System.out.println( 612 " ret = new StringBuffer (string.length ());"); 613 System.out.println(); 614 System.out.println(" index = 0;"); 615 System.out.println(" length = string.length ();"); 616 System.out.println( 617 " while ((index < length) && (-1 != (amp = string.indexOf ('&', index))))"); 618 System.out.println(" {"); 619 System.out.println( 620 " ret.append (string.substring (index, amp));"); 621 System.out.println(" index = amp + 1;"); 622 System.out.println(" if (amp < length - 1)"); 623 System.out.println(" {"); 624 System.out.println(" semi = string.indexOf (';', amp);"); 625 System.out.println(" if (-1 != semi)"); 626 System.out.println( 627 " code = string.substring (amp, semi + 1);"); 628 System.out.println(" else"); 629 System.out.println( 630 " code = string.substring (amp);"); 631 System.out.println( 632 " if (0 != (character = convertToChar (code)))"); 633 System.out.println(" index += code.length () - 1;"); 634 System.out.println(" else"); 635 System.out.println(" character = '&';"); 636 System.out.println(" }"); 637 System.out.println(" else"); 638 System.out.println(" character = '&';"); 639 System.out.println(" ret.append (character);"); 640 System.out.println(" }"); 641 System.out.println(" if (index < length)"); 642 System.out.println( 643 " ret.append (string.substring (index));"); 644 System.out.println(); 645 System.out.println(" return (ret.toString ());"); 646 System.out.println(" }"); 647 System.out.println(); 648 System.out.println(" /**"); 649 System.out.println( 650 " * Convert a character to a character entity reference."); 651 System.out.println( 652 " * Convert a unicode character to a character entity reference of"); 653 System.out.println(" * the form &xxxx;."); 654 System.out.println(" * @param character The character to convert."); 655 System.out.println( 656 " * @return The converted character or <code>null</code> if the character"); 657 System.out.println(" * is not one of the known entity references."); 658 System.out.println(" */"); 659 System.out.println( 660 " public static String convertToString (Character character)"); 661 System.out.println(" {"); 662 System.out.println(" StringBuffer buffer;"); 663 System.out.println(" String ret;"); 664 System.out.println(); 665 System.out.println( 666 " if (null != (ret = (String)mCharRef.get (character)))"); 667 System.out.println(" {"); 668 System.out.println( 669 " buffer = new StringBuffer (ret.length () + 2);"); 670 System.out.println(" buffer.append ('&');"); 671 System.out.println(" buffer.append (ret);"); 672 System.out.println(" buffer.append (';');"); 673 System.out.println(" ret = buffer.toString ();"); 674 System.out.println(" }"); 675 System.out.println(); 676 System.out.println(" return (ret);"); 677 System.out.println(" }"); 678 System.out.println(); 679 System.out.println(" /**"); 680 System.out.println( 681 " * Convert a character to a numeric character reference."); 682 System.out.println( 683 " * Convert a unicode character to a numeric character reference of"); 684 System.out.println(" * the form &#xxxx;."); 685 System.out.println(" * @param character The character to convert."); 686 System.out.println(" * @return The converted character."); 687 System.out.println(" */"); 688 System.out.println( 689 " public static String convertToString (int character)"); 690 System.out.println(" {"); 691 System.out.println(" StringBuffer ret;"); 692 System.out.println(); 693 System.out.println( 694 " ret = new StringBuffer (13); /* � */"); 695 System.out.println(" ret.append (\"&#\");"); 696 System.out.println(" ret.append (character);"); 697 System.out.println(" ret.append (';');"); 698 System.out.println(); 699 System.out.println(" return (ret.toString ());"); 700 System.out.println(" }"); 701 System.out.println(); 702 System.out.println(" /**"); 703 System.out.println(" * Encode a string to use references."); 704 System.out.println( 705 " * Change all characters that are not ASCII to their numeric character"); 706 System.out.println(" * reference or character entity reference."); 707 System.out.println( 708 " * This implementation is inefficient, allocating a new"); 709 System.out.println( 710 " * <code>Character</code> for each character in the string,"); 711 System.out.println( 712 " * but this class is primarily intended to decode strings"); 713 System.out.println( 714 " * so efficiency and speed in the encoding was not a priority."); 715 System.out.println(" * @param string The string to translate."); 716 System.out.println(" */"); 717 System.out.println(" public static String encode (String string)"); 718 System.out.println(" {"); 719 System.out.println(" int length;"); 720 System.out.println(" char c;"); 721 System.out.println(" Character character;"); 722 System.out.println(" String value;"); 723 System.out.println(" StringBuffer ret;"); 724 System.out.println(); 725 System.out.println( 726 " ret = new StringBuffer (string.length () * 6);"); 727 System.out.println(" length = string.length ();"); 728 System.out.println(" for (int i = 0; i < length; i++)"); 729 System.out.println(" {"); 730 System.out.println(" c = string.charAt (i);"); 731 System.out.println(" character = new Character (c);"); 732 System.out.println( 733 " if (null != (value = convertToString (character)))"); 734 System.out.println(" ret.append (value);"); 735 System.out.println( 736 " else if (!((c > 0x001F) && (c < 0x007F)))"); 737 System.out.println(" {"); 738 System.out.println(" value = convertToString (c);"); 739 System.out.println(" ret.append (value);"); 740 System.out.println(" }"); 741 System.out.println(" else"); 742 System.out.println(" ret.append (character);"); 743 System.out.println(" }"); 744 System.out.println(); 745 System.out.println(" return (ret.toString ());"); 746 System.out.println(" }"); 747 System.out.println("}"); 748 } 749 } 750 | Popular Tags |