package org.htmlparser.util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.UnsupportedEncodingException;

import org.htmlparser.util.sort.Sort;

/**
 * A character reference used as a search probe.
 * The kernel (entity name) is not stored as its own string; it is the window
 * <code>[start, end)</code> of the string held by the superclass, so a probe
 * can be pointed at successive entity names inside one large string without
 * allocating a substring for every comparison.
 */
class CharacterReferenceEx extends CharacterReference
{
    /**
     * Index of the first kernel character within the backing string.
     */
    protected int mStart;

    /**
     * Index one past the last kernel character within the backing string.
     */
    protected int mEnd;

    /**
     * Create a probe with an empty backing string, character zero and an
     * empty window.
     */
    public CharacterReferenceEx ()
    {
        super ("", 0);
    }

    /**
     * Set the index of the first kernel character.
     * @param start Offset into the backing string where the kernel begins.
     */
    public void setStart (int start)
    {
        mStart = start;
    }

    /**
     * Set the index one past the last kernel character.
     * @param end Offset into the backing string where the kernel ends (exclusive).
     */
    public void setEnd (int end)
    {
        mEnd = end;
    }

    /**
     * Get the kernel selected by the current window.
     * @return The substring <code>[start, end)</code> of the backing string.
     */
    public String getKernel ()
    {
        return (mKernel.substring (mStart, mEnd));
    }

    /**
     * Compare the windowed kernel against another reference's kernel.
     * Only the characters inside this probe's window are examined, so the
     * result is zero when the window is a prefix of (or equal to) the other
     * kernel, including the case of an empty window.
     * @param that The object to compare with; must be a {@link CharacterReference}.
     * @return A positive value if the other kernel is exhausted first, zero if
     * every windowed character matched, otherwise the difference between the
     * first pair of differing characters.
     */
    public int compare (Object that)
    {
        String other;
        int length;
        int ret;

        other = ((CharacterReference)that).getKernel ();
        length = other.length ();
        ret = 0;
        for (int i = mStart, j = 0; (i < mEnd) && (0 == ret); i++, j++)
        {
            if (j >= length)
            {
                // the other kernel is shorter than the window
                ret = 1;
            }
            else
            {
                ret = mKernel.charAt (i) - other.charAt (j);
            }
        }

        return (ret);
    }
}

/**
 * Translate between characters and HTML character references.
 * Decoding turns named references (<code>&amp;amp;</code>) and numeric
 * references (<code>&amp;#38;</code>, <code>&amp;#x26;</code>) into the
 * characters they denote; encoding performs the reverse mapping.
 * All members are static; the class cannot be instantiated.
 */
public class Translate
{
    /**
     * If <code>true</code>, {@link #decode(InputStream, PrintStream)} decodes
     * and prints each line as it is read instead of buffering the whole
     * stream before decoding.
     */
    static public boolean DECODE_LINE_BY_LINE = false;

    /**
     * If <code>true</code>, numeric references produced by the encode methods
     * are written in hexadecimal (<code>&amp;#x...;</code>) rather than decimal.
     */
    static public boolean ENCODE_HEXADECIMAL = false;

    /**
     * Table of named character references.
     * The static initializer sorts this array in place with
     * <code>Sort.QuickSort</code> so that it can be searched with
     * <code>Sort.bsearch</code>.
     */
    protected static final CharacterReference[] mCharacterReferences =
    {
        new CharacterReference ("nbsp", '\u00a0'),
        new CharacterReference ("iexcl", '\u00a1'),
        new CharacterReference ("cent", '\u00a2'),
        new CharacterReference ("pound", '\u00a3'),
        new CharacterReference ("curren", '\u00a4'),
        new CharacterReference ("yen", '\u00a5'),
        new CharacterReference ("brvbar", '\u00a6'),
        new CharacterReference ("sect", '\u00a7'),
        new CharacterReference ("uml", '\u00a8'),
        new CharacterReference ("copy", '\u00a9'),
        new CharacterReference ("ordf", '\u00aa'),
        new CharacterReference ("laquo", '\u00ab'),
        new CharacterReference ("not", '\u00ac'),
        new CharacterReference ("shy", '\u00ad'),
        new CharacterReference ("reg", '\u00ae'),
        new CharacterReference ("macr", '\u00af'),
        new CharacterReference ("deg", '\u00b0'),
        new CharacterReference ("plusmn", '\u00b1'),
        new CharacterReference ("sup2", '\u00b2'),
        new CharacterReference ("sup3", '\u00b3'),
        new CharacterReference ("acute", '\u00b4'),
        new CharacterReference ("micro", '\u00b5'),
        new CharacterReference ("para", '\u00b6'),
        new CharacterReference ("middot", '\u00b7'),
        new CharacterReference ("cedil", '\u00b8'),
        new CharacterReference ("sup1", '\u00b9'),
        new CharacterReference ("ordm", '\u00ba'),
        new CharacterReference ("raquo", '\u00bb'),
        new CharacterReference ("frac14", '\u00bc'),
        new CharacterReference ("frac12", '\u00bd'),
        new CharacterReference ("frac34", '\u00be'),
        new CharacterReference ("iquest", '\u00bf'),
        new CharacterReference ("Agrave", '\u00c0'),
        new CharacterReference ("Aacute", '\u00c1'),
        new CharacterReference ("Acirc", '\u00c2'),
        new CharacterReference ("Atilde", '\u00c3'),
        new CharacterReference ("Auml", '\u00c4'),
        new CharacterReference ("Aring", '\u00c5'),
        new CharacterReference ("AElig", '\u00c6'),
        new CharacterReference ("Ccedil", '\u00c7'),
        new CharacterReference ("Egrave", '\u00c8'),
        new CharacterReference ("Eacute", '\u00c9'),
        new CharacterReference ("Ecirc", '\u00ca'),
        new CharacterReference ("Euml", '\u00cb'),
        new CharacterReference ("Igrave", '\u00cc'),
        new CharacterReference ("Iacute", '\u00cd'),
        new CharacterReference ("Icirc", '\u00ce'),
        new CharacterReference ("Iuml", '\u00cf'),
        new CharacterReference ("ETH", '\u00d0'),
        new CharacterReference ("Ntilde", '\u00d1'),
        new CharacterReference ("Ograve", '\u00d2'),
        new CharacterReference ("Oacute", '\u00d3'),
        new CharacterReference ("Ocirc", '\u00d4'),
        new CharacterReference ("Otilde", '\u00d5'),
        new CharacterReference ("Ouml", '\u00d6'),
        new CharacterReference ("times", '\u00d7'),
        new CharacterReference ("Oslash", '\u00d8'),
        new CharacterReference ("Ugrave", '\u00d9'),
        new CharacterReference ("Uacute", '\u00da'),
        new CharacterReference ("Ucirc", '\u00db'),
        new CharacterReference ("Uuml", '\u00dc'),
        new CharacterReference ("Yacute", '\u00dd'),
        new CharacterReference ("THORN", '\u00de'),
        new CharacterReference ("szlig", '\u00df'),
        new CharacterReference ("agrave", '\u00e0'),
        new CharacterReference ("aacute", '\u00e1'),
        new CharacterReference ("acirc", '\u00e2'),
        new CharacterReference ("atilde", '\u00e3'),
        new CharacterReference ("auml", '\u00e4'),
        new CharacterReference ("aring", '\u00e5'),
        new CharacterReference ("aelig", '\u00e6'),
        new CharacterReference ("ccedil", '\u00e7'),
        new CharacterReference ("egrave", '\u00e8'),
        new CharacterReference ("eacute", '\u00e9'),
        new CharacterReference ("ecirc", '\u00ea'),
        new CharacterReference ("euml", '\u00eb'),
        new CharacterReference ("igrave", '\u00ec'),
        new CharacterReference ("iacute", '\u00ed'),
        new CharacterReference ("icirc", '\u00ee'),
        new CharacterReference ("iuml", '\u00ef'),
        new CharacterReference ("eth", '\u00f0'),
        new CharacterReference ("ntilde", '\u00f1'),
        new CharacterReference ("ograve", '\u00f2'),
        new CharacterReference ("oacute", '\u00f3'),
        new CharacterReference ("ocirc", '\u00f4'),
        new CharacterReference ("otilde", '\u00f5'),
        new CharacterReference ("ouml", '\u00f6'),
        new CharacterReference ("divide", '\u00f7'),
        new CharacterReference ("oslash", '\u00f8'),
        new CharacterReference ("ugrave", '\u00f9'),
        new CharacterReference ("uacute", '\u00fa'),
        new CharacterReference ("ucirc", '\u00fb'),
        new CharacterReference ("uuml", '\u00fc'),
        new CharacterReference ("yacute", '\u00fd'),
        new CharacterReference ("thorn", '\u00fe'),
        new CharacterReference ("yuml", '\u00ff'),
        new CharacterReference ("fnof", '\u0192'),
        new CharacterReference ("Alpha", '\u0391'),
        new CharacterReference ("Beta", '\u0392'),
        new CharacterReference ("Gamma", '\u0393'),
        new CharacterReference ("Delta", '\u0394'),
        new CharacterReference ("Epsilon", '\u0395'),
        new CharacterReference ("Zeta", '\u0396'),
        new CharacterReference ("Eta", '\u0397'),
        new CharacterReference ("Theta", '\u0398'),
        new CharacterReference ("Iota", '\u0399'),
        new CharacterReference ("Kappa", '\u039a'),
        new CharacterReference ("Lambda", '\u039b'),
        new CharacterReference ("Mu", '\u039c'),
        new CharacterReference ("Nu", '\u039d'),
        new CharacterReference ("Xi", '\u039e'),
        new CharacterReference ("Omicron", '\u039f'),
        new CharacterReference ("Pi", '\u03a0'),
        new CharacterReference ("Rho", '\u03a1'),
        new CharacterReference ("Sigma", '\u03a3'),
        new CharacterReference ("Tau", '\u03a4'),
        new CharacterReference ("Upsilon", '\u03a5'),
        new CharacterReference ("Phi", '\u03a6'),
        new CharacterReference ("Chi", '\u03a7'),
        new CharacterReference ("Psi", '\u03a8'),
        new CharacterReference ("Omega", '\u03a9'),
        new CharacterReference ("alpha", '\u03b1'),
        new CharacterReference ("beta", '\u03b2'),
        new CharacterReference ("gamma", '\u03b3'),
        new CharacterReference ("delta", '\u03b4'),
        new CharacterReference ("epsilon", '\u03b5'),
        new CharacterReference ("zeta", '\u03b6'),
        new CharacterReference ("eta", '\u03b7'),
        new CharacterReference ("theta", '\u03b8'),
        new CharacterReference ("iota", '\u03b9'),
        new CharacterReference ("kappa", '\u03ba'),
        new CharacterReference ("lambda", '\u03bb'),
        new CharacterReference ("mu", '\u03bc'),
        new CharacterReference ("nu", '\u03bd'),
        new CharacterReference ("xi", '\u03be'),
        new CharacterReference ("omicron", '\u03bf'),
        new CharacterReference ("pi", '\u03c0'),
        new CharacterReference ("rho", '\u03c1'),
        new CharacterReference ("sigmaf", '\u03c2'),
        new CharacterReference ("sigma", '\u03c3'),
        new CharacterReference ("tau", '\u03c4'),
        new CharacterReference ("upsilon", '\u03c5'),
        new CharacterReference ("phi", '\u03c6'),
        new CharacterReference ("chi", '\u03c7'),
        new CharacterReference ("psi", '\u03c8'),
        new CharacterReference ("omega", '\u03c9'),
        new CharacterReference ("thetasym", '\u03d1'),
        new CharacterReference ("upsih", '\u03d2'),
        new CharacterReference ("piv", '\u03d6'),
        new CharacterReference ("bull", '\u2022'),
        new CharacterReference ("hellip", '\u2026'),
        new CharacterReference ("prime", '\u2032'),
        new CharacterReference ("Prime", '\u2033'),
        new CharacterReference ("oline", '\u203e'),
        new CharacterReference ("frasl", '\u2044'),
        new CharacterReference ("weierp", '\u2118'),
        new CharacterReference ("image", '\u2111'),
        new CharacterReference ("real", '\u211c'),
        new CharacterReference ("trade", '\u2122'),
        new CharacterReference ("alefsym", '\u2135'),
        new CharacterReference ("larr", '\u2190'),
        new CharacterReference ("uarr", '\u2191'),
        new CharacterReference ("rarr", '\u2192'),
        new CharacterReference ("darr", '\u2193'),
        new CharacterReference ("harr", '\u2194'),
        new CharacterReference ("crarr", '\u21b5'),
        new CharacterReference ("lArr", '\u21d0'),
        new CharacterReference ("uArr", '\u21d1'),
        new CharacterReference ("rArr", '\u21d2'),
        new CharacterReference ("dArr", '\u21d3'),
        new CharacterReference ("hArr", '\u21d4'),
        new CharacterReference ("forall", '\u2200'),
        new CharacterReference ("part", '\u2202'),
        new CharacterReference ("exist", '\u2203'),
        new CharacterReference ("empty", '\u2205'),
        new CharacterReference ("nabla", '\u2207'),
        new CharacterReference ("isin", '\u2208'),
        new CharacterReference ("notin", '\u2209'),
        new CharacterReference ("ni", '\u220b'),
        new CharacterReference ("prod", '\u220f'),
        new CharacterReference ("sum", '\u2211'),
        new CharacterReference ("minus", '\u2212'),
        new CharacterReference ("lowast", '\u2217'),
        new CharacterReference ("radic", '\u221a'),
        new CharacterReference ("prop", '\u221d'),
        new CharacterReference ("infin", '\u221e'),
        new CharacterReference ("ang", '\u2220'),
        new CharacterReference ("and", '\u2227'),
        new CharacterReference ("or", '\u2228'),
        new CharacterReference ("cap", '\u2229'),
        new CharacterReference ("cup", '\u222a'),
        new CharacterReference ("int", '\u222b'),
        new CharacterReference ("there4", '\u2234'),
        new CharacterReference ("sim", '\u223c'),
        new CharacterReference ("cong", '\u2245'),
        new CharacterReference ("asymp", '\u2248'),
        new CharacterReference ("ne", '\u2260'),
        new CharacterReference ("equiv", '\u2261'),
        new CharacterReference ("le", '\u2264'),
        new CharacterReference ("ge", '\u2265'),
        new CharacterReference ("sub", '\u2282'),
        new CharacterReference ("sup", '\u2283'),
        new CharacterReference ("nsub", '\u2284'),
        new CharacterReference ("sube", '\u2286'),
        new CharacterReference ("supe", '\u2287'),
        new CharacterReference ("oplus", '\u2295'),
        new CharacterReference ("otimes", '\u2297'),
        new CharacterReference ("perp", '\u22a5'),
        new CharacterReference ("sdot", '\u22c5'),
        new CharacterReference ("lceil", '\u2308'),
        new CharacterReference ("rceil", '\u2309'),
        new CharacterReference ("lfloor", '\u230a'),
        new CharacterReference ("rfloor", '\u230b'),
        new CharacterReference ("lang", '\u2329'),
        new CharacterReference ("rang", '\u232a'),
        new CharacterReference ("loz", '\u25ca'),
        new CharacterReference ("spades", '\u2660'),
        new CharacterReference ("clubs", '\u2663'),
        new CharacterReference ("hearts", '\u2665'),
        new CharacterReference ("diams", '\u2666'),
        new CharacterReference ("quot", '\u0022'),
        new CharacterReference ("amp", '\u0026'),
        new CharacterReference ("lt", '\u003c'),
        new CharacterReference ("gt", '\u003e'),
        new CharacterReference ("OElig", '\u0152'),
        new CharacterReference ("oelig", '\u0153'),
        new CharacterReference ("Scaron", '\u0160'),
        new CharacterReference ("scaron", '\u0161'),
        new CharacterReference ("Yuml", '\u0178'),
        new CharacterReference ("circ", '\u02c6'),
        new CharacterReference ("tilde", '\u02dc'),
        new CharacterReference ("ensp", '\u2002'),
        new CharacterReference ("emsp", '\u2003'),
        new CharacterReference ("thinsp", '\u2009'),
        new CharacterReference ("zwnj", '\u200c'),
        new CharacterReference ("zwj", '\u200d'),
        new CharacterReference ("lrm", '\u200e'),
        new CharacterReference ("rlm", '\u200f'),
        new CharacterReference ("ndash", '\u2013'),
        new CharacterReference ("mdash", '\u2014'),
        new CharacterReference ("lsquo", '\u2018'),
        new CharacterReference ("rsquo", '\u2019'),
        new CharacterReference ("sbquo", '\u201a'),
        new CharacterReference ("ldquo", '\u201c'),
        new CharacterReference ("rdquo", '\u201d'),
        new CharacterReference ("bdquo", '\u201e'),
        new CharacterReference ("dagger", '\u2020'),
        new CharacterReference ("Dagger", '\u2021'),
        new CharacterReference ("permil", '\u2030'),
        new CharacterReference ("lsaquo", '\u2039'),
        new CharacterReference ("rsaquo", '\u203a'),
        new CharacterReference ("euro", '\u20ac'),
    };

    /**
     * Characters below this value are looked up in {@link #mCharacterList}
     * by direct indexing; characters at or above it are found by binary search.
     */
    protected static final int BREAKPOINT = 0x100;

    /**
     * Largest value accepted as a numeric character reference (the last
     * Unicode code point, U+10FFFF).
     */
    private static final int MAX_CODE_POINT = 0x10ffff;

    /**
     * First code point that needs two UTF-16 code units (U+10000).
     */
    private static final int MIN_SUPPLEMENTARY = 0x10000;

    /**
     * Character-to-reference index.
     * Slots <code>0 .. BREAKPOINT-1</code> are indexed directly by character
     * value and are <code>null</code> where no named reference exists.
     * Slots from <code>BREAKPOINT</code> upward hold the remaining references
     * in ascending character order.
     */
    protected static final CharacterReference[] mCharacterList;

    static
    {
        CharacterReference item;
        int character;
        int low;
        int used;
        int slot;

        // count the references that fit in the directly indexed part
        low = 0;
        for (int i = 0; i < mCharacterReferences.length; i++)
        {
            if (mCharacterReferences[i].getCharacter () < BREAKPOINT)
            {
                low++;
            }
        }
        mCharacterList = new CharacterReference[BREAKPOINT + mCharacterReferences.length - low];

        // 'used' is one past the last occupied slot of the sorted upper part
        used = BREAKPOINT;
        for (int i = 0; i < mCharacterReferences.length; i++)
        {
            item = mCharacterReferences[i];
            character = item.getCharacter ();
            if (character < BREAKPOINT)
            {
                mCharacterList[character] = item;
            }
            else
            {
                // insertion sort: find the first entry with a larger character
                slot = BREAKPOINT;
                while ((slot < used) && !(mCharacterList[slot].getCharacter () > character))
                {
                    slot++;
                }
                // shift the tail up by one to open the slot
                for (int j = used - 1; j >= slot; j--)
                {
                    mCharacterList[j + 1] = mCharacterList[j];
                }
                mCharacterList[slot] = item;
                used++;
            }
        }

        // order the name table for the searches done by lookup (CharacterReference)
        Sort.QuickSort (mCharacterReferences);
    }

    /**
     * Private constructor.
     * This class only has static members, so instantiation is prevented.
     */
    private Translate ()
    {
    }

    /**
     * Binary search for a character in a range of a reference array.
     * The range <code>[lo, hi]</code> must be sorted by ascending character
     * and contain no <code>null</code> entries.
     * @param array The array to search.
     * @param ref The character to find.
     * @param lo The first index of the range (inclusive).
     * @param hi The last index of the range (inclusive).
     * @return The index of the matching entry, or, if there is none, the index
     * at which such an entry would be inserted (which may be <code>hi + 1</code>).
     */
    protected static int lookup (CharacterReference[] array, char ref, int lo, int hi)
    {
        int mid;
        int result;

        while (lo <= hi)
        {
            mid = lo + ((hi - lo) / 2);
            result = ref - array[mid].getCharacter ();
            if (0 == result)
            {
                return (mid);
            }
            if (0 > result)
            {
                hi = mid - 1;
            }
            else
            {
                lo = mid + 1;
            }
        }

        return (lo);
    }

    /**
     * Look up the named reference for a character.
     * @param character The character to find a reference for.
     * @return The reference for the character, or <code>null</code> if the
     * character has no named reference.
     */
    public static CharacterReference lookup (char character)
    {
        CharacterReference ret;
        int index;

        if (character < BREAKPOINT)
        {
            return (mCharacterList[character]);
        }

        ret = null;
        index = lookup (mCharacterList, character, BREAKPOINT, mCharacterList.length - 1);
        // a miss yields an insertion point, so the entry found must be verified
        if ((index < mCharacterList.length)
            && (character == mCharacterList[index].getCharacter ()))
        {
            ret = mCharacterList[index];
        }

        return (ret);
    }

    /**
     * Look up a reference by name, allowing trailing characters in the key.
     * The key matches a table entry when the key's kernel starts with the
     * entry's kernel, so <code>ampxyz</code> matches <code>amp</code>.
     * The entry at the position reported by <code>Sort.bsearch</code> is tried
     * first; if it does not match, earlier entries are tried for as long as
     * they share the key's first character.
     * @param key The reference to find; its kernel is the name to match.
     * @return The matching table entry, or <code>null</code> if there is none
     * or the key's kernel is empty.
     */
    protected static CharacterReference lookup (CharacterReference key)
    {
        String string;
        String kernel;
        CharacterReference test;
        CharacterReference ret;
        char character;
        int index;

        string = key.getKernel ();
        if (0 == string.length ())
        {
            // nothing can match, and charAt (0) below would throw
            return (null);
        }

        ret = null;
        index = Sort.bsearch (mCharacterReferences, key);
        if (index < mCharacterReferences.length)
        {
            test = mCharacterReferences[index];
            if (string.startsWith (test.getKernel ()))
            {
                ret = test;
            }
        }
        if (null == ret)
        {
            // walk backwards through the entries sharing the first letter
            character = string.charAt (0);
            while (--index >= 0)
            {
                test = mCharacterReferences[index];
                kernel = test.getKernel ();
                if (character != kernel.charAt (0))
                {
                    break;
                }
                if (string.startsWith (kernel))
                {
                    ret = test;
                    break;
                }
            }
        }

        return (ret);
    }

    /**
     * Look up a reference by the name found in part of a string.
     * The name is the text between <code>start</code> and <code>end</code>,
     * without the leading ampersand or trailing semicolon.
     * @param kernel The string containing the name.
     * @param start The index of the first character of the name.
     * @param end The index one past the last character of the name.
     * @return The matching reference, or <code>null</code> if there is none
     * or the range is empty.
     */
    public static CharacterReference lookup (String kernel, int start, int end)
    {
        CharacterReferenceEx probe;

        probe = new CharacterReferenceEx ();
        probe.setKernel (kernel);
        probe.setStart (start);
        probe.setEnd (end);

        return (lookup (probe));
    }

    /**
     * Decode part of a string and return the first character of the result.
     * @param string The string containing the reference.
     * @param start The index where the text to decode begins.
     * @param end The index one past the end of the text to decode.
     * @return The first character of the decoded substring.
     * @throws StringIndexOutOfBoundsException if the range is empty or invalid.
     */
    public static char convertToChar (String string, int start, int end)
    {
        return (decode (string.substring (start, end)).charAt (0));
    }

    /**
     * Decode a string and return the first character of the result.
     * @param string The string to decode.
     * @return The first character of the decoded string.
     * @throws StringIndexOutOfBoundsException if the string is empty.
     */
    public static char convertToChar (String string)
    {
        return (decode (string).charAt (0));
    }

    /**
     * Append a code point to a buffer as one or two UTF-16 code units.
     * Values below U+10000 (including surrogate values) are appended as a
     * single <code>char</code>; larger values are split into a surrogate pair.
     * @param buffer The buffer to append to.
     * @param codePoint The code point, in the range 0 to U+10FFFF.
     */
    private static void appendCodePoint (StringBuffer buffer, int codePoint)
    {
        int offset;

        if (codePoint < MIN_SUPPLEMENTARY)
        {
            buffer.append ((char)codePoint);
        }
        else
        {
            offset = codePoint - MIN_SUPPLEMENTARY;
            buffer.append ((char)(0xd800 + (offset >> 10)));
            buffer.append ((char)(0xdc00 + (offset & 0x3ff)));
        }
    }

    /**
     * Parse the body of a numeric character reference and append its character.
     * Parsing begins just after the <code>&amp;#</code>. A leading
     * <code>x</code> or <code>X</code> selects hexadecimal, otherwise the
     * digits are decimal. The reference ends at the first character that is
     * not a digit of the radix; a terminating semicolon is consumed.
     * @param string The string being decoded.
     * @param start The index of the first character after <code>&amp;#</code>.
     * @param buffer The buffer that receives the decoded character; it is not
     * touched when the reference cannot be decoded.
     * @return The index of the first character after the reference, or -1 if
     * the value is zero (or has no digits) or is larger than U+10FFFF.
     */
    private static int decodeNumeric (String string, int start, StringBuffer buffer)
    {
        int length;
        int number;
        int radix;
        int digit;
        int i;
        char c;
        boolean done;

        length = string.length ();
        number = 0;
        radix = 0; // unknown until the first digit or an 'x' is seen
        done = false;
        i = start;
        while ((i < length) && !done)
        {
            c = string.charAt (i);
            digit = -1;
            if (('0' <= c) && (c <= '9'))
            {
                digit = c - '0';
            }
            else if (16 == radix)
            {
                if (('A' <= c) && (c <= 'F'))
                {
                    digit = c - 'A' + 10;
                }
                else if (('a' <= c) && (c <= 'f'))
                {
                    digit = c - 'a' + 10;
                }
            }

            if (-1 != digit)
            {
                if (0 == radix)
                {
                    radix = 10;
                }
                number = number * radix + digit;
                if (number > MAX_CODE_POINT)
                {
                    // not a Unicode character; bailing out here also keeps
                    // the accumulator from overflowing on long digit runs
                    return (-1);
                }
                i++;
            }
            else if ((0 == radix) && (('x' == c) || ('X' == c)))
            {
                radix = 16;
                i++;
            }
            else
            {
                done = true;
                if (';' == c)
                {
                    i++;
                }
            }
        }

        if (0 == number)
        {
            return (-1);
        }
        appendCodePoint (buffer, number);

        return (i);
    }

    /**
     * Decode a string containing character references.
     * Named references are resolved with {@link #lookup(CharacterReference)},
     * so a missing semicolon and trailing letters are tolerated. Numeric
     * references may be decimal or hexadecimal; values above U+FFFF are
     * decoded to a surrogate pair. Text that looks like a reference but
     * cannot be resolved is copied unchanged.
     * @param string The string to decode.
     * @return The decoded string, or the argument itself if it contains no
     * ampersand.
     */
    public static String decode (String string)
    {
        CharacterReferenceEx key;
        CharacterReference item;
        StringBuffer buffer;
        char character;
        int amp;
        int index;
        int length;
        int end;
        int semi;

        amp = string.indexOf ('&');
        if (-1 == amp)
        {
            return (string);
        }

        key = null;
        index = 0;
        length = string.length ();
        buffer = new StringBuffer (length);
        do
        {
            // copy the plain text in front of the ampersand
            buffer.append (string.substring (index, amp));
            index = amp + 1;
            if (index < length)
            {
                character = string.charAt (index);
                if ('#' == character)
                {
                    index++;
                    end = decodeNumeric (string, index, buffer);
                    if (-1 != end)
                    {
                        index = end;
                        amp = end;
                    }
                }
                else if (Character.isLetter (character))
                {
                    // the candidate name runs to the first non-alphanumeric
                    semi = index + 1;
                    while ((semi < length)
                        && Character.isLetterOrDigit (string.charAt (semi)))
                    {
                        semi++;
                    }
                    // one probe is reused for every name in the string
                    if (null == key)
                    {
                        key = new CharacterReferenceEx ();
                    }
                    key.setKernel (string);
                    key.setStart (index);
                    key.setEnd (semi);
                    item = lookup (key);
                    if (null != item)
                    {
                        buffer.append ((char)item.getCharacter ());
                        index += item.getKernel ().length ();
                        if ((index < length) && (';' == string.charAt (index)))
                        {
                            index++;
                        }
                        amp = index;
                    }
                }
            }
            // nothing was decoded if amp still trails index: keep the text
            buffer.append (string.substring (amp, index));
        }
        while ((index < length) && (-1 != (amp = string.indexOf ('&', index))));
        // copy whatever follows the last reference
        buffer.append (string.substring (index));

        return (buffer.toString ());
    }

    /**
     * Decode the contents of a buffer containing character references.
     * @param buffer The buffer to decode; it is not modified.
     * @return The decoded string.
     * @see #decode(String)
     */
    public static String decode (StringBuffer buffer)
    {
        return decode (buffer.toString ());
    }

    /**
     * Decode a stream of text containing character references.
     * The input is read as ISO-8859-1, falling back to the platform default
     * encoding if that is unsupported. When {@link #DECODE_LINE_BY_LINE} is
     * set, each run of non-newline characters is decoded and printed as soon
     * as a line terminator is seen and the terminators are passed through
     * untouched; otherwise the whole input is buffered and decoded at once.
     * If reading fails, the exception message is printed to the output.
     * Neither stream is closed; the output is flushed.
     * @param in The stream to read.
     * @param out The stream to write the decoded text to.
     */
    public static void decode (InputStream in, PrintStream out)
    {
        Reader reader;
        StringBuffer buffer;
        int character;
        boolean isNewline;
        boolean newlines;

        try
        {
            try
            {
                reader = new BufferedReader (new InputStreamReader (in, "ISO-8859-1"));
            }
            catch (UnsupportedEncodingException use)
            {
                // fall back to the platform default encoding
                reader = new BufferedReader (new InputStreamReader (in));
            }
            buffer = new StringBuffer (1024);
            // true while the buffer holds line terminators rather than text
            newlines = false;
            if (DECODE_LINE_BY_LINE)
            {
                while (-1 != (character = reader.read ()))
                {
                    isNewline = ('\r' == character) || ('\n' == character);
                    if (isNewline != newlines)
                    {
                        // switching between text and terminators: flush
                        if (newlines)
                        {
                            out.print (buffer.toString ());
                        }
                        else
                        {
                            out.print (decode (buffer.toString ()));
                        }
                        buffer.setLength (0);
                        newlines = isNewline;
                    }
                    buffer.append ((char)character);
                }
            }
            else
            {
                while (-1 != (character = reader.read ()))
                {
                    buffer.append ((char)character);
                }
            }
            if (0 != buffer.length ())
            {
                if (newlines)
                {
                    out.print (buffer.toString ());
                }
                else
                {
                    out.print (decode (buffer.toString ()));
                }
            }
        }
        catch (IOException ioe)
        {
            out.println ();
            out.println (ioe.getMessage ());
        }
        finally
        {
            out.flush ();
        }
    }

    /**
     * Convert a character to a numeric character reference.
     * @param character The character to convert.
     * @return The numeric reference for the character.
     * @see #encode(int)
     */
    public static String convertToString (int character)
    {
        return (encode (character));
    }

    /**
     * Append a numeric character reference to a buffer.
     * The value is written in hexadecimal if {@link #ENCODE_HEXADECIMAL} is
     * set, otherwise in decimal.
     * @param buffer The buffer to append to.
     * @param character The character value to reference.
     */
    private static void appendNumericReference (StringBuffer buffer, int character)
    {
        buffer.append ("&#");
        if (ENCODE_HEXADECIMAL)
        {
            buffer.append ("x");
            buffer.append (Integer.toHexString (character));
        }
        else
        {
            buffer.append (character);
        }
        buffer.append (';');
    }

    /**
     * Convert a character to a numeric character reference.
     * @param character The character to convert.
     * @return The reference in the form <code>&amp;#nnn;</code>, or
     * <code>&amp;#xhhh;</code> if {@link #ENCODE_HEXADECIMAL} is set.
     */
    public static String encode (int character)
    {
        StringBuffer ret;

        ret = new StringBuffer (13);
        appendNumericReference (ret, character);

        return (ret.toString ());
    }

    /**
     * Encode a string so that it contains only characters below 0x7F.
     * Characters with a named reference are replaced by that reference.
     * Other characters at or above 0x7F are replaced by a numeric reference,
     * and a valid surrogate pair is combined into one reference to the
     * supplementary character it represents. All other characters are copied.
     * @param string The string to encode.
     * @return The encoded string.
     */
    public static String encode (String string)
    {
        CharacterReference candidate;
        StringBuffer ret;
        int length;
        int code;
        char c;
        char next;

        length = string.length ();
        ret = new StringBuffer (length * 6);
        for (int i = 0; i < length; i++)
        {
            c = string.charAt (i);
            candidate = lookup (c);
            if (null != candidate)
            {
                ret.append ('&');
                ret.append (candidate.getKernel ());
                ret.append (';');
            }
            else if (!(c < 0x007F))
            {
                code = c;
                if ((0xd800 <= c) && (c <= 0xdbff) && (i + 1 < length))
                {
                    next = string.charAt (i + 1);
                    if ((0xdc00 <= next) && (next <= 0xdfff))
                    {
                        // high + low surrogate: reference the real code point
                        code = MIN_SUPPLEMENTARY + ((c - 0xd800) << 10) + (next - 0xdc00);
                        i++;
                    }
                }
                appendNumericReference (ret, code);
            }
            else
            {
                ret.append (c);
            }
        }

        return (ret.toString ());
    }

    /**
     * Encode a stream so that the output contains only characters below 0x7F.
     * Both streams are treated as ISO-8859-1, falling back to the platform
     * default encoding if that is unsupported. Each character is handled as
     * in {@link #encode(String)}, one <code>char</code> at a time.
     * If reading fails, the exception message is printed to the output.
     * Neither stream is closed; the output is flushed.
     * @param in The stream to read.
     * @param out The stream to write the encoded text to.
     */
    public static void encode (InputStream in, PrintStream out)
    {
        Reader reader;
        PrintWriter output;
        CharacterReference candidate;
        int index;
        char c;

        try
        {
            reader = new BufferedReader (new InputStreamReader (in, "ISO-8859-1"));
            output = new PrintWriter (new BufferedWriter (new OutputStreamWriter (out, "ISO-8859-1")));
        }
        catch (UnsupportedEncodingException use)
        {
            // fall back to the platform default encoding
            reader = new BufferedReader (new InputStreamReader (in));
            output = new PrintWriter (new BufferedWriter (new OutputStreamWriter (out)));
        }
        try
        {
            while (-1 != (index = reader.read ()))
            {
                c = (char)index;
                candidate = lookup (c);
                if (null != candidate)
                {
                    output.print ('&');
                    output.print (candidate.getKernel ());
                    output.print (';');
                }
                else if (!(c < 0x007F))
                {
                    output.print ("&#");
                    if (ENCODE_HEXADECIMAL)
                    {
                        output.print ("x");
                        output.print (Integer.toHexString (c));
                    }
                    else
                    {
                        output.print ((int)c);
                    }
                    output.print (';');
                }
                else
                {
                    output.print (c);
                }
            }
        }
        catch (IOException ioe)
        {
            output.println ();
            output.println (ioe.getMessage ());
        }
        finally
        {
            output.flush ();
        }
    }

    /**
     * Command line filter: translate standard input to standard output.
     * With <code>-encode</code> (case-insensitive) as the first argument the
     * input is encoded; otherwise it is decoded.
     * @param args The command line arguments.
     */
    public static void main (String [] args)
    {
        boolean encode;

        encode = (0 < args.length) && args[0].equalsIgnoreCase ("-encode");
        if (encode)
        {
            encode (System.in, System.out);
        }
        else
        {
            decode (System.in, System.out);
        }
    }
}