import java.util.HashMap;
import java.util.Map;

/**
 * Replaces HTML character references in a string with the characters they
 * denote.
 * <p>
 * Two kinds of reference are recognised: named entities listed in
 * {@link #CHARS} (for example {@code &amp;amp;}) and numeric references in
 * decimal ({@code &amp;#65;}) or hexadecimal ({@code &amp;#x41;}) form.
 * Anything that cannot be resolved is kept visible in the output, wrapped in
 * {@code ???} markers, instead of being dropped silently.
 */
public class HtmlConverter {

    /** Marker written around (or in place of) a reference that cannot be resolved. */
    private static final String UNKNOWN = "???";

    /**
     * Entity table. Each entry has the form {@code name:code}, where
     * {@code code} is the decimal UTF-16 value of the replacement character.
     */
    private static final String[] CHARS = {
        "quot:34", "amp:38", "lt:60", "gt:62", "nbsp:160", "iexcl:161", "cent:162",
        "pound:163", "curren:164", "yen:165", "brvbar:166", "sect:167", "uml:168",
        "copy:169", "ordf:170", "laquo:171", "not:172", "shy:173", "reg:174", "macr:175",
        "deg:176", "plusmn:177", "sup2:178", "sup3:179", "acute:180", "micro:181",
        "para:182", "middot:183", "cedil:184", "sup1:185", "ordm:186", "raquo:187",
        "frac14:188", "frac12:189", "frac34:190", "iquest:191", "Agrave:192",
        "Aacute:193", "Acirc:194", "Atilde:195", "Auml:196", "Aring:197", "AElig:198",
        "Ccedil:199", "Egrave:200", "Eacute:201", "Ecirc:202", "Euml:203", "Igrave:204",
        "Iacute:205", "Icirc:206", "Iuml:207", "ETH:208", "Ntilde:209", "Ograve:210",
        "Oacute:211", "Ocirc:212", "Otilde:213", "Ouml:214", "times:215", "Oslash:216",
        "Ugrave:217", "Uacute:218", "Ucirc:219", "Uuml:220", "Yacute:221", "THORN:222",
        "szlig:223", "agrave:224", "aacute:225", "acirc:226", "atilde:227", "auml:228",
        "aring:229", "aelig:230", "ccedil:231", "egrave:232", "eacute:233", "ecirc:234",
        "euml:235", "igrave:236", "iacute:237", "icirc:238", "iuml:239", "eth:240",
        "ntilde:241", "ograve:242", "oacute:243", "ocirc:244", "otilde:245", "ouml:246",
        "divide:247", "oslash:248", "ugrave:249", "uacute:250", "ucirc:251", "uuml:252",
        "yacute:253", "thorn:254", "yuml:255", "Alpha:913", "alpha:945", "Beta:914",
        "beta:946", "Gamma:915", "gamma:947", "Delta:916", "delta:948", "Epsilon:917",
        "epsilon:949", "Zeta:918", "zeta:950", "Eta:919", "eta:951", "Theta:920",
        "theta:952", "Iota:921", "iota:953", "Kappa:922", "kappa:954", "Lambda:923",
        "lambda:955", "Mu:924", "mu:956", "Nu:925", "nu:957", "Xi:926", "xi:958",
        "Omicron:927", "omicron:959", "Pi:928", "pi:960", "Rho:929", "rho:961",
        "Sigma:931", "sigmaf:962", "sigma:963", "Tau:932", "tau:964", "Upsilon:933",
        "upsilon:965", "Phi:934", "phi:966", "Chi:935", "chi:967", "Psi:936", "psi:968",
        "Omega:937", "omega:969", "thetasym:977", "upsih:978", "piv:982", "forall:8704",
        "part:8706", "exist:8707", "empty:8709", "nabla:8711", "isin:8712", "notin:8713",
        "ni:8715", "prod:8719", "sum:8721", "minus:8722", "lowast:8727", "radic:8730",
        "prop:8733", "infin:8734", "ang:8736", "and:8743", "or:8744", "cap:8745", "cup:8746",
        "int:8747", "there4:8756", "sim:8764", "cong:8773", "asymp:8776", "ne:8800",
        "equiv:8801", "le:8804", "ge:8805", "sub:8834", "sup:8835", "nsub:8836", "sube:8838",
        "supe:8839", "oplus:8853", "otimes:8855", "perp:8869", "sdot:8901", "loz:9674",
        "lceil:8968", "rceil:8969", "lfloor:8970", "rfloor:8971", "lang:9001", "rang:9002",
        "larr:8592", "uarr:8593", "rarr:8594", "darr:8595", "harr:8596", "crarr:8629",
        "lArr:8656", "uArr:8657", "rArr:8658", "dArr:8659", "hArr:8660", "bull:8226",
        "prime:8242", "oline:8254", "frasl:8260", "weierp:8472", "image:8465", "real:8476",
        "trade:8482", "euro:8364", "alefsym:8501", "spades:9824", "clubs:9827", "hearts:9829",
        "diams:9830", "ensp:8194", "emsp:8195", "thinsp:8201", "zwnj:8204", "zwj:8205",
        "lrm:8206", "rlm:8207", "ndash:8211", "mdash:8212", "lsquo:8216", "rsquo:8217",
        "sbquo:8218", "ldquo:8220", "rdquo:8221", "bdquo:8222", "dagger:8224",
        "Dagger:8225", "hellip:8230", "permil:8240", "lsaquo:8249", "rsaquo:8250"
    };

    /** Entity name (without {@code &} and {@code ;}) mapped to its replacement character. */
    private static final Map<String, Character> CHAR_MAP = new HashMap<>();

    static {
        // Split every "name:code" entry once, at class-load time.
        for (String token : CHARS) {
            int separator = token.indexOf(':');
            String name = token.substring(0, separator);
            int code = Integer.parseInt(token.substring(separator + 1));
            CHAR_MAP.put(name, Character.valueOf((char) code));
        }
    }

    /**
     * Converts the character references in {@code html} to plain characters.
     * <ul>
     * <li>{@code null}, the empty string, and any string without {@code '&'}
     * are returned unchanged.</li>
     * <li>A {@code '&'} with no {@code ';'} anywhere after it is replaced by
     * {@code ???}.</li>
     * <li>A reference that cannot be resolved is replaced by
     * {@code ???key???}, where {@code key} is the text between {@code '&'}
     * and the next {@code ';'}. The reference is consumed, so the key appears
     * in the output exactly once.</li>
     * </ul>
     *
     * @param html the text to convert, may be {@code null}
     * @return the converted text, or {@code null} if {@code html} is {@code null}
     */
    public static String convertHtml(String html) {
        if (html == null || html.indexOf('&') < 0) {
            return html;
        }
        int length = html.length();
        StringBuilder buff = new StringBuilder(length);
        for (int i = 0; i < length; i++) {
            char ch = html.charAt(i);
            if (ch != '&') {
                buff.append(ch);
                continue;
            }
            int end = html.indexOf(';', i + 1);
            if (end < 0) {
                // Unterminated reference: flag the '&' and keep scanning the rest as text.
                buff.append(UNKNOWN);
                continue;
            }
            String key = html.substring(i + 1, end);
            int codePoint = resolve(key);
            if (codePoint < 0) {
                buff.append(UNKNOWN).append(key).append(UNKNOWN);
            } else {
                buff.appendCodePoint(codePoint);
            }
            // Skip past the ';' in both cases. Resuming right after the '&' for an
            // unknown reference would emit the key text a second time.
            i = end;
        }
        return buff.toString();
    }

    /**
     * Resolves the text between {@code '&'} and {@code ';'} to a code point.
     *
     * @param key the reference body, for example {@code amp}, {@code #65} or {@code #x41}
     * @return the code point, or {@code -1} if the key is not a known entity
     *         name or not a valid numeric reference
     */
    private static int resolve(String key) {
        if (!key.startsWith("#")) {
            Character named = CHAR_MAP.get(key);
            return named == null ? -1 : named.charValue();
        }
        try {
            boolean hex = key.length() > 1
                    && (key.charAt(1) == 'x' || key.charAt(1) == 'X');
            int code = hex
                    ? Integer.parseInt(key.substring(2), 16)
                    : Integer.parseInt(key.substring(1));
            // Rejects negative values and anything above U+10FFFF.
            return Character.isValidCodePoint(code) ? code : -1;
        } catch (NumberFormatException e) {
            // Not a number at all: report the reference as unknown.
            return -1;
        }
    }
}