import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Reader decorator that repairs two frequent defects of feeds found in the wild
 * before they reach an XML parser:
 * <ul>
 *   <li>whitespace and XML comments placed before the first real content are
 *       dropped, so that the XML declaration (if any) is the first thing read;</li>
 *   <li>named HTML Latin-1 entities (<code>&amp;nbsp;</code> ...
 *       <code>&amp;yuml;</code>), which are undefined in plain XML, are replaced
 *       by the equivalent numeric character references
 *       (<code>&amp;#160;</code> ... <code>&amp;#255;</code>).</li>
 * </ul>
 * Entities that are not in the table, and ampersands that do not start an
 * entity, are passed through unchanged. The few characters buffered while
 * looking for a leading comment are emitted verbatim, without entity
 * replacement.
 * <p>
 * Mark/reset is not supported.
 */
public class XmlFixerReader extends Reader {

    /**
     * Command line helper: fetches the URL given as the only argument and
     * prints the fixed stream to standard output, line by line.
     *
     * @param args a single element, the URL of the feed to read.
     * @throws Exception if the URL is malformed or the stream cannot be read.
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 1) {
            System.err.println("Usage: XmlFixerReader <feed-url>");
            return;
        }
        BufferedReader br = new BufferedReader(
                new XmlFixerReader(new InputStreamReader(new URL(args[0]).openStream())));
        try {
            String line = br.readLine();
            while (line != null) {
                System.out.println(line);
                line = br.readLine();
            }
        }
        finally {
            // closes the whole chain, including the URL stream
            br.close();
        }
    }

    // States of the read() automaton.
    private static final int READ_PASS_THROUGH = 0;   // forwarding characters from 'in'
    private static final int READ_IN_ENTITY = 1;      // collecting "&name" into 'pending'
    private static final int READ_RESOLVE_ENTITY = 2; // "&name;" complete, look it up
    private static final int READ_DRAIN_PENDING = 3;  // emitting the content of 'pending'

    // States of the trimStream() automaton; the comment tells what was consumed so far.
    private static final int TRIM_PROLOG = 0;            // only whitespace/comments
    private static final int TRIM_LT = 1;                // "<"
    private static final int TRIM_LT_BANG = 2;           // "<!"
    private static final int TRIM_LT_BANG_DASH = 3;      // "<!-"
    private static final int TRIM_COMMENT = 4;           // "<!--" + comment text
    private static final int TRIM_COMMENT_DASH = 5;      // ... + "-"
    private static final int TRIM_COMMENT_DASH_DASH = 6; // ... + "--"

    /** Marker for "no character", same value Reader.read() uses for end of stream. */
    private static final int NO_CHAR = -1;

    /** The decorated reader. */
    protected Reader in;

    /** True once the leading whitespace/comments have been processed. */
    private boolean trimmed;

    /** Characters read ahead from 'in' that still have to be returned. */
    private StringBuffer pending;

    /** Index in 'pending' of the next character to return. */
    private int pendingPos;

    /** Current state of the read() automaton, one of the READ_* constants. */
    private int readState = READ_PASS_THROUGH;

    /** Character that terminated an entity candidate and must be re-read, or NO_CHAR. */
    private int pushedBack = NO_CHAR;

    /**
     * Creates a fixer on top of the given reader.
     *
     * @param in reader delivering the raw feed; it is also used as the lock
     *        object of this reader.
     */
    public XmlFixerReader(Reader in) {
        super(in);
        this.in = in;
        pending = new StringBuffer();
        readState = READ_PASS_THROUGH;
    }

    /** Resets 'pending' so that it contains only the given character. */
    private void startPending(int c) {
        pending.setLength(0);
        pendingPos = 0;
        pending.append((char) c);
    }

    /** Returns the pushed back character if there is one, else reads from 'in'. */
    private int nextRaw() throws IOException {
        if (pushedBack != NO_CHAR) {
            int c = pushedBack;
            pushedBack = NO_CHAR;
            return c;
        }
        return in.read();
    }

    /** Tells if the character is XML whitespace (space, tab, CR or LF). */
    private static boolean isXmlWhitespace(int c) {
        return c == ' ' || c == '\n' || c == '\r' || c == '\t';
    }

    /** Tells if the character may appear between the '&' and the ';' of an entity. */
    private static boolean isEntityChar(int c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
            || (c >= '0' && c <= '9') || c == '#';
    }

    /**
     * Consumes whitespace and XML comments found before the first real content.
     * <p>
     * Characters that had to be read to find out that they are not part of a
     * leading comment (or an unterminated comment cut by the end of the stream)
     * are left in 'pending' and the read automaton is switched to drain them.
     *
     * @return false if the stream held nothing but whitespace and comments,
     *         true otherwise.
     * @throws IOException if the underlying reader fails.
     */
    private boolean trimStream() throws IOException {
        int state = TRIM_PROLOG;
        for (;;) {
            int c = in.read();
            if (c == -1) {
                if (state == TRIM_PROLOG) {
                    return false;
                }
                // stream ended in the middle of something: hand out what was read
                readState = READ_DRAIN_PENDING;
                return true;
            }
            switch (state) {
                case TRIM_PROLOG:
                    if (isXmlWhitespace(c)) {
                        break;
                    }
                    startPending(c);
                    if (c != '<') {
                        readState = READ_DRAIN_PENDING;
                        return true;
                    }
                    state = TRIM_LT;
                    break;
                case TRIM_LT:
                    pending.append((char) c);
                    if (c != '!') {
                        readState = READ_DRAIN_PENDING;
                        return true;
                    }
                    state = TRIM_LT_BANG;
                    break;
                case TRIM_LT_BANG:
                    pending.append((char) c);
                    if (c != '-') {
                        readState = READ_DRAIN_PENDING;
                        return true;
                    }
                    state = TRIM_LT_BANG_DASH;
                    break;
                case TRIM_LT_BANG_DASH:
                    pending.append((char) c);
                    if (c != '-') {
                        readState = READ_DRAIN_PENDING;
                        return true;
                    }
                    state = TRIM_COMMENT;
                    break;
                case TRIM_COMMENT:
                    pending.append((char) c);
                    if (c == '-') {
                        state = TRIM_COMMENT_DASH;
                    }
                    break;
                case TRIM_COMMENT_DASH:
                    pending.append((char) c);
                    state = (c == '-') ? TRIM_COMMENT_DASH_DASH : TRIM_COMMENT;
                    break;
                case TRIM_COMMENT_DASH_DASH:
                    if (c == '>') {
                        // comment complete: forget it and look for more prolog
                        pending.setLength(0);
                        state = TRIM_PROLOG;
                    }
                    else {
                        pending.append((char) c);
                        // a further '-' still leaves "--" as the last two
                        // characters, so "--->" must close the comment too
                        if (c != '-') {
                            state = TRIM_COMMENT;
                        }
                    }
                    break;
                default:
                    throw new IOException("It shouldn't happen");
            }
        }
    }

    /**
     * Reads a single character of the fixed stream.
     *
     * @return the character read, or -1 at the end of the stream.
     * @throws IOException if the underlying reader fails.
     */
    public int read() throws IOException {
        if (!trimmed) {
            trimmed = true;
            if (!trimStream()) {
                return -1;
            }
        }
        for (;;) {
            switch (readState) {
                case READ_PASS_THROUGH: {
                    int c = nextRaw();
                    if (c != '&') {
                        // regular character or end of stream
                        return c;
                    }
                    startPending(c);
                    readState = READ_IN_ENTITY;
                    break;
                }
                case READ_IN_ENTITY: {
                    int c = nextRaw();
                    if (c == ';') {
                        pending.append((char) c);
                        readState = READ_RESOLVE_ENTITY;
                    }
                    else
                    if (isEntityChar(c)) {
                        pending.append((char) c);
                    }
                    else {
                        // Not an entity after all. Emit what was collected and
                        // re-read the terminating character afterwards so that
                        // it is neither lost nor, if it is another '&',
                        // prevented from starting an entity of its own.
                        // At end of stream c is -1, which equals NO_CHAR.
                        pushedBack = c;
                        readState = READ_DRAIN_PENDING;
                    }
                    break;
                }
                case READ_RESOLVE_ENTITY: {
                    String codedEntity = (String) CODED_ENTITIES.get(pending.toString());
                    if (codedEntity != null) {
                        pending.setLength(0);
                        pending.append(codedEntity);
                    }
                    readState = READ_DRAIN_PENDING;
                    break;
                }
                case READ_DRAIN_PENDING:
                    if (pendingPos < pending.length()) {
                        return pending.charAt(pendingPos++);
                    }
                    readState = READ_PASS_THROUGH;
                    break;
                default:
                    throw new IOException("It shouldn't happen");
            }
        }
    }

    /**
     * Reads characters of the fixed stream into a portion of an array. Reading
     * continues until <code>len</code> characters were stored or the end of the
     * stream is reached.
     *
     * @param buffer destination array.
     * @param offset index at which to store the first character.
     * @param len maximum number of characters to read.
     * @return the number of characters stored, 0 if <code>len</code> is 0, or
     *         -1 if the end of the stream had already been reached.
     * @throws IOException if the underlying reader fails.
     * @throws IndexOutOfBoundsException if offset/len do not denote a valid
     *         portion of the array.
     */
    public int read(char[] buffer,int offset,int len) throws IOException {
        if (offset < 0 || len < 0 || len > buffer.length - offset) {
            throw new IndexOutOfBoundsException();
        }
        if (len == 0) {
            // nothing requested: do not consume anything
            return 0;
        }
        int c = read();
        if (c == -1) {
            return -1;
        }
        int charsRead = 0;
        buffer[offset + charsRead++] = (char) c;
        while (charsRead < len && (c = read()) > -1) {
            buffer[offset + charsRead++] = (char) c;
        }
        return charsRead;
    }

    /**
     * Skips characters of the fixed stream.
     *
     * @param n number of characters to skip.
     * @return the number of characters actually skipped, which is less than
     *         <code>n</code> only if the end of the stream was reached.
     * @throws IllegalArgumentException if <code>n</code> is negative.
     * @throws IOException if the underlying reader fails.
     */
    public long skip(long n) throws IOException {
        if (n < 0) {
            throw new IllegalArgumentException("'n' cannot be negative");
        }
        long skipped = 0;
        while (skipped < n && read() > -1) {
            skipped++;
        }
        return skipped;
    }

    /**
     * Tells whether a character can be read without blocking: either one is
     * already buffered in this reader or the underlying reader is ready.
     *
     * @throws IOException if the underlying reader fails.
     */
    public boolean ready() throws IOException {
        boolean hasPending = readState == READ_DRAIN_PENDING && pendingPos < pending.length();
        return hasPending || pushedBack != NO_CHAR || in.ready();
    }

    /** Mark is not supported, always returns false. */
    public boolean markSupported() {
        return false;
    }

    /**
     * Not supported.
     *
     * @throws IOException always.
     */
    public void mark(int readAheadLimit) throws IOException {
        throw new IOException("Stream does not support mark");
    }

    /**
     * Not supported.
     *
     * @throws IOException always.
     */
    public void reset() throws IOException {
        throw new IOException("Stream does not support mark");
    }

    /**
     * Closes the underlying reader.
     *
     * @throws IOException if the underlying reader fails to close.
     */
    public void close() throws IOException {
        in.close();
    }

    /** Code point of the first entry of LATIN1_ENTITY_NAMES (nbsp). */
    private static final int FIRST_ENTITY_CODE_POINT = 160;

    /**
     * Names of the HTML Latin-1 entities, ordered by code point; the entry at
     * index i stands for the character FIRST_ENTITY_CODE_POINT + i.
     */
    private static final String[] LATIN1_ENTITY_NAMES = {
        "nbsp",   "iexcl",  "cent",   "pound",  "curren", "yen",    "brvbar", "sect",
        "uml",    "copy",   "ordf",   "laquo",  "not",    "shy",    "reg",    "macr",
        "deg",    "plusmn", "sup2",   "sup3",   "acute",  "micro",  "para",   "middot",
        "cedil",  "sup1",   "ordm",   "raquo",  "frac14", "frac12", "frac34", "iquest",
        "Agrave", "Aacute", "Acirc",  "Atilde", "Auml",   "Aring",  "AElig",  "Ccedil",
        "Egrave", "Eacute", "Ecirc",  "Euml",   "Igrave", "Iacute", "Icirc",  "Iuml",
        "ETH",    "Ntilde", "Ograve", "Oacute", "Ocirc",  "Otilde", "Ouml",   "times",
        "Oslash", "Ugrave", "Uacute", "Ucirc",  "Uuml",   "Yacute", "THORN",  "szlig",
        "agrave", "aacute", "acirc",  "atilde", "auml",   "aring",  "aelig",  "ccedil",
        "egrave", "eacute", "ecirc",  "euml",   "igrave", "iacute", "icirc",  "iuml",
        "eth",    "ntilde", "ograve", "oacute", "ocirc",  "otilde", "ouml",   "divide",
        "oslash", "ugrave", "uacute", "ucirc",  "uuml",   "yacute", "thorn",  "yuml"
    };

    /**
     * Maps a literal entity, delimiters included (e.g. "&amp;nbsp;"), to its
     * numeric character reference (e.g. "&amp;#160;").
     */
    private static final Map CODED_ENTITIES = new HashMap();

    static {
        for (int i = 0; i < LATIN1_ENTITY_NAMES.length; i++) {
            CODED_ENTITIES.put("&" + LATIN1_ENTITY_NAMES[i] + ";",
                               "&#" + (FIRST_ENTITY_CODE_POINT + i) + ";");
        }
    }

    /**
     * Matches anything shaped like an entity. Digits are included because some
     * entity names contain them (sup1, sup2, sup3, frac14, frac12, frac34).
     */
    private static final Pattern ENTITIES_PATTERN = Pattern.compile("&[A-Za-z0-9#]+;");

    /**
     * Replaces, in a String, the named HTML Latin-1 entities by their numeric
     * character references. Anything else, including unknown entities and
     * stray ampersands, is copied unchanged.
     *
     * @param s text to process, must not be null.
     * @return the processed text; the same instance if it contains no '&amp;'.
     */
    public String processHtmlEntities(String s) {
        if (s.indexOf('&') == -1) {
            return s;
        }
        StringBuffer sb = new StringBuffer(s.length());
        Matcher m = ENTITIES_PATTERN.matcher(s);
        int pos = 0;
        while (m.find()) {
            // text between the previous entity and this one
            sb.append(s.substring(pos, m.start()));
            String literalEntity = m.group();
            String codedEntity = (String) CODED_ENTITIES.get(literalEntity);
            sb.append(codedEntity != null ? codedEntity : literalEntity);
            pos = m.end();
        }
        sb.append(s.substring(pos));
        return sb.toString();
    }

}