package com.sun.syndication.io;

import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.net.HttpURLConnection;
import java.util.Locale;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.text.MessageFormat;

/**
 * Character stream over an XML document whose character encoding is worked out
 * from the stream itself (byte order mark, first bytes, XML declaration) and,
 * when available, from an HTTP style content type.
 * <p>
 * All reading is delegated to an {@link InputStreamReader} created with the
 * detected encoding; the encoding name is available through {@link #getEncoding()}.
 */
public class XmlReader extends Reader {
    /** Size of the pushback buffer, and the maximum number of bytes inspected for the XML declaration. */
    private static final int PUSHBACK_MAX_SIZE = 4096;

    private static final String UTF_8 = "UTF-8";
    private static final String US_ASCII = "US-ASCII";
    private static final String UTF_16BE = "UTF-16BE";
    private static final String UTF_16LE = "UTF-16LE";
    private static final String UTF_16 = "UTF-16";

    private Reader _reader;
    private String _encoding;

    /**
     * Creates a lenient reader for a file.
     *
     * @param file file to read.
     * @throws IOException if the file cannot be opened or read.
     */
    public XmlReader(File file) throws IOException {
        this(new FileInputStream(file));
    }

    /**
     * Creates a lenient reader for a raw stream (no content type information).
     *
     * @param is stream to read.
     * @throws IOException if the stream cannot be read.
     */
    public XmlReader(InputStream is) throws IOException {
        this(is, true);
    }

    /**
     * Creates a reader for a raw stream (no content type information).
     *
     * @param is stream to read.
     * @param lenient when <code>true</code>, an encoding detection failure is recovered by
     *        falling back to the encoding carried by the exception or to UTF-8; when
     *        <code>false</code> the failure is propagated.
     * @throws IOException if the stream cannot be read.
     * @throws XmlReaderException if the encoding cannot be determined and <code>lenient</code>
     *         is <code>false</code>.
     */
    public XmlReader(InputStream is, boolean lenient) throws IOException, XmlReaderException {
        try {
            doRawStream(is, lenient);
        }
        catch (XmlReaderException ex) {
            if (!lenient) {
                throw ex;
            }
            doLenientDetection(null, ex);
        }
    }

    /**
     * Creates a lenient reader for the resource a URL points to.
     *
     * @param url URL to open a connection to.
     * @throws IOException if the connection cannot be opened or read.
     */
    public XmlReader(URL url) throws IOException {
        this(url.openConnection());
    }

    /**
     * Creates a lenient reader for an already created connection.
     * <p>
     * HTTP connections, and any connection that reports a content type, go through the
     * content type aware detection; all other connections are treated as raw streams.
     *
     * @param conn connection to read from.
     * @throws IOException if the connection cannot be read.
     */
    public XmlReader(URLConnection conn) throws IOException {
        boolean lenient = true;
        if (conn instanceof HttpURLConnection || conn.getContentType() != null) {
            try {
                doHttpStream(conn.getInputStream(), conn.getContentType(), lenient);
            }
            catch (XmlReaderException ex) {
                doLenientDetection(conn.getContentType(), ex);
            }
        }
        else {
            try {
                doRawStream(conn.getInputStream(), lenient);
            }
            catch (XmlReaderException ex) {
                doLenientDetection(null, ex);
            }
        }
    }

    /**
     * Creates a lenient reader for a stream accompanied by a content type.
     *
     * @param is stream to read.
     * @param httpContentType content type value (MIME type plus optional charset parameter), may be null.
     * @throws IOException if the stream cannot be read.
     */
    public XmlReader(InputStream is, String httpContentType) throws IOException {
        this(is, httpContentType, true);
    }

    /**
     * Creates a reader for a stream accompanied by a content type.
     *
     * @param is stream to read.
     * @param httpContentType content type value (MIME type plus optional charset parameter), may be null.
     * @param lenient when <code>true</code>, an encoding detection failure is recovered instead of
     *        being propagated.
     * @throws IOException if the stream cannot be read.
     * @throws XmlReaderException if the encoding cannot be determined and <code>lenient</code>
     *         is <code>false</code>.
     */
    public XmlReader(InputStream is, String httpContentType, boolean lenient) throws IOException, XmlReaderException {
        try {
            doHttpStream(is, httpContentType, lenient);
        }
        catch (XmlReaderException ex) {
            if (!lenient) {
                throw ex;
            }
            doLenientDetection(httpContentType, ex);
        }
    }

    /**
     * Recovers from a failed detection. A <code>text/html</code> content type is retried as
     * <code>text/xml</code> (keeping its parameters); if there is still a failure the reader is
     * created with the XML declaration encoding, else the content type encoding, else UTF-8,
     * as carried by the exception.
     */
    private void doLenientDetection(String httpContentType, XmlReaderException ex) throws IOException {
        if (httpContentType != null && httpContentType.startsWith("text/html")) {
            httpContentType = "text/xml" + httpContentType.substring("text/html".length());
            try {
                doHttpStream(ex.getInputStream(), httpContentType, true);
                ex = null;
            }
            catch (XmlReaderException ex2) {
                ex = ex2;
            }
        }
        if (ex != null) {
            String encoding = ex.getXmlEncoding();
            if (encoding == null) {
                encoding = ex.getContentTypeEncoding();
            }
            if (encoding == null) {
                encoding = UTF_8;
            }
            prepareReader(ex.getInputStream(), encoding);
        }
    }

    /**
     * Returns the name of the charset this reader decodes the stream with.
     *
     * @return the detected (or fallback) encoding name.
     */
    public String getEncoding() {
        return _encoding;
    }

    /** Delegates to the underlying reader created with the detected encoding. */
    public int read(char[] buf, int offset, int len) throws IOException {
        return _reader.read(buf, offset, len);
    }

    /**
     * Closes the underlying reader.
     *
     * @throws IOException if the underlying reader fails to close.
     */
    public void close() throws IOException {
        _reader.close();
    }

    /** Detects the encoding of a stream that has no content type and creates the reader. */
    private void doRawStream(InputStream is, boolean lenient) throws IOException {
        PushbackInputStream pis = new PushbackInputStream(is, PUSHBACK_MAX_SIZE);
        String bomEnc = getBOMEncoding(pis);
        String xmlGuessEnc = getXMLGuessEncoding(pis);
        String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
        String encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, pis);
        prepareReader(pis, encoding);
    }

    /** Detects the encoding of a stream that comes with a content type and creates the reader. */
    private void doHttpStream(InputStream is, String httpContentType, boolean lenient) throws IOException {
        PushbackInputStream pis = new PushbackInputStream(is, PUSHBACK_MAX_SIZE);
        String cTMime = getContentTypeMime(httpContentType);
        String cTEnc = getContentTypeEncoding(httpContentType);
        String bomEnc = getBOMEncoding(pis);
        String xmlGuessEnc = getXMLGuessEncoding(pis);
        String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
        String encoding = calculateHttpEncoding(cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, pis, lenient);
        prepareReader(pis, encoding);
    }

    /** Creates the delegate reader and records the encoding it uses. */
    private void prepareReader(InputStream is, String encoding) throws IOException {
        _reader = new InputStreamReader(is, encoding);
        _encoding = encoding;
    }

    /**
     * Combines BOM, first-bytes guess and XML declaration encodings for a raw stream.
     *
     * @throws XmlReaderException if the three sources contradict each other or the BOM
     *         encoding is not one of UTF-8, UTF-16BE, UTF-16LE.
     */
    private static String calculateRawEncoding(String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is) throws IOException {
        String encoding;
        if (bomEnc == null) {
            if (xmlGuessEnc == null || xmlEnc == null) {
                encoding = UTF_8;
            }
            else if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
                // the declaration does not say which byte order, the first bytes do
                encoding = xmlGuessEnc;
            }
            else {
                encoding = xmlEnc;
            }
        }
        else if (bomEnc.equals(UTF_8)) {
            if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
                throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc, xmlGuessEnc, xmlEnc}),
                                             bomEnc, xmlGuessEnc, xmlEnc, is);
            }
            if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
                throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc, xmlGuessEnc, xmlEnc}),
                                             bomEnc, xmlGuessEnc, xmlEnc, is);
            }
            encoding = UTF_8;
        }
        else if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
            if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
                // Must be an XmlReaderException like the sibling branches: the lenient
                // recovery only catches that type and needs the stream it carries.
                throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc, xmlGuessEnc, xmlEnc}),
                                             bomEnc, xmlGuessEnc, xmlEnc, is);
            }
            if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
                throw new XmlReaderException(RAW_EX_1.format(new Object[]{bomEnc, xmlGuessEnc, xmlEnc}),
                                             bomEnc, xmlGuessEnc, xmlEnc, is);
            }
            encoding = bomEnc;
        }
        else {
            throw new XmlReaderException(RAW_EX_2.format(new Object[]{bomEnc, xmlGuessEnc, xmlEnc}),
                                         bomEnc, xmlGuessEnc, xmlEnc, is);
        }
        return encoding;
    }

    /**
     * Combines the content type information with what was found in the stream.
     * <p>
     * In lenient mode an XML declaration encoding, when present, wins outright. Otherwise the
     * MIME type must be an XML one: without a charset, application XML types use the raw
     * stream rules and text XML types use US-ASCII; a UTF-16BE/UTF-16LE charset must not come
     * with a BOM; a UTF-16 charset requires a UTF-16 BOM; any other charset is used as is.
     *
     * @throws XmlReaderException if the MIME type is not an XML one or the rules above are violated.
     */
    private static String calculateHttpEncoding(String cTMime, String cTEnc, String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is, boolean lenient) throws IOException {
        String encoding;
        if (lenient && xmlEnc != null) {
            encoding = xmlEnc;
        }
        else {
            boolean appXml = isAppXml(cTMime);
            boolean textXml = isTextXml(cTMime);
            if (appXml || textXml) {
                if (cTEnc == null) {
                    if (appXml) {
                        encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, is);
                    }
                    else {
                        encoding = US_ASCII;
                    }
                }
                else if (bomEnc != null && (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE))) {
                    throw new XmlReaderException(HTTP_EX_1.format(new Object[]{cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc}),
                                                 cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, is);
                }
                else if (cTEnc.equals(UTF_16)) {
                    if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
                        encoding = bomEnc;
                    }
                    else {
                        throw new XmlReaderException(HTTP_EX_2.format(new Object[]{cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc}),
                                                     cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, is);
                    }
                }
                else {
                    encoding = cTEnc;
                }
            }
            else {
                throw new XmlReaderException(HTTP_EX_3.format(new Object[]{cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc}),
                                             cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, is);
            }
        }
        return encoding;
    }

    /** Returns the trimmed MIME type part (everything before the first ';'), or null for a null content type. */
    private static String getContentTypeMime(String httpContentType) {
        String mime = null;
        if (httpContentType != null) {
            int i = httpContentType.indexOf(";");
            mime = ((i == -1) ? httpContentType : httpContentType.substring(0, i)).trim();
        }
        return mime;
    }

    // The value may be quoted; the quotes are not part of the charset name.
    private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=[\"']?([^; \"']*)[\"']?");

    /**
     * Returns the upper-cased value of the charset parameter following the MIME type,
     * or null if the content type is null or has no such parameter.
     */
    private static String getContentTypeEncoding(String httpContentType) {
        String encoding = null;
        if (httpContentType != null) {
            int i = httpContentType.indexOf(";");
            if (i > -1) {
                String postMime = httpContentType.substring(i + 1);
                Matcher m = CHARSET_PATTERN.matcher(postMime);
                if (m.find()) {
                    // fixed locale: charset names are ASCII and must not be case-mapped
                    // with the rules of the default locale
                    encoding = m.group(1).toUpperCase(Locale.ENGLISH);
                }
            }
        }
        return encoding;
    }

    /**
     * Pushes back, last to first, the elements of <code>bytes</code> from index
     * <code>from</code> on that hold real data. The -1 end-of-stream marker is skipped:
     * <code>unread(int)</code> would store it as a 0xFF byte.
     */
    private static void unreadAvailable(PushbackInputStream is, int[] bytes, int from) throws IOException {
        for (int i = bytes.length - 1; i >= from; i--) {
            if (bytes[i] != -1) {
                is.unread(bytes[i]);
            }
        }
    }

    /**
     * Detects a byte order mark at the current stream position.
     * A recognised BOM is consumed; anything else is pushed back.
     *
     * @return UTF-16BE, UTF-16LE or UTF-8 according to the BOM, or null if there is none.
     */
    private static String getBOMEncoding(PushbackInputStream is) throws IOException {
        String encoding = null;
        int[] bytes = new int[3];
        bytes[0] = is.read();
        bytes[1] = is.read();
        bytes[2] = is.read();

        if (bytes[0] == 0xFE && bytes[1] == 0xFF) {
            encoding = UTF_16BE;
            unreadAvailable(is, bytes, 2);   // the third byte is not part of a 2 byte BOM
        }
        else if (bytes[0] == 0xFF && bytes[1] == 0xFE) {
            encoding = UTF_16LE;
            unreadAvailable(is, bytes, 2);
        }
        else if (bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
            encoding = UTF_8;
        }
        else {
            unreadAvailable(is, bytes, 0);
        }
        return encoding;
    }

    /**
     * Guesses the encoding from the first four bytes, which are always pushed back.
     *
     * @return UTF-16BE for <code>00 3C 00 3F</code>, UTF-16LE for <code>3C 00 3F 00</code>,
     *         UTF-8 for <code>3C 3F 78 6D</code> ("&lt;?xm"), null otherwise.
     */
    private static String getXMLGuessEncoding(PushbackInputStream is) throws IOException {
        String encoding = null;
        int[] bytes = new int[4];
        bytes[0] = is.read();
        bytes[1] = is.read();
        bytes[2] = is.read();
        bytes[3] = is.read();
        unreadAvailable(is, bytes, 0);

        if (bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00 && bytes[3] == 0x3F) {
            encoding = UTF_16BE;
        }
        else if (bytes[0] == 0x3C && bytes[1] == 0x00 && bytes[2] == 0x3F && bytes[3] == 0x00) {
            encoding = UTF_16LE;
        }
        else if (bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78 && bytes[3] == 0x6D) {
            encoding = UTF_8;
        }
        return encoding;
    }

    // Matched against the text up to and including the first '>' only. Accepts single or
    // double quotes and whitespace around '='; group 1 or group 2 holds the value.
    private static final Pattern ENCODING_PATTERN =
        Pattern.compile("<\\?xml[^>]*?encoding\\s*=\\s*(?:\"([^\"]+)\"|'([^']+)')[^>]*\\?>");

    /**
     * Extracts the encoding named in the XML declaration at the start of the stream.
     * <p>
     * Nothing is read when <code>guessedEnc</code> is null. Otherwise bytes are read until the
     * first '&gt;' has been decoded, the pushback capacity is reached or the stream ends,
     * whichever comes first, and every byte read is pushed back before returning.
     *
     * @param is stream positioned at the start of the document (after any BOM).
     * @param guessedEnc encoding used to decode the inspected bytes, may be null.
     * @return the upper-cased declared encoding, or null if none was found.
     */
    private static String getXmlProlog(PushbackInputStream is, String guessedEnc) throws IOException {
        String encoding = null;
        if (guessedEnc != null) {
            byte[] bytes = new byte[PUSHBACK_MAX_SIZE];
            int bytesRead = 0;
            boolean gtByteSeen = false;
            String text = null;
            int prologEnd = -1;
            while (prologEnd == -1 && bytesRead < PUSHBACK_MAX_SIZE) {
                int c = is.read(bytes, bytesRead, PUSHBACK_MAX_SIZE - bytesRead);
                if (c == -1) {
                    break;
                }
                // '>' contains the byte 0x3E in each encoding that can be guessed, so
                // decoding is pointless before that byte has shown up.
                for (int i = bytesRead; !gtByteSeen && i < bytesRead + c; i++) {
                    gtByteSeen = bytes[i] == 0x3E;
                }
                bytesRead += c;
                if (gtByteSeen) {
                    // Decode rather than trust the raw byte: in UTF-16 the character may
                    // still be incomplete, in which case another read is needed.
                    text = new String(bytes, 0, bytesRead, guessedEnc);
                    prologEnd = text.indexOf('>');
                }
            }
            if (bytesRead > 0) {
                is.unread(bytes, 0, bytesRead);
            }
            if (prologEnd != -1) {
                Matcher m = ENCODING_PATTERN.matcher(text.substring(0, prologEnd + 1));
                if (m.find()) {
                    String declared = (m.group(1) != null) ? m.group(1) : m.group(2);
                    encoding = declared.toUpperCase(Locale.ENGLISH);
                }
            }
        }
        return encoding;
    }

    /** Tells whether the MIME type is one of the application XML types or an <code>application/*+xml</code> type. */
    private static boolean isAppXml(String mime) {
        return mime != null &&
               (mime.equals("application/xml") ||
                mime.equals("application/xml-dtd") ||
                mime.equals("application/xml-external-parsed-entity") ||
                (mime.startsWith("application/") && mime.endsWith("+xml")));
    }

    /** Tells whether the MIME type is one of the text XML types or a <code>text/*+xml</code> type. */
    private static boolean isTextXml(String mime) {
        return mime != null &&
               (mime.equals("text/xml") ||
                mime.equals("text/xml-external-parsed-entity") ||
                (mime.startsWith("text/") && mime.endsWith("+xml")));
    }

    private static final MessageFormat RAW_EX_1 = new MessageFormat(
        "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch");

    private static final MessageFormat RAW_EX_2 = new MessageFormat(
        "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM");

    private static final MessageFormat HTTP_EX_1 = new MessageFormat(
        "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL");

    private static final MessageFormat HTTP_EX_2 = new MessageFormat(
        "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch");

    private static final MessageFormat HTTP_EX_3 = new MessageFormat(
        "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME");

}