1 20 package org.outerj.daisy.xmlutil; 21 22 import java.io.*; 23 import java.net.URL ; 24 import java.net.URLConnection ; 25 import java.net.HttpURLConnection ; 26 import java.util.regex.Pattern ; 27 import java.util.regex.Matcher ; 28 import java.text.MessageFormat ; 29 30 54 public class XmlReader extends Reader { 55 private static final int PUSHBACK_MAX_SIZE = 1024; 56 57 private static final String UTF_8 = "UTF-8"; 58 private static final String US_ASCII = "US-ASCII"; 59 private static final String UTF_16BE = "UTF-16BE"; 60 private static final String UTF_16LE = "UTF-16LE"; 61 private static final String UTF_16 = "UTF-16"; 62 63 private Reader _reader; 64 private String _encoding; 65 66 79 public XmlReader(File file) throws IOException { 80 this(new FileInputStream(file)); 81 } 82 83 95 public XmlReader(InputStream is) throws IOException { 96 this(is,true); 97 } 98 99 123 public XmlReader(InputStream is,boolean lenient) throws IOException, XmlReaderException { 124 try { 125 doRawStream(is); 126 } 127 catch (XmlReaderException ex) { 128 if (!lenient) { 129 throw ex; 130 } 131 else { 132 doLenientDetection(null,ex); 133 } 134 } 135 } 136 137 153 public XmlReader(URL url) throws IOException { 154 this(url.openConnection()); 155 } 156 157 173 public XmlReader(URLConnection conn) throws IOException { 174 if (conn instanceof HttpURLConnection ) { 175 try { 176 doHttpStream(conn.getInputStream(),conn.getContentType()); 177 } 178 catch (XmlReaderException ex) { 179 doLenientDetection(conn.getContentType(),ex); 180 } 181 } 182 else 183 if (conn.getContentType()!=null) { 184 try { 185 doHttpStream(conn.getInputStream(),conn.getContentType()); 186 } 187 catch (XmlReaderException ex) { 188 doLenientDetection(conn.getContentType(),ex); 189 } 190 } 191 else { 192 try { 193 doRawStream(conn.getInputStream()); 194 } 195 catch (XmlReaderException ex) { 196 doLenientDetection(null,ex); 197 } 198 } 199 } 200 201 216 public XmlReader(InputStream is,String httpContentType) throws IOException { 217 this(is,httpContentType,true); 218 } 219 220 248 public XmlReader(InputStream is,String httpContentType,boolean lenient) throws IOException, XmlReaderException { 249 try { 250 doHttpStream(is,httpContentType); 251 } 252 catch (XmlReaderException ex) { 253 if (!lenient) { 254 throw ex; 255 } 256 else { 257 doLenientDetection(httpContentType,ex); 258 } 259 } 260 } 261 262 private void doLenientDetection(String httpContentType,XmlReaderException ex) throws IOException { 263 if (httpContentType!=null) { 264 if (httpContentType.startsWith("text/html")) { 265 httpContentType = httpContentType.substring("text/html".length()); 266 httpContentType = "text/xml" + httpContentType; 267 try { 268 doHttpStream(ex.getInputStream(),httpContentType); 269 ex = null; 270 } 271 catch (XmlReaderException ex2) { 272 ex = ex2; 273 } 274 } 275 } 276 if (ex!=null) { 277 String encoding = ex.getXmlEncoding(); 278 if (encoding==null) { 279 encoding = ex.getContentTypeEncoding(); 280 } 281 if (encoding==null) { 282 encoding = UTF_8; 283 } 284 prepareReader(ex.getInputStream(),encoding); 285 } 286 } 287 288 294 public String getEncoding() { 295 return _encoding; 296 } 297 298 public int read(char[] buf,int offset,int len) throws IOException { 299 return _reader.read(buf,offset,len); 300 } 301 302 308 public void close() throws IOException { 309 _reader.close(); 310 } 311 312 private void doRawStream(InputStream is) throws IOException { 313 PushbackInputStream pis = new PushbackInputStream(is,PUSHBACK_MAX_SIZE); 314 String bomEnc = getBOMEncoding(pis); 315 String xmlGuessEnc = getXMLGuessEncoding(pis); 316 String xmlEnc = getXMLPrologEncoding(pis,xmlGuessEnc); 317 String encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, pis); 318 prepareReader(pis,encoding); 319 } 320 321 private void doHttpStream(InputStream is,String httpContentType) throws IOException { 322 PushbackInputStream pis = new PushbackInputStream(is,PUSHBACK_MAX_SIZE); 323 String cTMime = getContentTypeMime(httpContentType); 324 String cTEnc = getContentTypeEncoding(httpContentType); 325 String bomEnc = getBOMEncoding(pis); 326 String xmlGuessEnc = getXMLGuessEncoding(pis); 327 String xmlEnc = getXMLPrologEncoding(pis,xmlGuessEnc); 328 String encoding = calculateHttpEncoding(cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, pis); 329 prepareReader(pis,encoding); 330 } 331 332 private void prepareReader(InputStream is,String encoding) throws IOException { 333 _reader = new InputStreamReader(is,encoding); 334 _encoding = encoding; 335 } 336 337 private static String calculateRawEncoding(String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is) throws IOException { 339 String encoding; 340 if (bomEnc==null) { 341 if (xmlGuessEnc==null || xmlEnc==null) { 342 encoding = UTF_8; 343 } 344 else 345 if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) { 346 encoding = xmlGuessEnc; 347 } 348 else { 349 encoding = xmlEnc; 350 } 351 } 352 else 353 if (bomEnc.equals(UTF_8)) { 354 if (xmlGuessEnc!=null && !xmlGuessEnc.equals(UTF_8)) { 355 throw new XmlReaderException(RAW_EX_1.format(new Object []{bomEnc,xmlGuessEnc,xmlEnc}), 356 bomEnc,xmlGuessEnc,xmlEnc,is); 357 } 358 if (xmlEnc!=null && !xmlEnc.equals(UTF_8)) { 359 throw new XmlReaderException(RAW_EX_1.format(new Object []{bomEnc,xmlGuessEnc,xmlEnc}), 360 bomEnc,xmlGuessEnc,xmlEnc,is); 361 } 362 encoding = UTF_8; 363 } 364 else 365 if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) { 366 if (xmlGuessEnc!=null && !xmlGuessEnc.equals(bomEnc)) { 367 throw new IOException(RAW_EX_1.format(new Object []{bomEnc,xmlGuessEnc,xmlEnc})); 368 } 369 if (xmlEnc!=null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) { 370 throw new XmlReaderException(RAW_EX_1.format(new Object []{bomEnc,xmlGuessEnc,xmlEnc}), 371 bomEnc,xmlGuessEnc,xmlEnc,is); 372 } 373 encoding =bomEnc; 374 } 375 else { 376 throw new XmlReaderException(RAW_EX_2.format(new Object []{bomEnc,xmlGuessEnc,xmlEnc}), 377 bomEnc,xmlGuessEnc,xmlEnc,is); 378 } 379 return encoding; 380 } 381 382 private static String calculateHttpEncoding(String cTMime, String cTEnc, String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is) throws IOException { 384 boolean appXml = isAppXml(cTMime); 385 boolean textXml = isTextXml(cTMime); 386 String encoding; 387 if (appXml || textXml) { 388 if (cTEnc==null) { 389 if (appXml) { 390 encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, is); 391 } 392 else { 393 encoding = US_ASCII; 394 } 395 } 396 else 397 if (bomEnc!=null && (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE))) { 398 throw new XmlReaderException(HTTP_EX_1.format(new Object []{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}), 399 cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is); 400 } 401 else 402 if (cTEnc.equals(UTF_16)) { 403 if (bomEnc!=null && bomEnc.startsWith(UTF_16)) { 404 encoding = bomEnc; 405 } 406 else { 407 throw new XmlReaderException(HTTP_EX_2.format(new Object []{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}), 408 cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is); 409 } 410 } 411 else { 412 encoding = cTEnc; 413 } 414 } 415 else { 416 throw new XmlReaderException(HTTP_EX_3.format(new Object []{cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc}), 417 cTMime,cTEnc,bomEnc,xmlGuessEnc,xmlEnc,is); 418 } 419 return encoding; 420 } 421 422 private static String getContentTypeMime(String httpContentType) { 424 String mime = null; 425 if (httpContentType!=null) { 426 int i = httpContentType.indexOf(";"); 427 mime = ((i==-1) ? httpContentType : httpContentType.substring(0,i)).trim(); 428 } 429 return mime; 430 } 431 432 private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=([.[^; ]]*)"); 433 434 private static String getContentTypeEncoding(String httpContentType) { 436 String encoding = null; 437 if (httpContentType!=null) { 438 int i = httpContentType.indexOf(";"); 439 if (i>-1) { 440 String postMime = httpContentType.substring(i+1); 441 Matcher m = CHARSET_PATTERN.matcher(postMime); 442 encoding = (m.find()) ? m.group(1) : null; 443 encoding = (encoding!=null) ? encoding.toUpperCase() : null; 444 } 445 } 446 return encoding; 447 } 448 449 private static String getBOMEncoding(PushbackInputStream is) throws IOException { 452 String encoding = null; 453 int[] bytes = new int[3]; 454 bytes[0] = is.read(); 455 bytes[1] = is.read(); 456 bytes[2] = is.read(); 457 458 if (bytes[0] == 0xFE && bytes[1] == 0xFF) { 459 encoding = UTF_16BE; 460 is.unread(bytes[2]); 461 } 462 else 463 if (bytes[0] == 0xFF && bytes[1] == 0xFE) { 464 encoding = UTF_16LE; 465 is.unread(bytes[2]); 466 } 467 else 468 if (bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) { 469 encoding = UTF_8; 470 } 471 else { 472 for (int i=bytes.length-1;i>=0;i--) { 473 is.unread(bytes[i]); 474 } 475 } 476 return encoding; 477 } 478 479 private static String getXMLGuessEncoding(PushbackInputStream is) throws IOException { 481 String encoding = null; 482 int[] bytes = new int[4]; 483 bytes[0] = is.read(); 484 bytes[1] = is.read(); 485 bytes[2] = is.read(); 486 bytes[3] = is.read(); 487 for (int i=bytes.length-1;i>=0;i--) { 488 is.unread(bytes[i]); 489 } 490 491 if (bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00 && bytes[3] == 0x3F) { 492 encoding = UTF_16BE; 493 } 494 else 495 if (bytes[0] == 0x3C && bytes[1] == 0x00 && bytes[2] == 0x3F && bytes[3] == 0x00) { 496 encoding = UTF_16LE; 497 } 498 else 499 if (bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78 && bytes[3] == 0x6D) { 500 encoding = UTF_8; 501 } 502 return encoding; 503 } 504 505 private static final Pattern ENCODING_PATTERN = Pattern.compile("^<\\?xml.*encoding=\"(.*)\".*\\?>"); 506 507 private static String getXMLPrologEncoding(PushbackInputStream is,String guessedEnc) throws IOException { 509 String encoding = null; 510 if (guessedEnc!=null) { 511 byte[] bytes = new byte[PUSHBACK_MAX_SIZE]; 512 int offset = 0; 513 int max = PUSHBACK_MAX_SIZE; 514 int c = is.read(bytes,offset,max); 515 while (c!=-1 && offset<PUSHBACK_MAX_SIZE) { 516 offset += c; 517 max -= c; 518 c = is.read(bytes,offset,max); 519 } 520 int bytesRead = offset; 521 if (bytesRead>0) { 522 is.unread(bytes,0,bytesRead); 523 Reader reader = new InputStreamReader(new ByteArrayInputStream(bytes,0,bytesRead), guessedEnc); 524 BufferedReader br = new BufferedReader(reader); 525 String prolog = br.readLine(); 526 Matcher m = ENCODING_PATTERN.matcher(prolog); 527 encoding = (m.find()) ? m.group(1).toUpperCase() : null; 528 } 529 } 530 return encoding; 531 } 532 533 private static boolean isAppXml(String mime) { 535 return mime!=null && 536 (mime.equals("application/xml") || 537 mime.equals("application/xml-dtd") || 538 mime.equals("application/xml-external-parsed-entity") || 539 (mime.startsWith("application/") && mime.endsWith("+xml"))); 540 } 541 542 private static boolean isTextXml(String mime) { 544 return mime!=null && 545 (mime.equals("text/xml") || 546 mime.equals("text/xml-external-parsed-entity") || 547 (mime.startsWith("text/") && mime.endsWith("+xml"))); 548 } 549 550 private static final MessageFormat RAW_EX_1 = new MessageFormat ( 551 "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch"); 552 553 private static final MessageFormat RAW_EX_2 = new MessageFormat ( 554 "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM"); 555 556 private static final MessageFormat HTTP_EX_1 = new MessageFormat ( 557 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL"); 558 559 private static final MessageFormat HTTP_EX_2 = new MessageFormat ( 560 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch"); 561 562 private static final MessageFormat HTTP_EX_3 = new MessageFormat ( 563 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME"); 564 565 } 566 | Popular Tags |