| 1 47 package com.lowagie.text.pdf; 48 49 import java.io.*; 50 import java.util.Stack; 51 import java.util.HashMap; 52 53 72 public class SimpleXMLParser { 73 private static final HashMap fIANA2JavaMap = new HashMap(); 74 private static final HashMap entityMap = new HashMap(); 75 76 private static int popMode(Stack st) { 77 if(!st.empty()) 78 return ((Integer)st.pop()).intValue(); 79 else 80 return PRE; 81 } 82 83 private final static int 84 TEXT = 1, 85 ENTITY = 2, 86 OPEN_TAG = 3, 87 CLOSE_TAG = 4, 88 START_TAG = 5, 89 ATTRIBUTE_LVALUE = 6, 90 ATTRIBUTE_EQUAL = 9, 91 ATTRIBUTE_RVALUE = 10, 92 QUOTE = 7, 93 IN_TAG = 8, 94 SINGLE_TAG = 12, 95 COMMENT = 13, 96 DONE = 11, 97 DOCTYPE = 14, 98 PRE = 15, 99 CDATA = 16; 100 101 private SimpleXMLParser() { 102 } 103 104 110 public static void parse(SimpleXMLDocHandler doc, InputStream in) throws IOException { 111 byte b4[] = new byte[4]; 112 int count = in.read(b4); 113 if (count != 4) 114 throw new IOException("Insufficient length."); 115 String encoding = getEncodingName(b4); 116 String decl = null; 117 if (encoding.equals("UTF-8")) { 118 StringBuffer sb = new StringBuffer(); 119 int c; 120 while ((c = in.read()) != -1) { 121 if (c == '>') 122 break; 123 sb.append((char)c); 124 } 125 decl = sb.toString(); 126 } 127 else if (encoding.equals("CP037")) { 128 ByteArrayOutputStream bi = new ByteArrayOutputStream(); 129 int c; 130 while ((c = in.read()) != -1) { 131 if (c == 0x6e) break; 133 bi.write(c); 134 } 135 decl = new String(bi.toByteArray(), "CP037"); 136 } 137 if (decl != null) { 138 decl = getDeclaredEncoding(decl); 139 if (decl != null) 140 encoding = decl; 141 } 142 parse(doc, new InputStreamReader(in, getJavaEncoding(encoding))); 143 } 144 145 private static String getDeclaredEncoding(String decl) { 146 if (decl == null) 147 return null; 148 int idx = decl.indexOf("encoding"); 149 if (idx < 0) 150 return null; 151 int idx1 = decl.indexOf('"', idx); 152 int idx2 = decl.indexOf('\'', idx); 153 if (idx1 == idx2) 154 return null; 155 if ((idx1 < 0 && idx2 > 0) || (idx2 > 0 && idx2 < idx1)) { 156 int idx3 = decl.indexOf('\'', idx2 + 1); 157 if (idx3 < 0) 158 return null; 159 return decl.substring(idx2 + 1, idx3); 160 } 161 if ((idx2 < 0 && idx1 > 0) || (idx1 > 0 && idx1 < idx2)) { 162 int idx3 = decl.indexOf('"', idx1 + 1); 163 if (idx3 < 0) 164 return null; 165 return decl.substring(idx1 + 1, idx3); 166 } 167 return null; 168 } 169 170 176 public static String getJavaEncoding(String iana) { 177 String IANA = iana.toUpperCase(); 178 String jdec = (String)fIANA2JavaMap.get(IANA); 179 if (jdec == null) 180 jdec = iana; 181 return jdec; 182 } 183 184 public static void parse(SimpleXMLDocHandler doc,Reader r) throws IOException { 185 parse(doc, null, r, false); 186 } 187 188 194 public static void parse(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment comment, Reader r, boolean html) throws IOException { 195 BufferedReader reader; 196 if (r instanceof BufferedReader) 197 reader = (BufferedReader)r; 198 else 199 reader = new BufferedReader(r); 200 Stack st = new Stack(); 201 int depth = 0; 202 int mode = PRE; 203 int c = 0; 204 int quotec = '"'; 205 depth = 0; 206 StringBuffer sb = new StringBuffer(); 207 StringBuffer etag = new StringBuffer(); 208 String tagName = null; 209 String lvalue = null; 210 String rvalue = null; 211 HashMap attrs = null; 212 st = new Stack(); 213 doc.startDocument(); 214 int line=1, col=0; 215 boolean eol = false; 216 if (html) 217 mode = TEXT; 218 int pushBack = -1; 219 while(true) { 220 if (pushBack != -1) { 221 c = pushBack; 222 pushBack = -1; 223 } 224 else 225 c = reader.read(); 226 if (c == -1) 227 break; 228 229 if(c == '\n' && eol) { 232 eol = false; 233 continue; 234 } else if(eol) { 235 eol = false; 236 } else if(c == '\n') { 237 line++; 238 col=0; 239 } else if(c == '\r') { 240 eol = true; 241 c = '\n'; 242 line++; 243 col=0; 244 } else { 245 col++; 246 } 247 248 if(mode == DONE) { 249 doc.endDocument(); 250 return; 251 252 } else if(mode == TEXT) { 254 if(c == '<') { 255 st.push(new Integer(mode)); 256 mode = START_TAG; 257 if(sb.length() > 0) { 258 doc.text(sb.toString()); 259 sb.setLength(0); 260 } 261 } else if(c == '&') { 262 st.push(new Integer(mode)); 263 mode = ENTITY; 264 etag.setLength(0); 265 } else 266 sb.append((char)c); 267 268 } else if(mode == CLOSE_TAG) { 270 if(c == '>') { 271 mode = popMode(st); 272 tagName = sb.toString(); 273 if (html) 274 tagName = tagName.toLowerCase(); 275 sb.setLength(0); 276 depth--; 277 if(!html && depth==0) 278 mode = DONE; 279 doc.endElement(tagName); 280 } else { 281 if (!Character.isWhitespace((char)c)) 282 sb.append((char)c); 283 } 284 285 } else if(mode == CDATA) { 287 if(c == '>' 288 && sb.toString().endsWith("]]")) { 289 sb.setLength(sb.length()-2); 290 doc.text(sb.toString()); 291 sb.setLength(0); 292 mode = popMode(st); 293 } else 294 sb.append((char)c); 295 296 } else if(mode == COMMENT) { 299 if(c == '>' 300 && sb.toString().endsWith("--")) { 301 if (comment != null) { 302 sb.setLength(sb.length() - 2); 303 comment.comment(sb.toString()); 304 } 305 sb.setLength(0); 306 mode = popMode(st); 307 } else 308 sb.append((char)c); 309 310 } else if(mode == PRE) { 312 if(c == '<') { 313 mode = TEXT; 314 st.push(new Integer(mode)); 315 mode = START_TAG; 316 } 317 318 } else if(mode == DOCTYPE) { 321 if(c == '>') { 322 mode = popMode(st); 323 if(mode == TEXT) mode = PRE; 324 } 325 326 } else if(mode == START_TAG) { 330 mode = popMode(st); 331 if(c == '/') { 332 st.push(new Integer(mode)); 333 mode = CLOSE_TAG; 334 } else if (c == '?') { 335 mode = DOCTYPE; 336 } else { 337 st.push(new Integer(mode)); 338 mode = OPEN_TAG; 339 tagName = null; 340 attrs = new HashMap(); 341 sb.append((char)c); 342 } 343 344 } else if(mode == ENTITY) { 346 if(c == ';') { 347 mode = popMode(st); 348 String cent = etag.toString(); 349 etag.setLength(0); 350 if(cent.startsWith("#x")) { 351 try { 352 char ci = (char)Integer.parseInt(cent.substring(2),16); 353 sb.append(ci); 354 } 355 catch (Exception es) { 356 sb.append('&').append(cent).append(';'); 357 } 358 } 359 else if(cent.startsWith("#")) { 360 try { 361 char ci = (char)Integer.parseInt(cent.substring(1)); 362 sb.append(ci); 363 } 364 catch (Exception es) { 365 sb.append('&').append(cent).append(';'); 366 } 367 } 368 else { 369 char ce = decodeEntity(cent); 370 if (ce == '\0') 371 sb.append('&').append(cent).append(';'); 372 else 373 sb.append(ce); 374 } 375 } else if ((c != '#' && (c < '0' || c > '9') && (c < 'a' || c > 'z') 376 && (c < 'A' || c > 'Z')) || etag.length() >= 7) { 377 mode = popMode(st); 378 pushBack = c; 379 sb.append('&').append(etag.toString()); 380 etag.setLength(0); 381 } 382 else { 383 etag.append((char)c); 384 } 385 386 } else if(mode == SINGLE_TAG) { 390 if(tagName == null) 391 tagName = sb.toString(); 392 if (html) 393 tagName = tagName.toLowerCase(); 394 if(c != '>') 395 exc("Expected > for tag: <"+tagName+"/>",line,col); 396 doc.startElement(tagName,attrs); 397 doc.endElement(tagName); 398 if(!html && depth==0) { 399 doc.endDocument(); 400 return; 401 } 402 sb.setLength(0); 403 attrs = new HashMap(); 404 tagName = null; 405 mode = popMode(st); 406 407 } else if(mode == OPEN_TAG) { 411 if(c == '>') { 412 if(tagName == null) 413 tagName = sb.toString(); 414 if (html) 415 tagName = tagName.toLowerCase(); 416 sb.setLength(0); 417 depth++; 418 doc.startElement(tagName,attrs); 419 tagName = null; 420 attrs = new HashMap(); 421 mode = popMode(st); 422 } else if(c == '/') { 423 mode = SINGLE_TAG; 424 } else if(c == '-' && sb.toString().equals("!-")) { 425 mode = COMMENT; 426 sb.setLength(0); 427 } else if(c == '[' && sb.toString().equals("![CDATA")) { 428 mode = CDATA; 429 sb.setLength(0); 430 } else if(c == 'E' && sb.toString().equals("!DOCTYP")) { 431 sb.setLength(0); 432 mode = DOCTYPE; 433 } else if(Character.isWhitespace((char)c)) { 434 tagName = sb.toString(); 435 if (html) 436 tagName = tagName.toLowerCase(); 437 sb.setLength(0); 438 mode = IN_TAG; 439 } else { 440 sb.append((char)c); 441 } 442 443 } else if(mode == QUOTE) { 446 if (html && quotec == ' ' && c == '>') { 447 rvalue = sb.toString(); 448 sb.setLength(0); 449 attrs.put(lvalue,rvalue); 450 mode = popMode(st); 451 doc.startElement(tagName,attrs); 452 depth++; 453 tagName = null; 454 attrs = new HashMap(); 455 } 456 else if (html && quotec == ' ' && Character.isWhitespace((char)c)) { 457 rvalue = sb.toString(); 458 sb.setLength(0); 459 attrs.put(lvalue,rvalue); 460 mode = IN_TAG; 461 } 462 else if (html && quotec == ' ') { 463 sb.append((char)c); 464 } 465 else if(c == quotec) { 466 rvalue = sb.toString(); 467 sb.setLength(0); 468 attrs.put(lvalue,rvalue); 469 mode = IN_TAG; 470 } else if(" \r\n\u0009".indexOf(c)>=0) { 473 sb.append(' '); 474 } else if(c == '&') { 475 st.push(new Integer(mode)); 476 mode = ENTITY; 477 etag.setLength(0); 478 } else { 479 sb.append((char)c); 480 } 481 482 } else if(mode == ATTRIBUTE_RVALUE) { 483 if(c == '"' || c == '\'') { 484 quotec = c; 485 mode = QUOTE; 486 } else if(Character.isWhitespace((char)c)) { 487 ; 488 } else if (html && c == '>') { 489 attrs.put(lvalue,sb.toString()); 490 sb.setLength(0); 491 mode = popMode(st); 492 doc.startElement(tagName,attrs); 493 depth++; 494 tagName = null; 495 attrs = new HashMap(); 496 } else if (html) { 497 sb.append((char)c); 498 quotec = ' '; 499 mode = QUOTE; 500 } else { 501 exc("Error in attribute processing",line,col); 502 } 503 504 } else if(mode == ATTRIBUTE_LVALUE) { 505 if(Character.isWhitespace((char)c)) { 506 lvalue = sb.toString(); 507 if (html) 508 lvalue = lvalue.toLowerCase(); 509 sb.setLength(0); 510 mode = ATTRIBUTE_EQUAL; 511 } else if(c == '=') { 512 lvalue = sb.toString(); 513 if (html) 514 lvalue = lvalue.toLowerCase(); 515 sb.setLength(0); 516 mode = ATTRIBUTE_RVALUE; 517 } else if (html && c == '>') { 518 sb.setLength(0); 519 mode = popMode(st); 520 doc.startElement(tagName,attrs); 521 depth++; 522 tagName = null; 523 attrs = new HashMap(); 524 } else { 525 sb.append((char)c); 526 } 527 528 } else if(mode == ATTRIBUTE_EQUAL) { 529 if(c == '=') { 530 mode = ATTRIBUTE_RVALUE; 531 } else if(Character.isWhitespace((char)c)) { 532 ; 533 } else if (html && c == '>') { 534 sb.setLength(0); 535 mode = popMode(st); 536 doc.startElement(tagName,attrs); 537 depth++; 538 tagName = null; 539 attrs = new HashMap(); 540 } else if (html && c == '/') { 541 sb.setLength(0); 542 mode = SINGLE_TAG; 543 } else if (html) { 544 sb.setLength(0); 545 sb.append((char)c); 546 mode = ATTRIBUTE_LVALUE; 547 } else { 548 exc("Error in attribute processing.",line,col); 549 } 550 551 } else if(mode == IN_TAG) { 552 if(c == '>') { 553 mode = popMode(st); 554 doc.startElement(tagName,attrs); 555 depth++; 556 tagName = null; 557 attrs = new HashMap(); 558 } else if(c == '/') { 559 mode = SINGLE_TAG; 560 } else if(Character.isWhitespace((char)c)) { 561 ; 562 } else { 563 mode = ATTRIBUTE_LVALUE; 564 sb.append((char)c); 565 } 566 } 567 } 568 if(html || mode == DONE) { 569 if (html && mode == TEXT) 570 doc.text(sb.toString()); 571 doc.endDocument(); 572 } 573 else 574 exc("missing end tag",line,col); 575 } 576 private static void exc(String s,int line,int col) throws IOException { 577 throw new IOException(s+" near line "+line+", column "+col); 578 } 579 580 586 public static String escapeXML(String s, boolean onlyASCII) { 587 char cc[] = s.toCharArray(); 588 int len = cc.length; 589 StringBuffer sb = new StringBuffer(); 590 for (int k = 0; k < len; ++k) { 591 int c = cc[k]; 592 switch (c) { 593 case '<': 594 sb.append("<"); 595 break; 596 case '>': 597 sb.append(">"); 598 break; 599 case '&': 600 sb.append("&"); 601 break; 602 case '"': 603 sb.append("""); 604 break; 605 case '\'': 606 sb.append("'"); 607 break; 608 default: 609 if (onlyASCII && c > 127) 610 sb.append("&#").append(c).append(";"); 611 else 612 sb.append((char)c); 613 } 614 } 615 return sb.toString(); 616 } 617 618 public static char decodeEntity(String s) { 619 Character c = (Character)entityMap.get(s); 620 if (c == null) 621 return '\0'; 622 else 623 return c.charValue(); 624 } 625 626 private static String getEncodingName(byte[] b4) { 627 628 int b0 = b4[0] & 0xFF; 630 int b1 = b4[1] & 0xFF; 631 if (b0 == 0xFE && b1 == 0xFF) { 632 return "UTF-16BE"; 634 } 635 if (b0 == 0xFF && b1 == 0xFE) { 636 return "UTF-16LE"; 638 } 639 640 int b2 = b4[2] & 0xFF; 642 if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) { 643 return "UTF-8"; 644 } 645 646 int b3 = b4[3] & 0xFF; 648 if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) { 649 return "ISO-10646-UCS-4"; 651 } 652 if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) { 653 return "ISO-10646-UCS-4"; 655 } 656 if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) { 657 return "ISO-10646-UCS-4"; 660 } 661 if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) { 662 return "ISO-10646-UCS-4"; 665 } 666 if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) { 667 return "UTF-16BE"; 671 } 672 if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) { 673 return "UTF-16LE"; 676 } 677 if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) { 678 return "CP037"; 681 } 682 683 return "UTF-8"; 685 } 686 687 static { 688 fIANA2JavaMap.put("BIG5", "Big5"); 690 fIANA2JavaMap.put("CSBIG5", "Big5"); 691 fIANA2JavaMap.put("CP037", "CP037"); 692 fIANA2JavaMap.put("IBM037", "CP037"); 693 fIANA2JavaMap.put("CSIBM037", "CP037"); 694 fIANA2JavaMap.put("EBCDIC-CP-US", "CP037"); 695 fIANA2JavaMap.put("EBCDIC-CP-CA", "CP037"); 696 fIANA2JavaMap.put("EBCDIC-CP-NL", "CP037"); 697 fIANA2JavaMap.put("EBCDIC-CP-WT", "CP037"); 698 fIANA2JavaMap.put("IBM277", "CP277"); 699 fIANA2JavaMap.put("CP277", "CP277"); 700 fIANA2JavaMap.put("CSIBM277", "CP277"); 701 fIANA2JavaMap.put("EBCDIC-CP-DK", "CP277"); 702 fIANA2JavaMap.put("EBCDIC-CP-NO", "CP277"); 703 fIANA2JavaMap.put("IBM278", "CP278"); 704 fIANA2JavaMap.put("CP278", "CP278"); 705 fIANA2JavaMap.put("CSIBM278", "CP278"); 706 fIANA2JavaMap.put("EBCDIC-CP-FI", "CP278"); 707 fIANA2JavaMap.put("EBCDIC-CP-SE", "CP278"); 708 fIANA2JavaMap.put("IBM280", "CP280"); 709 fIANA2JavaMap.put("CP280", "CP280"); 710 fIANA2JavaMap.put("CSIBM280", "CP280"); 711 fIANA2JavaMap.put("EBCDIC-CP-IT", "CP280"); 712 fIANA2JavaMap.put("IBM284", "CP284"); 713 fIANA2JavaMap.put("CP284", "CP284"); 714 fIANA2JavaMap.put("CSIBM284", "CP284"); 715 fIANA2JavaMap.put("EBCDIC-CP-ES", "CP284"); 716 fIANA2JavaMap.put("EBCDIC-CP-GB", "CP285"); 717 fIANA2JavaMap.put("IBM285", "CP285"); 718 fIANA2JavaMap.put("CP285",
|