1 76 package com.lowagie.text.xml.simpleparser; 77 78 import java.io.BufferedReader ; 79 import java.io.ByteArrayOutputStream ; 80 import java.io.IOException ; 81 import java.io.InputStream ; 82 import java.io.InputStreamReader ; 83 import java.io.Reader ; 84 import java.util.HashMap ; 85 import java.util.Stack ; 86 87 103 public class SimpleXMLParser { 104 105 private final static int UNKNOWN = 0; 106 private final static int TEXT = 1; 107 private final static int TAG_ENCOUNTERED = 2; 108 private final static int EXAMIN_TAG = 3; 109 private final static int TAG_EXAMINED = 4; 110 private final static int IN_CLOSETAG = 5; 111 private final static int SINGLE_TAG = 6; 112 private final static int CDATA = 7; 113 private final static int COMMENT = 8; 114 private final static int PI = 9; 115 private final static int ENTITY = 10; 116 private final static int QUOTE = 11; 117 private final static int ATTRIBUTE_KEY = 12; 118 private final static int ATTRIBUTE_EQUAL = 13; 119 private final static int ATTRIBUTE_VALUE = 14; 120 121 122 protected Stack stack; 123 124 protected int character = 0; 125 126 protected int previousCharacter = -1; 127 128 protected int lines = 1; 129 130 protected int columns = 0; 131 132 protected boolean eol = false; 133 134 protected int state; 135 136 protected boolean html; 137 138 protected StringBuffer text = new StringBuffer (); 139 140 protected StringBuffer entity = new StringBuffer (); 141 142 protected String tag = null; 143 144 protected HashMap attributes = null; 145 146 protected SimpleXMLDocHandler doc; 147 148 protected SimpleXMLDocHandlerComment comment; 149 150 int nested = 0; 151 152 protected int quoteCharacter = '"'; 153 154 String attributekey = null; 155 156 String attributevalue = null; 157 158 162 private SimpleXMLParser(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment comment, boolean html) { 163 this.doc = doc; 164 this.comment = comment; 165 this.html = html; 166 stack = new Stack (); 167 state = html ? TEXT : UNKNOWN; 168 } 169 170 174 private void go(Reader r) throws IOException { 175 BufferedReader reader; 176 if (r instanceof BufferedReader ) 177 reader = (BufferedReader )r; 178 else 179 reader = new BufferedReader (r); 180 doc.startDocument(); 181 while(true) { 182 if (previousCharacter == -1) { 184 character = reader.read(); 185 } 186 else { 188 character = previousCharacter; 189 previousCharacter = -1; 190 } 191 192 if (character == -1) { 194 if (html) { 195 if (html && state == TEXT) 196 flush(); 197 doc.endDocument(); 198 } else { 199 throwException("Missing end tag"); 200 } 201 return; 202 } 203 204 if (character == '\n' && eol) { 206 eol = false; 207 continue; 208 } else if (eol) { 209 eol = false; 210 } else if (character == '\n') { 211 lines++; 212 columns = 0; 213 } else if (character == '\r') { 214 eol = true; 215 character = '\n'; 216 lines++; 217 columns = 0; 218 } else { 219 columns++; 220 } 221 222 switch(state) { 223 case UNKNOWN: 225 if(character == '<') { 226 saveState(TEXT); 227 state = TAG_ENCOUNTERED; 228 } 229 break; 230 case TEXT: 232 if(character == '<') { 233 flush(); 234 saveState(state); 235 state = TAG_ENCOUNTERED; 236 } else if(character == '&') { 237 saveState(state); 238 entity.setLength(0); 239 state = ENTITY; 240 } else 241 text.append((char)character); 242 break; 243 case TAG_ENCOUNTERED: 246 initTag(); 247 if(character == '/') { 248 state = IN_CLOSETAG; 249 } else if (character == '?') { 250 restoreState(); 251 state = PI; 252 } else { 253 text.append((char)character); 254 state = EXAMIN_TAG; 255 } 256 break; 257 case EXAMIN_TAG: 260 if(character == '>') { 261 doTag(); 262 processTag(true); 263 initTag(); 264 state = restoreState(); 265 } else if(character == '/') { 266 state = SINGLE_TAG; 267 } else if(character == '-' && text.toString().equals("!-")) { 268 flush(); 269 state = COMMENT; 270 } else if(character == '[' && text.toString().equals("![CDATA")) { 271 flush(); 272 state = CDATA; 273 } else if(character == 'E' && text.toString().equals("!DOCTYP")) { 274 flush(); 275 state = PI; 276 } else if(Character.isWhitespace((char)character)) { 277 doTag(); 278 state = TAG_EXAMINED; 279 } else { 280 text.append((char)character); 281 } 282 break; 283 case TAG_EXAMINED: 285 if(character == '>') { 286 processTag(true); 287 initTag(); 288 state = restoreState(); 289 } else if(character == '/') { 290 state = SINGLE_TAG; 291 } else if(Character.isWhitespace((char)character)) { 292 } else { 294 text.append((char)character); 295 state = ATTRIBUTE_KEY; 296 } 297 break; 298 299 case IN_CLOSETAG: 301 if(character == '>') { 302 doTag(); 303 processTag(false); 304 if(!html && nested==0) return; 305 state = restoreState(); 306 } else { 307 if (!Character.isWhitespace((char)character)) 308 text.append((char)character); 309 } 310 break; 311 312 case SINGLE_TAG: 315 if(character != '>') 316 throwException("Expected > for tag: <"+tag+"/>"); 317 doTag(); 318 processTag(true); 319 processTag(false); 320 initTag(); 321 if(!html && nested==0) { 322 doc.endDocument(); 323 return; 324 } 325 state = restoreState(); 326 break; 327 328 case CDATA: 330 if(character == '>' 331 && text.toString().endsWith("]]")) { 332 text.setLength(text.length()-2); 333 flush(); 334 state = restoreState(); 335 } else 336 text.append((char)character); 337 break; 338 339 case COMMENT: 342 if(character == '>' 343 && text.toString().endsWith("--")) { 344 text.setLength(text.length() - 2); 345 flush(); 346 state = restoreState(); 347 } else 348 text.append((char)character); 349 break; 350 351 case PI: 353 if(character == '>') { 354 state = restoreState(); 355 if(state == TEXT) state = UNKNOWN; 356 } 357 break; 358 359 case ENTITY: 361 if(character == ';') { 362 state = restoreState(); 363 String cent = entity.toString(); 364 entity.setLength(0); 365 char ce = EntitiesToUnicode.decodeEntity(cent); 366 if (ce == '\0') 367 text.append('&').append(cent).append(';'); 368 else 369 text.append(ce); 370 } else if ((character != '#' && (character < '0' || character > '9') && (character < 'a' || character > 'z') 371 && (character < 'A' || character > 'Z')) || entity.length() >= 7) { 372 state = restoreState(); 373 previousCharacter = character; 374 text.append('&').append(entity.toString()); 375 entity.setLength(0); 376 } 377 else { 378 entity.append((char)character); 379 } 380 break; 381 case QUOTE: 383 if (html && quoteCharacter == ' ' && character == '>') { 384 flush(); 385 processTag(true); 386 initTag(); 387 state = restoreState(); 388 } 389 else if (html && quoteCharacter == ' ' && Character.isWhitespace((char)character)) { 390 flush(); 391 state = TAG_EXAMINED; 392 } 393 else if (html && quoteCharacter == ' ') { 394 text.append((char)character); 395 } 396 else if(character == quoteCharacter) { 397 flush(); 398 state = TAG_EXAMINED; 399 } else if(" \r\n\u0009".indexOf(character)>=0) { 400 text.append(' '); 401 } else if(character == '&') { 402 saveState(state); 403 state = ENTITY; 404 entity.setLength(0); 405 } else { 406 text.append((char)character); 407 } 408 break; 409 410 case ATTRIBUTE_KEY: 411 if(Character.isWhitespace((char)character)) { 412 flush(); 413 state = ATTRIBUTE_EQUAL; 414 } else if(character == '=') { 415 flush(); 416 state = ATTRIBUTE_VALUE; 417 } else if (html && character == '>') { 418 text.setLength(0); 419 processTag(true); 420 initTag(); 421 state = restoreState(); 422 } else { 423 text.append((char)character); 424 } 425 break; 426 427 case ATTRIBUTE_EQUAL: 428 if(character == '=') { 429 state = ATTRIBUTE_VALUE; 430 } else if(Character.isWhitespace((char)character)) { 431 } else if (html && character == '>') { 433 text.setLength(0); 434 processTag(true); 435 initTag(); 436 state = restoreState(); 437 } else if (html && character == '/') { 438 flush(); 439 state = SINGLE_TAG; 440 } else if (html) { 441 flush(); 442 text.append((char)character); 443 state = ATTRIBUTE_KEY; 444 } else { 445 throwException("Error in attribute processing."); 446 } 447 break; 448 449 case ATTRIBUTE_VALUE: 450 if(character == '"' || character == '\'') { 451 quoteCharacter = character; 452 state = QUOTE; 453 } else if(Character.isWhitespace((char)character)) { 454 } else if (html && character == '>') { 456 flush(); 457 processTag(true); 458 initTag(); 459 state = restoreState(); 460 } else if (html) { 461 text.append((char)character); 462 quoteCharacter = ' '; 463 state = QUOTE; 464 } else { 465 throwException("Error in attribute processing"); 466 } 467 break; 468 } 469 } 470 } 471 472 476 private int restoreState() { 477 if(!stack.empty()) 478 return ((Integer )stack.pop()).intValue(); 479 else 480 return UNKNOWN; 481 } 482 486 private void saveState(int s) { 487 stack.push(new Integer (s)); 488 } 489 494 private void flush() { 495 switch(state){ 496 case TEXT: 497 case CDATA: 498 if(text.length() > 0) { 499 doc.text(text.toString()); 500 } 501 break; 502 case COMMENT: 503 if (comment != null) { 504 comment.comment(text.toString()); 505 } 506 break; 507 case ATTRIBUTE_KEY: 508 attributekey = text.toString(); 509 if (html) 510 attributekey = attributekey.toLowerCase(); 511 break; 512 case QUOTE: 513 case ATTRIBUTE_VALUE: 514 attributevalue = text.toString(); 515 attributes.put(attributekey,attributevalue); 516 break; 517 default: 518 } 520 text.setLength(0); 521 } 522 525 private void initTag() { 526 tag = null; 527 attributes = new HashMap (); 528 } 529 530 private void doTag() { 531 if(tag == null) 532 tag = text.toString(); 533 if (html) 534 tag = tag.toLowerCase(); 535 text.setLength(0); 536 } 537 541 private void processTag(boolean start) { 542 if (start) { 543 nested++; 544 doc.startElement(tag,attributes); 545 } 546 else { 547 nested--; 548 doc.endElement(tag); 549 } 550 } 551 552 private void throwException(String s) throws IOException { 553 throw new IOException (s+" near line " + lines + ", column " + columns); 554 } 555 556 562 public static void parse(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment comment, Reader r, boolean html) throws IOException { 563 SimpleXMLParser parser = new SimpleXMLParser(doc, comment, html); 564 parser.go(r); 565 } 566 567 573 public static void parse(SimpleXMLDocHandler doc, InputStream in) throws IOException { 574 byte b4[] = new byte[4]; 575 int count = in.read(b4); 576 if (count != 4) 577 throw new IOException ("Insufficient length."); 578 String encoding = getEncodingName(b4); 579 String decl = null; 580 if (encoding.equals("UTF-8")) { 581 StringBuffer sb = new StringBuffer (); 582 int c; 583 while ((c = in.read()) != -1) { 584 if (c == '>') 585 break; 586 sb.append((char)c); 587 } 588 decl = sb.toString(); 589 } 590 else if (encoding.equals("CP037")) { 591 ByteArrayOutputStream bi = new ByteArrayOutputStream (); 592 int c; 593 while ((c = in.read()) != -1) { 594 if (c == 0x6e) break; 596 bi.write(c); 597 } 598 decl = new String (bi.toByteArray(), "CP037"); 599 } 600 if (decl != null) { 601 decl = getDeclaredEncoding(decl); 602 if (decl != null) 603 encoding = decl; 604 } 605 parse(doc, new InputStreamReader (in, IanaEncodings.getJavaEncoding(encoding))); 606 } 607 608 private static String getDeclaredEncoding(String decl) { 609 if (decl == null) 610 return null; 611 int idx = decl.indexOf("encoding"); 612 if (idx < 0) 613 return null; 614 int idx1 = decl.indexOf('"', idx); 615 int idx2 = decl.indexOf('\'', idx); 616 if (idx1 == idx2) 617 return null; 618 if ((idx1 < 0 && idx2 > 0) || (idx2 > 0 && idx2 < idx1)) { 619 int idx3 = decl.indexOf('\'', idx2 + 1); 620 if (idx3 < 0) 621 return null; 622 return decl.substring(idx2 + 1, idx3); 623 } 624 if ((idx2 < 0 && idx1 > 0) || (idx1 > 0 && idx1 < idx2)) { 625 int idx3 = decl.indexOf('"', idx1 + 1); 626 if (idx3 < 0) 627 return null; 628 return decl.substring(idx1 + 1, idx3); 629 } 630 return null; 631 } 632 633 public static void parse(SimpleXMLDocHandler doc,Reader r) throws IOException { 634 parse(doc, null, r, false); 635 } 636 637 643 public static String escapeXML(String s, boolean onlyASCII) { 644 char cc[] = s.toCharArray(); 645 int len = cc.length; 646 StringBuffer sb = new StringBuffer (); 647 for (int k = 0; k < len; ++k) { 648 int c = cc[k]; 649 switch (c) { 650 case '<': 651 sb.append("<"); 652 break; 653 case '>': 654 sb.append(">"); 655 break; 656 case '&': 657 sb.append("&"); 658 break; 659 case '"': 660 sb.append("""); 661 break; 662 case '\'': 663 sb.append("'"); 664 break; 665 default: 666 if (onlyASCII && c > 127) 667 sb.append("&#").append(c).append(';'); 668 else 669 sb.append((char)c); 670 } 671 } 672 return sb.toString(); 673 } 674 683 private static String getEncodingName(byte[] b4) { 684 685 int b0 = b4[0] & 0xFF; 687 int b1 = b4[1] & 0xFF; 688 if (b0 == 0xFE && b1 == 0xFF) { 689 return "UTF-16BE"; 691 } 692 if (b0 == 0xFF && b1 == 0xFE) { 693 return "UTF-16LE"; 695 } 696 697 int b2 = b4[2] & 0xFF; 699 if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) { 700 return "UTF-8"; 701 } 702 703 int b3 = b4[3] & 0xFF; 705 if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) { 706 return "ISO-10646-UCS-4"; 708 } 709 if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) { 710 return "ISO-10646-UCS-4"; 712 } 713 if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) { 714 return "ISO-10646-UCS-4"; 717 } 718 if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) { 719 return "ISO-10646-UCS-4"; 722 } 723 if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) { 724 return "UTF-16BE"; 728 } 729 if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) { 730 return "UTF-16LE"; 733 } 734 if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) { 735 return "CP037"; 738 } 739 740 return "UTF-8"; 742 } 743 } | Popular Tags |