1 package com.quadcap.text.sax; 2 3 40 41 import java.io.CharArrayWriter ; 42 import java.io.IOException ; 43 import java.io.InputStreamReader ; 44 import java.io.Reader ; 45 46 import org.xml.sax.DocumentHandler ; 47 import org.xml.sax.DTDHandler ; 48 import org.xml.sax.EntityResolver ; 49 import org.xml.sax.ErrorHandler ; 50 import org.xml.sax.HandlerBase ; 51 import org.xml.sax.InputSource ; 52 import org.xml.sax.SAXException ; 53 54 import com.quadcap.text.NoStringPool; 55 import com.quadcap.text.StringPool; 56 import com.quadcap.util.collections.ArrayQueue; 57 58 import com.quadcap.util.Debug; 59 60 65 public class Parser implements org.xml.sax.Parser { 66 boolean docStarted = false; 67 HandlerBase defaultHandler = new HandlerBase (); 68 StringPool pool = new NoStringPool(); 69 InputSource in; 70 Reader r; 71 DocumentHandler docHandler = defaultHandler; 72 DTDHandler dtdHandler = defaultHandler; 73 EntityResolver entityResolver = defaultHandler; 74 ErrorHandler errorHandler = defaultHandler; 75 char[] ebuf = new char[6]; 76 char[] tag = new char[1024*32]; 77 int taglen = 0; 78 CharArrayWriter data = new CharArrayWriter (); 79 AttributeList attributes = new AttributeList(); 80 String attrName = null; 81 String tagName = null; 82 ArrayQueue inStack = null; 83 ArrayQueue locStack = null; 84 int lineNumber = 1; 85 int columnNumber = 1; 86 String lastEntityVal = ""; 87 boolean trace = false; 88 int commentLevel = 0; 89 90 public Parser() {} 91 92 public void parse(InputSource in) throws SAXException ,IOException { 93 this.in = in; 94 this.r = getCharacterStream(in); 95 taglen = 0; 96 lineNumber = 1; 97 columnNumber = 1; 98 data.reset(); 99 try { 100 parse(); 101 } catch (SAXException ex) { 102 if (locStack != null) { 103 for (int i = 0; i < locStack.size(); i++) { 104 com.quadcap.util.Debug.println(" at " + locStack.top(i)); 105 } 106 } 107 throw ex; 108 } 109 } 110 111 final Reader getCharacterStream(InputSource in) { 112 Reader rd = in.getCharacterStream(); 113 if (rd == null) { 114 rd = new InputStreamReader (in.getByteStream()); 115 } 116 return rd; 117 } 118 119 public void pushInputSource(InputSource in2) { 120 if (inStack == null) { 121 inStack = new ArrayQueue(); 122 locStack = new ArrayQueue(); 123 } 124 inStack.push(in); 125 locStack.push("" + lineNumber + ":" + columnNumber); 126 lineNumber = 1; 127 columnNumber = 1; 128 in = in2; 129 r = getCharacterStream(in); 130 } 131 132 boolean popInputSource() { 133 if (inStack == null || inStack.size() == 0) return false; 134 in = (InputSource )inStack.pop(); 135 String s = locStack.pop().toString(); 136 int idx = s.indexOf(':'); 137 lineNumber = Integer.parseInt(s.substring(0, idx)); 138 columnNumber = Integer.parseInt(s.substring(idx+1)); 139 r = getCharacterStream(in); 140 return true; 141 } 142 143 final void addTagChar(int c) throws SAXException { 144 if (taglen >= tag.length) throw new SAXException ("tag too long"); 145 tag[taglen++] = (char)c; 146 } 147 148 public void parse(String s) { 149 } 150 151 public void setDocumentHandler(DocumentHandler dh) { 152 this.docHandler = dh; 153 } 154 155 public void setDTDHandler(DTDHandler dh) { 156 this.dtdHandler = dh; 157 } 158 159 public void setEntityResolver(EntityResolver er) { 160 this.entityResolver = er; 161 } 162 163 public EntityResolver getEntityResolver() { 164 return entityResolver; 165 } 166 167 public void setErrorHandler(ErrorHandler er) { 168 errorHandler = er; 169 } 170 171 public void setLocale(java.util.Locale locale) { 172 } 173 174 final int read() throws IOException { 175 int c = r.read(); 176 if (c == '\n') { 177 lineNumber++; 178 columnNumber = 1; 179 } else { 180 columnNumber++; 181 } 182 return c; 183 } 184 185 final char parseEntity() throws SAXException , IOException { 186 int len = 0; 187 int c; 188 int state = 0; 189 while ((c = read()) >= 0) { 190 ebuf[len++] = (char)c; 191 if (!Character.isLetter((char)c) || len >= ebuf.length) break; 192 } 193 lastEntityVal = new String (ebuf, 0, len); 194 if (len == 5 && ebuf[0] == 'q' && ebuf[1] == 'u' && 195 ebuf[2] == 'o' && ebuf[3] == 't') { 196 return '"'; 197 } 198 if (len == 4 && ebuf[0] == 'a' && ebuf[1] == 'm' && ebuf[2] == 'p') { 199 return '&'; 200 } 201 if (len == 3) { 202 if (ebuf[0] == 'l') { 203 if (ebuf[1] == 't') return '<'; 204 } else if (ebuf[0] == 'g') { 205 if (ebuf[1] == 't') return '>'; 206 } 207 } 208 throw new SAXException ("unknown entity: " + lastEntityVal); 209 210 } 211 212 public int step(int state, int c) throws SAXException , IOException { 213 switch (state) { 216 case 0: 217 if (c == '<') { 218 if (data.size() > 0) { 219 docHandler.characters(data.toCharArray(), 0, data.size()); 220 data.reset(); 221 } 222 state = 1; 223 } else { 224 if (c == '&') { 225 try { 226 c = parseEntity(); 227 } catch (SAXException e) { 228 data.write('&'); 229 data.write(lastEntityVal); 230 break; 231 } 232 } 233 data.write(c); 234 } 235 break; 236 case 1: switch (c) { 238 case '!': 239 state = 30; 240 break; 241 case '\\': 242 state = 4; 243 break; 244 case '/': 245 state = 8; 246 break; 247 case '?': 248 data.reset(); 249 state = 20; 250 break; 251 default: 252 addTagChar(c); 253 state = 5; 254 break; 255 } 256 break; 257 case 4: data.write('<'); 259 data.write(c); 260 state = 0; 261 break; 262 case 5: switch (c) { 264 case ' ': case '\r': case '\n': case '\t': 265 tagName = pool.intern(tag, 0, taglen); 266 taglen = 0; 267 state = 6; 268 break; 269 case '/': 270 tagName = pool.intern(tag, 0, taglen); 271 taglen = 0; 272 state = 9; 273 break; 274 case '>': 275 tagName = pool.intern(tag, 0, taglen); 276 taglen = 0; 277 state = 0; 278 startElement(tagName, attributes); 279 break; 280 case '<': 281 tagName = pool.intern(tag, 0, taglen); 282 taglen = 0; 283 if (data.size() > 0) { 284 docHandler.characters(data.toCharArray(), 285 0, data.size()); 286 data.reset(); 287 } 288 state = 1; 289 break; 290 default: 291 if (Character.isLetter((char)c) || 292 Character.isDigit((char)c) || 293 c == '.' || c == '-' || c == '_' || c == ':') { 294 addTagChar(c); 295 } else { 296 for (int i = 0; i < taglen; i++) { 299 data.write(tag[i]); 300 } 301 data.write(c); 302 state = 0; 303 taglen = 0; 304 break; 305 } 306 307 } 308 break; 309 case 6: switch (c) { 311 case ' ': case '\n': case '\r': case '\t': 312 break; 313 case '/': 314 state = 9; 315 break; 316 case '%': 317 addTagChar(c); 318 break; 319 case '>': 320 state = 0; 321 startElement(tagName, attributes); 322 break; 323 case '=': 324 attrName = pool.intern(tag, 0, taglen); 325 taglen = 0; 326 state = 10; 327 break; 328 case '<': 329 state = 61; 330 break; 331 default: 332 addTagChar(c); 333 } 334 break; 335 case 61: 336 switch (c) { 337 case '?': 338 state = 62; 339 break; 340 default: 341 addTagChar('<'); 342 addTagChar(c); 343 state = 6; 344 break; 345 } 346 break; 347 case 62: 348 switch (c) { 349 case '?': 350 state = 63; 351 break; 352 default: 353 addTagChar(c); 354 break; 355 } 356 break; 357 case 63: 358 switch(c) { 359 case '>': 360 addTagChar(c); 361 state = 6; 362 break; 363 default: 364 addTagChar('?'); 365 if (c != '?') state = 62; 366 break; 367 } 368 break; 369 case 8: if (c == '>') { 371 tagName = pool.intern(tag, 0, taglen); 372 taglen = 0; 373 state = 0; 374 docHandler.endElement(tagName); 375 } else { 376 addTagChar(c); 377 } 378 break; 379 case 9: if (c == '>') { 381 startElement(tagName, attributes); 382 state = 0; 383 docHandler.endElement(tagName); 384 } else { 385 addTagChar('/'); 386 addTagChar(c); 387 state = 6; 388 } 389 break; 390 case 10: if (c == '"') { 392 state = 12; 393 } else if (c == '\'') { 394 state = 121; 395 } else { 396 addTagChar(c); 397 state = 13; 398 } 399 break; 400 case 12: if (c == '"') { 402 attributes.addAttribute(attrName, "CDATA", 403 pool.intern(tag, 0, taglen)); 404 taglen = 0; 405 state = 6; 406 } else { 407 addTagChar(c); 408 } 409 break; 410 case 121: if (c == '\'') { 412 attributes.addAttribute(attrName, "CDATA", 413 pool.intern(tag, 0, taglen)); 414 taglen = 0; 415 state = 6; 416 } else { 417 addTagChar(c); 418 } 419 break; 420 case 13: switch (c) { 422 case ' ': 423 attributes.addAttribute(attrName, "CDATA", 424 pool.intern(tag, 0, taglen)); 425 taglen = 0; 426 state = 6; 427 break; 428 case '/': 429 state = 14; 430 break; 431 case '>': 432 attributes.addAttribute(attrName, "CDATA", 433 pool.intern(tag, 0, taglen)); 434 taglen = 0; 435 state = 0; 436 startElement(tagName, attributes); 437 break; 438 default: 439 addTagChar(c); 440 } 441 break; 442 case 14: if (c == '>') { 444 attributes.addAttribute(attrName, "CDATA", 445 pool.intern(tag, 0, taglen)); 446 taglen = 0; 447 state = 0; 448 startElement(tagName, attributes); 449 docHandler.endElement(tagName); 450 } else { 451 addTagChar('/'); 452 if (c != '/') { 453 addTagChar(c); 454 state = 13; 455 } 456 } 457 break; 458 case 15: 459 if (c == '-') state = 16; 460 break; 461 case 16: 462 if (c == '-') state = 17; 463 else state = 15; 464 break; 465 case 17: 466 if (c == '>') state = 0; 467 else if (c != '-') state = 15; 468 break; 469 case 20: 470 if (c == '?') state = 21; 471 else data.write(c); 472 break; 473 case 21: 474 if (c == '>') { 475 String s = data.toString().trim(); 476 if (s.startsWith("xml")) { 477 if (inStack == null || inStack.size() == 0) { 478 if (!docStarted) { 479 docStarted = true; 480 docHandler.startDocument(); 481 } 482 } 483 } else { 484 int idx = s.indexOf(' '); 485 String dat = ""; 486 String target = s; 487 if (idx >= 0) { 488 target = s.substring(0, idx); 489 dat = s.substring(idx+1).trim(); 490 } 491 docHandler.processingInstruction(target, dat); 492 } 493 data.reset(); 494 state = 0; 495 } else { 496 data.write('?'); 497 if (c != '?') { 498 data.write(c); 499 state = 20; 500 } 501 } 502 break; 503 case 30: if (c == '-') state = 31; 505 else if (c == '[') state = 41; 506 else state = 40; 507 break; 508 case 31: if (c == '-') { 510 commentLevel = 1; 511 state = 32; 512 } 513 else state = 40; 514 break; 515 case 32: if (c == '-') state = 33; 517 else if (c == '<') state = 320; 518 break; 519 case 320: if (c == '!') state = 321; 521 else if (c == '-') state = 33; 522 else state = 32; 523 break; 524 case 321: if (c == '-') state = 322; 526 else state = 32; 527 break; 528 case 322: if (c == '-') { 530 commentLevel++; 531 } 532 state = 32; 533 break; 534 case 33: if (c == '-') state = 34; 536 else state = 32; 537 break; 538 case 34: if (c == '>') { 540 if (--commentLevel == 0) { 541 state = 0; 542 } else { 543 state = 32; 544 } 545 } 546 else if (c != '-') state = 32; 547 break; 548 case 40: if (c == '>') state = 0; 550 break; 551 case 41: if (c == '[') { 553 if (data.toString().equals("CDATA")) { 554 data.reset(); 555 state = 42; 556 } else { 557 state = 40; 558 } 559 } else { 560 data.write(c); 561 } 562 break; 563 case 42: if (c == ']') { 565 state = 43; 566 } else { 567 data.write(c); 568 } 569 break; 570 case 43: if (c == ']') { 572 state = 44; 573 } else { 574 data.write(']'); 575 data.write(c); 576 state = 42; 577 } 578 break; 579 case 44: if (c == '>') { 581 state = 0; 582 } else if (c == ']') { 583 data.write(']'); 584 } else { 585 data.write("]]"); 586 data.write(c); 587 state = 42; 588 } 589 break; 590 default: 591 throw new SAXException ("Bad parser state: " + state); 592 } 593 return state; 594 } 595 596 public void parse() throws SAXException , IOException { 597 int state = 0; 598 docHandler.setDocumentLocator(new Locator(this)); 599 while (parseUntilEOF()) {} 600 docHandler.endDocument(); 601 } 602 603 public boolean parseUntilEOF() throws SAXException , IOException { 604 boolean ret = false; 605 int state = 0; 606 while (state >= 0) { 607 int c = read(); 608 if (c < 0) { 609 try { r.close(); } catch (Exception e) {} 610 ret = popInputSource(); 611 state = -1; 612 } else { 613 state = step(state, c); 614 } 615 } 616 return ret; 617 } 618 619 public int getLineNumber() { 620 return lineNumber; 621 } 622 623 public int getColumnNumber() { 624 return columnNumber; 625 } 626 627 void startElement(String name, AttributeList attributes) throws SAXException { 628 if (!docStarted) { 629 docStarted = true; 630 docHandler.startDocument(); 631 } 632 docHandler.startElement(tagName, attributes); 633 attributes.clear(); 634 } 635 636 } 637 | Popular Tags |