1 28 29 package com.caucho.xml; 30 31 import com.caucho.util.CharBuffer; 32 import com.caucho.util.CharCursor; 33 import com.caucho.util.CharScanner; 34 import com.caucho.util.IntMap; 35 import com.caucho.util.StringCharCursor; 36 37 import org.w3c.dom.Element ; 38 39 import java.io.IOException ; 40 41 44 class HtmlPolicy extends Policy { 45 static final int DOCUMENT = 1; 46 static final int COMMENT = DOCUMENT + 1; 47 static final int TEXT = COMMENT + 1; 48 static final int JSP = TEXT + 1; 49 static final int WHITESPACE = JSP + 1; 50 51 static final int HTML = WHITESPACE + 1; 52 static final int HEAD = HTML + 1; 53 static final int TITLE = HEAD + 1; 54 static final int ISINDEX = TITLE + 1; 55 static final int BASE = ISINDEX + 1; 56 static final int SCRIPT = BASE + 1; 57 static final int STYLE = SCRIPT + 1; 58 static final int META = STYLE + 1; 59 static final int LINK = META + 1; 60 static final int OBJECT = LINK + 1; 61 62 static final int BODY = OBJECT + 1; 63 64 static final int BASEFONT = BODY + 1; 65 static final int BR = BASEFONT + 1; 66 static final int AREA = BR + 1; 67 static final int IMG = AREA + 1; 68 static final int PARAM = IMG + 1; 69 static final int HR = PARAM + 1; 70 static final int INPUT = HR + 1; 71 72 static final int P = INPUT + 1; 73 static final int DT = P + 1; 74 static final int DD = DT + 1; 75 static final int LI = DD + 1; 76 static final int OPTION = LI + 1; 77 78 static final int TABLE = OPTION + 1; 79 static final int CAPTION = TABLE + 1; 80 static final int THEAD = CAPTION + 1; 81 static final int TFOOT = THEAD + 1; 82 static final int COL = TFOOT + 1; 83 static final int COLGROUP = COL + 1; 84 static final int TBODY = COLGROUP + 1; 85 static final int TR = TBODY + 1; 86 static final int TD = TR + 1; 87 static final int TH = TD + 1; 88 89 static final int FRAME = TH + 1; 90 static final int FRAMESET = FRAME + 1; 91 92 static final int BLOCK = FRAMESET + 1; 93 static final int INLINE = BLOCK + 1; 94 95 static IntMap names; 96 static IntMap cbNames; 97 98 static QName htmlName = new QName(null, "html", null); 99 static QName headName = new QName(null, "head", null); 100 static QName bodyName = new QName(null, "body", null); 101 102 boolean toLower = true; 103 boolean isJsp = false; 104 boolean autoHtml = false; 105 boolean hasBody = false; 106 boolean autoHead = false; 107 108 CharBuffer cb = new CharBuffer(); 109 110 public void init() 111 { 112 toLower = true; 113 isJsp = false; 114 autoHtml = false; 115 hasBody = false; 116 autoHead = false; 117 } 118 119 122 public void setToLower(boolean toLower) 123 { 124 this.toLower = toLower; 125 } 126 127 130 public void setJsp(boolean isJsp) 131 { 132 this.isJsp = isJsp; 133 } 134 135 142 QName getName(CharBuffer tag) 143 { 144 if (! toLower) 145 return super.getName(tag); 146 147 cb.clear(); 148 cb.append(tag); 149 cb.toLowerCase(); 150 151 int name = cbNames.get(cb); 152 153 if (name >= 0) 154 return super.getName(cb); 155 else 156 return super.getName(tag); 157 } 158 159 QName getAttributeName(CharBuffer eltName, CharBuffer source) 160 { 161 if (! toLower) 162 return super.getName(source); 163 164 cb.clear(); 165 cb.append(eltName); 166 cb.toLowerCase(); 167 int name = cbNames.get(cb); 168 169 if (name < 0) 170 return super.getName(source); 171 else { 172 source.toLowerCase(); 173 return super.getName(source); 174 } 175 } 176 177 185 int openAction(XmlParser parser, QName node, QName next) 186 throws XmlParseException 187 { 188 String nodeName = node == null ? "#document" : node.getName(); 189 String nextName = next.getName(); 190 191 int nextCode = names.get(nextName); 192 193 switch (names.get(nodeName)) { 194 case DOCUMENT: 195 switch (nextCode) { 196 case HTML: 197 return PUSH; 198 199 case COMMENT: 200 return PUSH; 201 202 case HEAD: case TITLE: case ISINDEX: case BASE: case SCRIPT: 203 case STYLE: case META: case LINK: case OBJECT: 204 opt = htmlName; 205 return PUSH_OPT; 206 207 case WHITESPACE: 208 return IGNORE; 209 210 case JSP: 211 return PUSH; 212 213 default: 214 if (autoHtml) 215 return PUSH; 216 217 autoHtml = true; 218 opt = htmlName; 219 return PUSH_OPT; 220 } 221 222 case HTML: 223 switch (nextCode) { 224 case HTML: 225 return ERROR; 226 227 case HEAD: 228 case COMMENT: 229 case FRAMESET: 230 return PUSH; 231 232 case BODY: 233 hasBody = true; 234 return PUSH; 235 236 case TITLE: case ISINDEX: case BASE: case SCRIPT: 237 case STYLE: case META: case LINK: case OBJECT: 238 opt = headName; 239 autoHead = true; 240 return PUSH_OPT; 241 242 case WHITESPACE: 243 return PUSH; 244 245 case JSP: 246 return PUSH; 247 248 default: 249 if (hasBody) 250 return PUSH; 251 252 hasBody = true; 253 opt = bodyName; 254 return PUSH_OPT; 255 } 256 257 case HEAD: 258 switch (nextCode) { 259 case META: 260 return PUSH_EMPTY; 262 263 case LINK: case ISINDEX: case BASE: 264 return PUSH_EMPTY; 265 266 case SCRIPT: case STYLE: 267 return PUSH_VERBATIM; 268 269 case TITLE: 270 case OBJECT: 271 return PUSH; 272 273 case WHITESPACE: 274 return PUSH; 275 276 case JSP: 277 case TEXT: 278 if (autoHead) 279 return POP; 280 else 281 return PUSH; 282 283 default: 284 return POP; 285 } 286 287 case LI: 288 switch (nextCode) { 289 case LI: 290 return POP; 291 292 case BASEFONT: case BR: case AREA: case LINK: case IMG: case PARAM: 293 case HR: case INPUT: case COL: case FRAME: case ISINDEX: 294 case BASE: case META: 295 return PUSH_EMPTY; 296 297 case SCRIPT: case STYLE: 298 return PUSH_VERBATIM; 299 300 default: 301 return PUSH; 302 } 303 304 case OPTION: 305 switch (nextCode) { 306 case WHITESPACE: 307 case TEXT: 308 return PUSH; 309 310 default: 311 return POP; 312 } 313 314 case DD: 315 switch (nextCode) { 316 case DD: case DT: 317 return POP; 318 319 case BASEFONT: case BR: case AREA: case LINK: case IMG: case PARAM: 320 case HR: case INPUT: case COL: case FRAME: case ISINDEX: 321 case BASE: case META: 322 return PUSH_EMPTY; 323 324 case SCRIPT: case STYLE: 325 return PUSH_VERBATIM; 326 327 default: 328 return PUSH; 329 } 330 331 case THEAD: case TFOOT: case COLGROUP: 332 switch (nextCode) { 333 case THEAD: case TFOOT: case TBODY: case COLGROUP: case COL: 334 return POP; 335 336 case BASEFONT: case BR: case AREA: case LINK: case IMG: case PARAM: 337 case HR: case INPUT: case FRAME: case ISINDEX: 338 case BASE: case META: 339 return PUSH_EMPTY; 340 341 case SCRIPT: case STYLE: 342 return PUSH_VERBATIM; 343 344 default: 345 return PUSH; 346 } 347 348 case TR: 349 switch (nextCode) { 350 case THEAD: case TFOOT: case TBODY: case COLGROUP: case COL: case TR: 351 return POP; 352 353 case BASEFONT: case BR: case AREA: case LINK: case IMG: case PARAM: 354 case HR: case INPUT: case FRAME: case ISINDEX: 355 case BASE: case META: 356 return PUSH_EMPTY; 357 358 case TD: case TH: 359 return PUSH; 360 361 case SCRIPT: case STYLE: 362 return PUSH_VERBATIM; 363 364 default: 365 return PUSH; 366 } 367 368 case TD: case TH: 369 switch (nextCode) { 370 case THEAD: case TFOOT: case TBODY: case COLGROUP: case COL: case TR: 371 case TD: case TH: 372 return POP; 373 374 case BASEFONT: case BR: case AREA: case LINK: case IMG: case PARAM: 375 case HR: case INPUT: case FRAME: case ISINDEX: 376 case BASE: case META: 377 return PUSH_EMPTY; 378 379 case SCRIPT: case STYLE: 380 return PUSH_VERBATIM; 381 382 default: 383 return PUSH; 384 } 385 386 case P: case DT: 387 switch (nextCode) { 388 case BLOCK: case P: case TABLE: case CAPTION: case THEAD: 389 case TFOOT: case COLGROUP: case TBODY: case TR: case TD: 390 case TH: case DT: case LI: 391 return POP; 392 393 case BASEFONT: case BR: case AREA: case LINK: case IMG: case PARAM: 394 case HR: case INPUT: case COL: case FRAME: case ISINDEX: 395 case BASE: case META: 396 return PUSH_EMPTY; 397 398 case SCRIPT: case STYLE: 399 return PUSH_VERBATIM; 400 401 default: 402 return PUSH; 403 } 404 405 case TABLE: 406 switch (nextCode) { 407 case CAPTION: case THEAD: case TFOOT: case COL: case COLGROUP: 408 case TBODY: case TR: 409 return PUSH; 410 411 case SCRIPT: case STYLE: 412 return PUSH_VERBATIM; 413 414 default: 415 419 return PUSH; 420 } 421 422 default: 423 switch (nextCode) { 424 case BASEFONT: case BR: case AREA: case LINK: case IMG: case PARAM: 425 case HR: case INPUT: case COL: case FRAME: case ISINDEX: 426 case BASE: case META: 427 return PUSH_EMPTY; 428 429 case SCRIPT: case STYLE: 430 return PUSH_VERBATIM; 431 432 default: 433 return PUSH; 434 } 435 } 436 } 437 438 private static CharScanner charsetScanner = new CharScanner(" \t=;"); 439 440 private void checkMetaEncoding(Element elt) 441 { 442 String http = elt.getAttribute("http-equiv"); 443 String content = elt.getAttribute("content"); 444 if (http.equals("") || content.equals("") || 445 ! http.equalsIgnoreCase("content-type")) 446 return; 447 448 CharCursor cursor = new StringCharCursor(content); 449 charsetScanner.scan(cursor); 450 charsetScanner.skip(cursor); 451 CharBuffer buf = CharBuffer.allocate(); 452 while (cursor.current() != cursor.DONE) { 453 buf.clear(); 454 charsetScanner.scan(cursor, buf); 455 if (buf.toString().equalsIgnoreCase("charset")) { 456 charsetScanner.skip(cursor); 457 buf.clear(); 458 charsetScanner.scan(cursor, buf); 459 if (buf.length() > 0) { 460 try { 461 is.setEncoding(buf.close()); 462 } catch (IOException e) { 463 } 464 return; 465 } 466 } 467 } 468 } 469 470 int elementCloseAction(XmlParser parser, QName node, String tagEnd) 471 throws XmlParseException 472 { 473 String nodeName = node.getName(); 474 if (nodeName.equals(tagEnd)) 475 return POP; 476 477 if (nodeName == "#document" && tagEnd.equals("")) { 478 495 return POP; 496 } 497 switch (names.get(tagEnd)) { 498 case BASEFONT: case BR: case AREA: case LINK: case IMG: case PARAM: 499 case HR: case INPUT: case COL: case FRAME: case ISINDEX: 500 case BASE: case META: 501 String errorTagEnd; 502 if (tagEnd.equals("")) 503 errorTagEnd = L.l("end of file"); 504 else 505 errorTagEnd = "`<" + tagEnd + ">'"; 506 507 throw parser.error(L.l("{0} expects to be empty", 508 errorTagEnd)); 509 } 510 511 switch (names.get(nodeName)) { 512 case BODY: case P: 513 case DT: case DD: case LI: case OPTION: 514 case THEAD: case TFOOT: case TBODY: case COLGROUP: 515 case TR: case TH: case TD: 516 return POP_AND_LOOP; 517 518 case HTML: 519 case HEAD: 520 543 return POP_AND_LOOP; 544 545 default: 546 547 if (forgiving) { 548 556 return POP_AND_LOOP; 557 } 558 559 String errorTagEnd; 560 if (tagEnd.equals("")) 561 errorTagEnd = L.l("end of file"); 562 else 563 errorTagEnd = "`</" + tagEnd + ">'"; 564 565 String expect; 566 if (nodeName.equals("#document")) { 567 throw parser.error(L.l("expected {0} at {1}", 568 L.l("end of document"), errorTagEnd)); 569 } 570 else 571 expect = "`</" + nodeName + ">'"; 572 573 throw parser.error(L.l("expected {0} at {1} (open at {2})", 574 expect, errorTagEnd, 575 "" + parser.getNodeLine())); 576 } 577 } 578 579 private static void addName(String name, int code) 580 { 581 names.put(name, code); 582 cbNames.put(new CharBuffer(name), code); 583 584 String upper = name.toUpperCase(); 585 names.put(upper, code); 586 cbNames.put(new CharBuffer(upper), code); 587 } 588 589 static { 590 names = new IntMap(); 591 cbNames = new IntMap(); 592 593 addName("#document", DOCUMENT); 594 addName("#comment", COMMENT); 595 addName("#text", TEXT); 596 addName("#jsp", JSP); 597 addName("#whitespace", WHITESPACE); 598 addName("html", HTML); 599 600 addName("head", HEAD); 601 addName("title", TITLE); 602 addName("isindex", ISINDEX); 603 addName("base", BASE); 604 addName("script", SCRIPT); 605 addName("style", STYLE); 606 addName("meta", META); 607 addName("link", LINK); 608 addName("object", OBJECT); 609 610 addName("body", BODY); 611 612 addName("basefont", BASEFONT); 613 addName("br", BR); 614 addName("area", AREA); 615 addName("link", LINK); 616 addName("img", IMG); 617 addName("param", PARAM); 618 addName("hr", HR); 619 addName("input", INPUT); 620 addName("frame", FRAME); 621 622 addName("p", P); 623 addName("dt", DT); 624 addName("dd", DD); 625 addName("li", LI); 626 addName("option", OPTION); 627 628 addName("table", TABLE); 629 addName("caption", CAPTION); 630 addName("thead", THEAD); 631 addName("tfoot", TFOOT); 632 addName("col", COL); 633 addName("colgroup", COLGROUP); 634 addName("tbody", TBODY); 635 addName("tr", TR); 636 addName("th", TH); 637 addName("td", TD); 638 639 addName("h1", BLOCK); 640 addName("h2", BLOCK); 641 addName("h3", BLOCK); 642 addName("h4", BLOCK); 643 addName("h5", BLOCK); 644 addName("h6", BLOCK); 645 addName("ul", BLOCK); 646 addName("ol", BLOCK); 647 addName("dir", BLOCK); 648 addName("menu", BLOCK); 649 addName("pre", BLOCK); 650 addName("dl", BLOCK); 651 addName("div", BLOCK); 652 addName("center", BLOCK); 653 addName("noscript", BLOCK); 654 addName("noframes", BLOCK); 655 addName("blockquote", BLOCK); 656 addName("form", BLOCK); 657 addName("fieldset", BLOCK); 658 addName("address", BLOCK); 659 660 addName("tt", INLINE); 661 addName("i", INLINE); 662 addName("b", INLINE); 663 addName("u", INLINE); 664 addName("s", INLINE); 665 addName("strike", INLINE); 666 addName("big", INLINE); 667 addName("small", INLINE); 668 669 addName("em", INLINE); 670 addName("strong", INLINE); 671 addName("dfn", INLINE); 672 addName("code", INLINE); 673 addName("samp", INLINE); 674 addName("kbd", INLINE); 675 addName("var", INLINE); 676 addName("cite", INLINE); 677 addName("abbr", INLINE); 678 addName("acronym", INLINE); 679 addName("font", INLINE); 680 addName("iframe", INLINE); 681 addName("applet", INLINE); 682 addName("ins", INLINE); 683 addName("del", INLINE); 684 685 addName("a", INLINE); 686 addName("map", INLINE); 687 addName("q", INLINE); 688 addName("sub", INLINE); 689 addName("sup", INLINE); 690 addName("span", INLINE); 691 addName("bdo", INLINE); 692 693 addName("select", INLINE); 694 addName("textarea", INLINE); 695 addName("label", INLINE); 696 addName("optgroup", INLINE); 697 addName("button", INLINE); 698 addName("legend", INLINE); 699 addName("frameset", FRAMESET); 700 701 } 703 } 704 | Popular Tags |