KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > caucho > xml > HtmlPolicy


1 /*
2  * Copyright (c) 1998-2006 Caucho Technology -- all rights reserved
3  *
4  * This file is part of Resin(R) Open Source
5  *
6  * Each copy or derived work must preserve the copyright notice and this
7  * notice unmodified.
8  *
9  * Resin Open Source is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * Resin Open Source is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or any warranty
17  * of NON-INFRINGEMENT. See the GNU General Public License for more
18  * details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with Resin Open Source; if not, write to the
22  * Free SoftwareFoundation, Inc.
23  * 59 Temple Place, Suite 330
24  * Boston, MA 02111-1307 USA
25  *
26  * @author Scott Ferguson
27  */

28
29 package com.caucho.xml;
30
import com.caucho.util.CharBuffer;
import com.caucho.util.CharCursor;
import com.caucho.util.CharScanner;
import com.caucho.util.IntMap;
import com.caucho.util.StringCharCursor;

import org.w3c.dom.Element;

import java.io.IOException;
import java.util.Locale;
40
41 /**
42  * Policy for parsing an HTML file.
43  */

44 class HtmlPolicy extends Policy {
45   static final int DOCUMENT = 1;
46   static final int COMMENT = DOCUMENT + 1;
47   static final int TEXT = COMMENT + 1;
48   static final int JSP = TEXT + 1;
49   static final int WHITESPACE = JSP + 1;
50
51   static final int HTML = WHITESPACE + 1;
52   static final int HEAD = HTML + 1;
53   static final int TITLE = HEAD + 1;
54   static final int ISINDEX = TITLE + 1;
55   static final int BASE = ISINDEX + 1;
56   static final int SCRIPT = BASE + 1;
57   static final int STYLE = SCRIPT + 1;
58   static final int META = STYLE + 1;
59   static final int LINK = META + 1;
60   static final int OBJECT = LINK + 1;
61
62   static final int BODY = OBJECT + 1;
63
64   static final int BASEFONT = BODY + 1;
65   static final int BR = BASEFONT + 1;
66   static final int AREA = BR + 1;
67   static final int IMG = AREA + 1;
68   static final int PARAM = IMG + 1;
69   static final int HR = PARAM + 1;
70   static final int INPUT = HR + 1;
71
72   static final int P = INPUT + 1;
73   static final int DT = P + 1;
74   static final int DD = DT + 1;
75   static final int LI = DD + 1;
76   static final int OPTION = LI + 1;
77
78   static final int TABLE = OPTION + 1;
79   static final int CAPTION = TABLE + 1;
80   static final int THEAD = CAPTION + 1;
81   static final int TFOOT = THEAD + 1;
82   static final int COL = TFOOT + 1;
83   static final int COLGROUP = COL + 1;
84   static final int TBODY = COLGROUP + 1;
85   static final int TR = TBODY + 1;
86   static final int TD = TR + 1;
87   static final int TH = TD + 1;
88
89   static final int FRAME = TH + 1;
90   static final int FRAMESET = FRAME + 1;
91
92   static final int BLOCK = FRAMESET + 1;
93   static final int INLINE = BLOCK + 1;
94
95   static IntMap names;
96   static IntMap cbNames;
97   
98   static QName htmlName = new QName(null, "html", null);
99   static QName headName = new QName(null, "head", null);
100   static QName bodyName = new QName(null, "body", null);
101
102   boolean toLower = true;
103   boolean isJsp = false;
104   boolean autoHtml = false;
105   boolean hasBody = false;
106   boolean autoHead = false;
107   
108   CharBuffer cb = new CharBuffer();
109
110   public void init()
111   {
112     toLower = true;
113     isJsp = false;
114     autoHtml = false;
115     hasBody = false;
116     autoHead = false;
117   }
118
119   /**
120    * When true, HTML parsing normalizes HTML tags to lower case.
121    */

122   public void setToLower(boolean toLower)
123   {
124     this.toLower = toLower;
125   }
126
127   /**
128    * When true, treat text before HTML specially.
129    */

130   public void setJsp(boolean isJsp)
131   {
132     this.isJsp = isJsp;
133   }
134
135   /**
136    * Return the normalized name.
137    *
138    * @param tag the raw name in the XML file.
139    *
140    * @return the normalized name.
141    */

142   QName getName(CharBuffer tag)
143   {
144     if (! toLower)
145       return super.getName(tag);
146     
147     cb.clear();
148     cb.append(tag);
149     cb.toLowerCase();
150
151     int name = cbNames.get(cb);
152
153     if (name >= 0)
154       return super.getName(cb);
155     else
156       return super.getName(tag);
157   }
158
159   QName getAttributeName(CharBuffer eltName, CharBuffer source)
160   {
161     if (! toLower)
162       return super.getName(source);
163     
164     cb.clear();
165     cb.append(eltName);
166     cb.toLowerCase();
167     int name = cbNames.get(cb);
168
169     if (name < 0)
170       return super.getName(source);
171     else {
172       source.toLowerCase();
173       return super.getName(source);
174     }
175   }
176
177   /**
178    * Returns the appropriate action when opening a HTML tag.
179    *
180    * @param parser the XML parser
181    * @param node the parent node
182    * @param next the next child
183    * @return the action code
184    */

185   int openAction(XmlParser parser, QName node, QName next)
186     throws XmlParseException
187   {
188     String JavaDoc nodeName = node == null ? "#document" : node.getName();
189     String JavaDoc nextName = next.getName();
190
191     int nextCode = names.get(nextName);
192
193     switch (names.get(nodeName)) {
194     case DOCUMENT:
195       switch (nextCode) {
196       case HTML:
197     return PUSH;
198
199       case COMMENT:
200         return PUSH;
201
202       case HEAD: case TITLE: case ISINDEX: case BASE: case SCRIPT:
203       case STYLE: case META: case LINK: case OBJECT:
204     opt = htmlName;
205     return PUSH_OPT;
206
207       case WHITESPACE:
208         return IGNORE;
209
210       case JSP:
211         return PUSH;
212
213       default:
214         if (autoHtml)
215           return PUSH;
216         
217         autoHtml = true;
218     opt = htmlName;
219     return PUSH_OPT;
220       }
221
222     case HTML:
223       switch (nextCode) {
224       case HTML:
225     return ERROR;
226
227       case HEAD:
228       case COMMENT:
229       case FRAMESET:
230     return PUSH;
231         
232       case BODY:
233         hasBody = true;
234     return PUSH;
235
236       case TITLE: case ISINDEX: case BASE: case SCRIPT:
237       case STYLE: case META: case LINK: case OBJECT:
238     opt = headName;
239         autoHead = true;
240     return PUSH_OPT;
241
242       case WHITESPACE:
243         return PUSH;
244
245       case JSP:
246         return PUSH;
247
248       default:
249         if (hasBody)
250           return PUSH;
251         
252         hasBody = true;
253     opt = bodyName;
254     return PUSH_OPT;
255       }
256
257     case HEAD:
258       switch (nextCode) {
259       case META:
260     // checkMetaEncoding((Element) next);
261
return PUSH_EMPTY;
262
263       case LINK: case ISINDEX: case BASE:
264     return PUSH_EMPTY;
265         
266       case SCRIPT: case STYLE:
267         return PUSH_VERBATIM;
268         
269       case TITLE:
270       case OBJECT:
271     return PUSH;
272
273       case WHITESPACE:
274         return PUSH;
275         
276       case JSP:
277       case TEXT:
278         if (autoHead)
279           return POP;
280         else
281           return PUSH;
282
283       default:
284     return POP;
285       }
286
287     case LI:
288       switch (nextCode) {
289       case LI:
290     return POP;
291
292       case BASEFONT: case BR: case AREA: case LINK: case IMG: case PARAM:
293       case HR: case INPUT: case COL: case FRAME: case ISINDEX:
294       case BASE: case META:
295     return PUSH_EMPTY;
296
297       case SCRIPT: case STYLE:
298         return PUSH_VERBATIM;
299
300       default:
301     return PUSH;
302       }
303
304     case OPTION:
305       switch (nextCode) {
306       case WHITESPACE:
307       case TEXT:
308         return PUSH;
309
310       default:
311     return POP;
312       }
313
314     case DD:
315       switch (nextCode) {
316       case DD: case DT:
317     return POP;
318
319       case BASEFONT: case BR: case AREA: case LINK: case IMG: case PARAM:
320       case HR: case INPUT: case COL: case FRAME: case ISINDEX:
321       case BASE: case META:
322     return PUSH_EMPTY;
323
324       case SCRIPT: case STYLE:
325         return PUSH_VERBATIM;
326
327       default:
328     return PUSH;
329       }
330
331     case THEAD: case TFOOT: case COLGROUP:
332       switch (nextCode) {
333       case THEAD: case TFOOT: case TBODY: case COLGROUP: case COL:
334     return POP;
335
336       case BASEFONT: case BR: case AREA: case LINK: case IMG: case PARAM:
337       case HR: case INPUT: case FRAME: case ISINDEX:
338       case BASE: case META:
339     return PUSH_EMPTY;
340
341       case SCRIPT: case STYLE:
342         return PUSH_VERBATIM;
343
344       default:
345     return PUSH;
346       }
347
348     case TR:
349       switch (nextCode) {
350       case THEAD: case TFOOT: case TBODY: case COLGROUP: case COL: case TR:
351     return POP;
352
353       case BASEFONT: case BR: case AREA: case LINK: case IMG: case PARAM:
354       case HR: case INPUT: case FRAME: case ISINDEX:
355       case BASE: case META:
356     return PUSH_EMPTY;
357
358       case TD: case TH:
359         return PUSH;
360
361       case SCRIPT: case STYLE:
362         return PUSH_VERBATIM;
363
364       default:
365         return PUSH;
366       }
367
368     case TD: case TH:
369       switch (nextCode) {
370       case THEAD: case TFOOT: case TBODY: case COLGROUP: case COL: case TR:
371       case TD: case TH:
372     return POP;
373
374       case BASEFONT: case BR: case AREA: case LINK: case IMG: case PARAM:
375       case HR: case INPUT: case FRAME: case ISINDEX:
376       case BASE: case META:
377     return PUSH_EMPTY;
378
379       case SCRIPT: case STYLE:
380         return PUSH_VERBATIM;
381
382       default:
383     return PUSH;
384       }
385
386     case P: case DT:
387       switch (nextCode) {
388       case BLOCK: case P: case TABLE: case CAPTION: case THEAD:
389       case TFOOT: case COLGROUP: case TBODY: case TR: case TD:
390       case TH: case DT: case LI:
391     return POP;
392
393       case BASEFONT: case BR: case AREA: case LINK: case IMG: case PARAM:
394       case HR: case INPUT: case COL: case FRAME: case ISINDEX:
395       case BASE: case META:
396     return PUSH_EMPTY;
397
398       case SCRIPT: case STYLE:
399         return PUSH_VERBATIM;
400
401       default:
402     return PUSH;
403       }
404
405     case TABLE:
406       switch (nextCode) {
407       case CAPTION: case THEAD: case TFOOT: case COL: case COLGROUP:
408       case TBODY: case TR:
409     return PUSH;
410
411       case SCRIPT: case STYLE:
412         return PUSH_VERBATIM;
413
414       default:
415         /*
416     opt = "tr";
417     return PUSH_OPT;
418         */

419         return PUSH;
420       }
421
422     default:
423       switch (nextCode) {
424       case BASEFONT: case BR: case AREA: case LINK: case IMG: case PARAM:
425       case HR: case INPUT: case COL: case FRAME: case ISINDEX:
426       case BASE: case META:
427     return PUSH_EMPTY;
428
429       case SCRIPT: case STYLE:
430         return PUSH_VERBATIM;
431
432       default:
433     return PUSH;
434       }
435     }
436   }
437
438   private static CharScanner charsetScanner = new CharScanner(" \t=;");
439
440   private void checkMetaEncoding(Element JavaDoc elt)
441   {
442     String JavaDoc http = elt.getAttribute("http-equiv");
443     String JavaDoc content = elt.getAttribute("content");
444     if (http.equals("") || content.equals("") ||
445     ! http.equalsIgnoreCase("content-type"))
446       return;
447
448     CharCursor cursor = new StringCharCursor(content);
449     charsetScanner.scan(cursor);
450     charsetScanner.skip(cursor);
451     CharBuffer buf = CharBuffer.allocate();
452     while (cursor.current() != cursor.DONE) {
453       buf.clear();
454       charsetScanner.scan(cursor, buf);
455       if (buf.toString().equalsIgnoreCase("charset")) {
456     charsetScanner.skip(cursor);
457     buf.clear();
458     charsetScanner.scan(cursor, buf);
459     if (buf.length() > 0) {
460       try {
461         is.setEncoding(buf.close());
462       } catch (IOException JavaDoc e) {
463       }
464       return;
465     }
466       }
467     }
468   }
469
470   int elementCloseAction(XmlParser parser, QName node, String JavaDoc tagEnd)
471     throws XmlParseException
472   {
473     String JavaDoc nodeName = node.getName();
474     if (nodeName.equals(tagEnd))
475       return POP;
476
477     if (nodeName == "#document" && tagEnd.equals("")) {
478       /*
479       Document doc = (Document) node;
480
481       // If JSP, move any text into the body element
482       if (isJsp && doc.getDocumentElement() == null &&
483           node.getFirstChild() instanceof Text) {
484         Element html = doc.createElement("html");
485         doc.appendChild(html);
486         Element body = doc.createElement("body");
487         html.appendChild(body);
488         Node child;
489         while ((child = doc.getFirstChild()) instanceof Text ||
490         child instanceof Comment) {
491           body.appendChild(child);
492         }
493       }
494       */

495       return POP;
496     }
497     switch (names.get(tagEnd)) {
498     case BASEFONT: case BR: case AREA: case LINK: case IMG: case PARAM:
499     case HR: case INPUT: case COL: case FRAME: case ISINDEX:
500     case BASE: case META:
501       String JavaDoc errorTagEnd;
502       if (tagEnd.equals(""))
503         errorTagEnd = L.l("end of file");
504       else
505         errorTagEnd = "`<" + tagEnd + ">'";
506
507       throw parser.error(L.l("{0} expects to be empty",
508                              errorTagEnd));
509     }
510
511     switch (names.get(nodeName)) {
512     case BODY: case P:
513     case DT: case DD: case LI: case OPTION:
514     case THEAD: case TFOOT: case TBODY: case COLGROUP:
515     case TR: case TH: case TD:
516       return POP_AND_LOOP;
517
518     case HTML:
519     case HEAD:
520       // If JSP and missing a body, move any text into the body element
521
/*
522       if (isJsp && node.getLastChild() instanceof Text) {
523         Node child;
524
525         for (child = node.getLastChild();
526              child != null;
527              child = child.getPreviousSibling()) {
528           if (child.getNodeName().equals("body"))
529             return POP_AND_LOOP;
530         }
531
532         Document doc = node.getOwnerDocument();
533         Element body = doc.createElement("body");
534         
535         while ((child = node.getLastChild()) instanceof Text ||
536                child instanceof Comment) {
537           body.insertBefore(child, body.getFirstChild());
538         }
539         
540         doc.getDocumentElement().appendChild(body);
541       }
542       */

543       return POP_AND_LOOP;
544
545     default:
546
547       if (forgiving) {
548         /*
549     Node parent = node;
550     for (; parent != null; parent = parent.getParentNode()) {
551       if (parent.getNodeName().equals(tagEnd))
552         return POP_AND_LOOP;
553     }
554     return IGNORE;
555         */

556         return POP_AND_LOOP;
557       }
558       
559       String JavaDoc errorTagEnd;
560       if (tagEnd.equals(""))
561         errorTagEnd = L.l("end of file");
562       else
563         errorTagEnd = "`</" + tagEnd + ">'";
564
565       String JavaDoc expect;
566       if (nodeName.equals("#document")) {
567         throw parser.error(L.l("expected {0} at {1}",
568                                L.l("end of document"), errorTagEnd));
569       }
570       else
571     expect = "`</" + nodeName + ">'";
572
573       throw parser.error(L.l("expected {0} at {1} (open at {2})",
574                              expect, errorTagEnd,
575                              "" + parser.getNodeLine()));
576     }
577   }
578
579   private static void addName(String JavaDoc name, int code)
580   {
581     names.put(name, code);
582     cbNames.put(new CharBuffer(name), code);
583
584     String JavaDoc upper = name.toUpperCase();
585     names.put(upper, code);
586     cbNames.put(new CharBuffer(upper), code);
587   }
588
589   static {
590     names = new IntMap();
591     cbNames = new IntMap();
592     
593     addName("#document", DOCUMENT);
594     addName("#comment", COMMENT);
595     addName("#text", TEXT);
596     addName("#jsp", JSP);
597     addName("#whitespace", WHITESPACE);
598     addName("html", HTML);
599
600     addName("head", HEAD);
601     addName("title", TITLE);
602     addName("isindex", ISINDEX);
603     addName("base", BASE);
604     addName("script", SCRIPT);
605     addName("style", STYLE);
606     addName("meta", META);
607     addName("link", LINK);
608     addName("object", OBJECT);
609
610     addName("body", BODY);
611
612     addName("basefont", BASEFONT);
613     addName("br", BR);
614     addName("area", AREA);
615     addName("link", LINK);
616     addName("img", IMG);
617     addName("param", PARAM);
618     addName("hr", HR);
619     addName("input", INPUT);
620     addName("frame", FRAME);
621
622     addName("p", P);
623     addName("dt", DT);
624     addName("dd", DD);
625     addName("li", LI);
626     addName("option", OPTION);
627
628     addName("table", TABLE);
629     addName("caption", CAPTION);
630     addName("thead", THEAD);
631     addName("tfoot", TFOOT);
632     addName("col", COL);
633     addName("colgroup", COLGROUP);
634     addName("tbody", TBODY);
635     addName("tr", TR);
636     addName("th", TH);
637     addName("td", TD);
638
639     addName("h1", BLOCK);
640     addName("h2", BLOCK);
641     addName("h3", BLOCK);
642     addName("h4", BLOCK);
643     addName("h5", BLOCK);
644     addName("h6", BLOCK);
645     addName("ul", BLOCK);
646     addName("ol", BLOCK);
647     addName("dir", BLOCK);
648     addName("menu", BLOCK);
649     addName("pre", BLOCK);
650     addName("dl", BLOCK);
651     addName("div", BLOCK);
652     addName("center", BLOCK);
653     addName("noscript", BLOCK);
654     addName("noframes", BLOCK);
655     addName("blockquote", BLOCK);
656     addName("form", BLOCK);
657     addName("fieldset", BLOCK);
658     addName("address", BLOCK);
659
660     addName("tt", INLINE);
661     addName("i", INLINE);
662     addName("b", INLINE);
663     addName("u", INLINE);
664     addName("s", INLINE);
665     addName("strike", INLINE);
666     addName("big", INLINE);
667     addName("small", INLINE);
668
669     addName("em", INLINE);
670     addName("strong", INLINE);
671     addName("dfn", INLINE);
672     addName("code", INLINE);
673     addName("samp", INLINE);
674     addName("kbd", INLINE);
675     addName("var", INLINE);
676     addName("cite", INLINE);
677     addName("abbr", INLINE);
678     addName("acronym", INLINE);
679     addName("font", INLINE);
680     addName("iframe", INLINE);
681     addName("applet", INLINE);
682     addName("ins", INLINE);
683     addName("del", INLINE);
684
685     addName("a", INLINE);
686     addName("map", INLINE);
687     addName("q", INLINE);
688     addName("sub", INLINE);
689     addName("sup", INLINE);
690     addName("span", INLINE);
691     addName("bdo", INLINE);
692
693     addName("select", INLINE);
694     addName("textarea", INLINE);
695     addName("label", INLINE);
696     addName("optgroup", INLINE);
697     addName("button", INLINE);
698     addName("legend", INLINE);
699     addName("frameset", FRAMESET);
700
701     // CDATA -- STYLE, SCRIPT
702
}
703 }
704
Popular Tags