KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > xml > serialize > HTMLdtd


1 /*
2  * Copyright 1999-2002,2004 The Apache Software Foundation.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */

16
17
18 // Aug 21, 2000:
19 //   Fixed bug in isElement and made HTMLdtd public.
20 //   Contributed by Eric SCHAEFFER <eschaeffer@posterconseil.com>
21

22
23 package org.apache.xml.serialize;
24
25 import org.apache.xerces.dom.DOMMessageFormatter;
26
27 import java.io.InputStream JavaDoc;
28 import java.io.InputStreamReader JavaDoc;
29 import java.io.BufferedReader JavaDoc;
30 import java.util.Hashtable JavaDoc;
31 import java.util.Locale JavaDoc;
32
33
34 /**
35  * Utility class for accessing information specific to HTML documents.
36  * The HTML DTD is expressed as three utility function groups. Two methods
37  * allow for checking whether an element requires an open tag on printing
38  * ({@link #isEmptyTag}) or on parsing ({@link #isOptionalClosing}).
39  * <P>
40  * Two other methods translate character references from name to value and
41  * from value to name. A small entities resource is loaded into memory the
42  * first time any of these methods is called for fast and efficient access.
43  *
44  *
45  * @version $Revision: 1.18 $ $Date: 2004/02/24 23:34:03 $
46  * @author <a HREF="mailto:arkin@intalio.com">Assaf Arkin</a>
47  */

48 public final class HTMLdtd
49 {
50
51     /**
52      * Public identifier for HTML 4.01 (Strict) document type.
53      */

54     public static final String JavaDoc HTMLPublicId = "-//W3C//DTD HTML 4.01//EN";
55
56     /**
57      * System identifier for HTML 4.01 (Strict) document type.
58      */

59     public static final String JavaDoc HTMLSystemId =
60         "http://www.w3.org/TR/html4/strict.dtd";
61
62     /**
63      * Public identifier for XHTML 1.0 (Strict) document type.
64      */

65     public static final String JavaDoc XHTMLPublicId =
66         "-//W3C//DTD XHTML 1.0 Strict//EN";
67
68     /**
69      * System identifier for XHTML 1.0 (Strict) document type.
70      */

71     public static final String JavaDoc XHTMLSystemId =
72         "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
73
74     /**
75      * Table of reverse character reference mapping. Character codes are held
76      * as single-character strings, mapped to their reference name.
77      */

78     private static Hashtable JavaDoc _byChar;
79
80
81     /**
82      * Table of entity name to value mapping. Entities are held as strings,
83      * character references as <TT>Character</TT> objects.
84      */

85     private static Hashtable JavaDoc _byName;
86
87
88     private static Hashtable JavaDoc _boolAttrs;
89
90
91     /**
92      * Holds element definitions.
93      */

94     private static Hashtable JavaDoc _elemDefs;
95
96
97     /**
98      * Locates the HTML entities file that is loaded upon initialization.
99      * This file is a resource loaded with the default class loader.
100      */

101     private static final String JavaDoc ENTITIES_RESOURCE = "HTMLEntities.res";
102
103
104     /**
105      * Only opening tag should be printed.
106      */

107     private static final int ONLY_OPENING = 0x0001;
108
109     /**
110      * Element contains element content only.
111      */

112     private static final int ELEM_CONTENT = 0x0002;
113
114
115     /**
116      * Element preserve spaces.
117      */

118     private static final int PRESERVE = 0x0004;
119
120
121     /**
122      * Optional closing tag.
123      */

124     private static final int OPT_CLOSING = 0x0008;
125
126
127     /**
128      * Element is empty (also means only opening tag)
129      */

130     private static final int EMPTY = 0x0010 | ONLY_OPENING;
131
132
133     /**
134      * Allowed to appear in head.
135      */

136     private static final int ALLOWED_HEAD = 0x0020;
137
138
139     /**
140      * When opened, closes P.
141      */

142     private static final int CLOSE_P = 0x0040;
143
144
145     /**
146      * When opened, closes DD or DT.
147      */

148     private static final int CLOSE_DD_DT = 0x0080;
149
150
151     /**
152      * When opened, closes itself.
153      */

154     private static final int CLOSE_SELF = 0x0100;
155
156
157     /**
158      * When opened, closes another table section.
159      */

160     private static final int CLOSE_TABLE = 0x0200;
161
162
163     /**
164      * When opened, closes TH or TD.
165      */

166     private static final int CLOSE_TH_TD = 0x04000;
167
168
169     /**
170      * Returns true if element is declared to be empty. HTML elements are
171      * defines as empty in the DTD, not by the document syntax.
172      *
173      * @param tagName The element tag name (upper case)
174      * @return True if element is empty
175      */

176     public static boolean isEmptyTag( String JavaDoc tagName )
177     {
178         return isElement( tagName, EMPTY );
179     }
180
181
182     /**
183      * Returns true if element is declared to have element content.
184      * Whitespaces appearing inside element content will be ignored,
185      * other text will simply report an error.
186      *
187      * @param tagName The element tag name (upper case)
188      * @return True if element content
189      */

190     public static boolean isElementContent( String JavaDoc tagName )
191     {
192         return isElement( tagName, ELEM_CONTENT );
193     }
194
195
196     /**
197      * Returns true if element's textual contents preserves spaces.
198      * This only applies to PRE and TEXTAREA, all other HTML elements
199      * do not preserve space.
200      *
201      * @param tagName The element tag name (upper case)
202      * @return True if element's text content preserves spaces
203      */

204     public static boolean isPreserveSpace( String JavaDoc tagName )
205     {
206         return isElement( tagName, PRESERVE );
207     }
208
209
210     /**
211      * Returns true if element's closing tag is optional and need not
212      * exist. An error will not be reported for such elements if they
213      * are not closed. For example, <tt>LI</tt> is most often not closed.
214      *
215      * @param tagName The element tag name (upper case)
216      * @return True if closing tag implied
217      */

218     public static boolean isOptionalClosing( String JavaDoc tagName )
219     {
220         return isElement( tagName, OPT_CLOSING );
221     }
222
223
224     /**
225      * Returns true if element's closing tag is generally not printed.
226      * For example, <tt>LI</tt> should not print the closing tag.
227      *
228      * @param tagName The element tag name (upper case)
229      * @return True if only opening tag should be printed
230      */

231     public static boolean isOnlyOpening( String JavaDoc tagName )
232     {
233         return isElement( tagName, ONLY_OPENING );
234     }
235
236
237     /**
238      * Returns true if the opening of one element (<tt>tagName</tt>) implies
239      * the closing of another open element (<tt>openTag</tt>). For example,
240      * every opening <tt>LI</tt> will close the previously open <tt>LI</tt>,
241      * and every opening <tt>BODY</tt> will close the previously open <tt>HEAD</tt>.
242      *
243      * @param tagName The newly opened element
244      * @param openTag The already opened element
245      * @return True if closing tag closes opening tag
246      */

247     public static boolean isClosing( String JavaDoc tagName, String JavaDoc openTag )
248     {
249         // Several elements are defined as closing the HEAD
250
if ( openTag.equalsIgnoreCase( "HEAD" ) )
251             return ! isElement( tagName, ALLOWED_HEAD );
252         // P closes iteself
253
if ( openTag.equalsIgnoreCase( "P" ) )
254             return isElement( tagName, CLOSE_P );
255         // DT closes DD, DD closes DT
256
if ( openTag.equalsIgnoreCase( "DT" ) || openTag.equalsIgnoreCase( "DD" ) )
257             return isElement( tagName, CLOSE_DD_DT );
258         // LI and OPTION close themselves
259
if ( openTag.equalsIgnoreCase( "LI" ) || openTag.equalsIgnoreCase( "OPTION" ) )
260             return isElement( tagName, CLOSE_SELF );
261         // Each of these table sections closes all the others
262
if ( openTag.equalsIgnoreCase( "THEAD" ) || openTag.equalsIgnoreCase( "TFOOT" ) ||
263              openTag.equalsIgnoreCase( "TBODY" ) || openTag.equalsIgnoreCase( "TR" ) ||
264              openTag.equalsIgnoreCase( "COLGROUP" ) )
265             return isElement( tagName, CLOSE_TABLE );
266         // TD closes TH and TH closes TD
267
if ( openTag.equalsIgnoreCase( "TH" ) || openTag.equalsIgnoreCase( "TD" ) )
268             return isElement( tagName, CLOSE_TH_TD );
269         return false;
270     }
271
272
273     /**
274      * Returns true if the specified attribute it a URI and should be
275      * escaped appropriately. In HTML URIs are escaped differently
276      * than normal attributes.
277      *
278      * @param tagName The element's tag name
279      * @param attrName The attribute's name
280      */

281     public static boolean isURI( String JavaDoc tagName, String JavaDoc attrName )
282     {
283         // Stupid checks.
284
return ( attrName.equalsIgnoreCase( "href" ) || attrName.equalsIgnoreCase( "src" ) );
285     }
286
287
288     /**
289      * Returns true if the specified attribute is a boolean and should be
290      * printed without the value. This applies to attributes that are true
291      * if they exist, such as selected (OPTION/INPUT).
292      *
293      * @param tagName The element's tag name
294      * @param attrName The attribute's name
295      */

296     public static boolean isBoolean( String JavaDoc tagName, String JavaDoc attrName )
297     {
298         String JavaDoc[] attrNames;
299
300         attrNames = (String JavaDoc[]) _boolAttrs.get( tagName.toUpperCase(Locale.ENGLISH) );
301         if ( attrNames == null )
302             return false;
303         for ( int i = 0 ; i < attrNames.length ; ++i )
304             if ( attrNames[ i ].equalsIgnoreCase( attrName ) )
305                 return true;
306         return false;
307     }
308
309
310     /**
311      * Returns the value of an HTML character reference by its name. If the
312      * reference is not found or was not defined as a character reference,
313      * returns EOF (-1).
314      *
315      * @param name Name of character reference
316      * @return Character code or EOF (-1)
317      */

318     public static int charFromName( String JavaDoc name )
319     {
320         Object JavaDoc value;
321
322         initialize();
323         value = _byName.get( name );
324         if ( value != null && value instanceof Integer JavaDoc )
325             return ( (Integer JavaDoc) value ).intValue();
326         else
327             return -1;
328     }
329
330
331     /**
332      * Returns the name of an HTML character reference based on its character
333      * value. Only valid for entities defined from character references. If no
334      * such character value was defined, return null.
335      *
336      * @param value Character value of entity
337      * @return Entity's name or null
338      */

339     public static String JavaDoc fromChar(int value )
340     {
341        if (value > 0xffff)
342             return null;
343
344         String JavaDoc name;
345
346         initialize();
347         name = (String JavaDoc) _byChar.get( new Integer JavaDoc( value ) );
348         return name;
349     }
350
351
352     /**
353      * Initialize upon first access. Will load all the HTML character references
354      * into a list that is accessible by name or character value and is optimized
355      * for character substitution. This method may be called any number of times
356      * but will execute only once.
357      */

358     private static void initialize()
359     {
360         InputStream JavaDoc is = null;
361         BufferedReader JavaDoc reader = null;
362         int index;
363         String JavaDoc name;
364         String JavaDoc value;
365         int code;
366         String JavaDoc line;
367
368         // Make sure not to initialize twice.
369
if ( _byName != null )
370             return;
371         try {
372             _byName = new Hashtable JavaDoc();
373             _byChar = new Hashtable JavaDoc();
374             is = HTMLdtd.class.getResourceAsStream( ENTITIES_RESOURCE );
375             if ( is == null ) {
376                 throw new RuntimeException JavaDoc(
377                     DOMMessageFormatter.formatMessage(
378                     DOMMessageFormatter.SERIALIZER_DOMAIN,
379                     "ResourceNotFound", new Object JavaDoc[] {ENTITIES_RESOURCE}));
380             }
381             reader = new BufferedReader JavaDoc( new InputStreamReader JavaDoc( is, "ASCII" ) );
382             line = reader.readLine();
383             while ( line != null ) {
384                 if ( line.length() == 0 || line.charAt( 0 ) == '#' ) {
385                     line = reader.readLine();
386                     continue;
387                 }
388                 index = line.indexOf( ' ' );
389                 if ( index > 1 ) {
390                     name = line.substring( 0, index );
391                     ++index;
392                     if ( index < line.length() ) {
393                         value = line.substring( index );
394                         index = value.indexOf( ' ' );
395                         if ( index > 0 )
396                             value = value.substring( 0, index );
397                         code = Integer.parseInt( value );
398                                         defineEntity( name, (char) code );
399                     }
400                 }
401                 line = reader.readLine();
402             }
403             is.close();
404         } catch ( Exception JavaDoc except ) {
405             throw new RuntimeException JavaDoc(
406                 DOMMessageFormatter.formatMessage(
407                 DOMMessageFormatter.SERIALIZER_DOMAIN,
408                 "ResourceNotLoaded", new Object JavaDoc[] {ENTITIES_RESOURCE, except.toString()}));
409         } finally {
410             if ( is != null ) {
411                 try {
412                     is.close();
413                 } catch ( Exception JavaDoc except ) { }
414             }
415         }
416     }
417
418
419     /**
420      * Defines a new character reference. The reference's name and value are
421      * supplied. Nothing happens if the character reference is already defined.
422      * <P>
423      * Unlike internal entities, character references are a string to single
424      * character mapping. They are used to map non-ASCII characters both on
425      * parsing and printing, primarily for HTML documents. '&lt;amp;' is an
426      * example of a character reference.
427      *
428      * @param name The entity's name
429      * @param value The entity's value
430      */

431     private static void defineEntity( String JavaDoc name, char value )
432     {
433         if ( _byName.get( name ) == null ) {
434             _byName.put( name, new Integer JavaDoc( value ) );
435             _byChar.put( new Integer JavaDoc( value ), name );
436         }
437     }
438
439
440     private static void defineElement( String JavaDoc name, int flags )
441     {
442         _elemDefs.put( name, new Integer JavaDoc( flags ) );
443     }
444
445
446     private static void defineBoolean( String JavaDoc tagName, String JavaDoc attrName )
447     {
448         defineBoolean( tagName, new String JavaDoc[] { attrName } );
449     }
450
451
452     private static void defineBoolean( String JavaDoc tagName, String JavaDoc[] attrNames )
453     {
454         _boolAttrs.put( tagName, attrNames );
455     }
456
457
458     private static boolean isElement( String JavaDoc name, int flag )
459     {
460         Integer JavaDoc flags;
461
462         flags = (Integer JavaDoc) _elemDefs.get( name.toUpperCase(Locale.ENGLISH) );
463         if ( flags == null )
464             return false;
465         else
466             return ( ( flags.intValue() & flag ) == flag );
467     }
468
469
470     static
471     {
472         _elemDefs = new Hashtable JavaDoc();
473         defineElement( "ADDRESS", CLOSE_P );
474         defineElement( "AREA", EMPTY );
475         defineElement( "BASE", EMPTY | ALLOWED_HEAD );
476         defineElement( "BASEFONT", EMPTY );
477         defineElement( "BLOCKQUOTE", CLOSE_P );
478         defineElement( "BODY", OPT_CLOSING );
479         defineElement( "BR", EMPTY );
480         defineElement( "COL", EMPTY );
481         defineElement( "COLGROUP", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
482         defineElement( "DD", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT );
483         defineElement( "DIV", CLOSE_P );
484         defineElement( "DL", ELEM_CONTENT | CLOSE_P );
485         defineElement( "DT", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT );
486         defineElement( "FIELDSET", CLOSE_P );
487         defineElement( "FORM", CLOSE_P );
488         defineElement( "FRAME", EMPTY | OPT_CLOSING );
489         defineElement( "H1", CLOSE_P );
490         defineElement( "H2", CLOSE_P );
491         defineElement( "H3", CLOSE_P );
492         defineElement( "H4", CLOSE_P );
493         defineElement( "H5", CLOSE_P );
494         defineElement( "H6", CLOSE_P );
495         defineElement( "HEAD", ELEM_CONTENT | OPT_CLOSING );
496         defineElement( "HR", EMPTY | CLOSE_P );
497         defineElement( "HTML", ELEM_CONTENT | OPT_CLOSING );
498         defineElement( "IMG", EMPTY );
499         defineElement( "INPUT", EMPTY );
500         defineElement( "ISINDEX", EMPTY | ALLOWED_HEAD );
501         defineElement( "LI", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF );
502         defineElement( "LINK", EMPTY | ALLOWED_HEAD );
503         defineElement( "MAP", ALLOWED_HEAD );
504         defineElement( "META", EMPTY | ALLOWED_HEAD );
505         defineElement( "OL", ELEM_CONTENT | CLOSE_P );
506         defineElement( "OPTGROUP", ELEM_CONTENT );
507         defineElement( "OPTION", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF );
508         defineElement( "P", OPT_CLOSING | CLOSE_P | CLOSE_SELF );
509         defineElement( "PARAM", EMPTY );
510         defineElement( "PRE", PRESERVE | CLOSE_P );
511         defineElement( "SCRIPT", ALLOWED_HEAD | PRESERVE );
512         defineElement( "NOSCRIPT", ALLOWED_HEAD | PRESERVE );
513         defineElement( "SELECT", ELEM_CONTENT );
514         defineElement( "STYLE", ALLOWED_HEAD | PRESERVE );
515         defineElement( "TABLE", ELEM_CONTENT | CLOSE_P );
516         defineElement( "TBODY", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
517         defineElement( "TD", OPT_CLOSING | CLOSE_TH_TD );
518         defineElement( "TEXTAREA", PRESERVE );
519         defineElement( "TFOOT", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
520         defineElement( "TH", OPT_CLOSING | CLOSE_TH_TD );
521         defineElement( "THEAD", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
522         defineElement( "TITLE", ALLOWED_HEAD );
523         defineElement( "TR", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
524         defineElement( "UL", ELEM_CONTENT | CLOSE_P );
525
526         _boolAttrs = new Hashtable JavaDoc();
527         defineBoolean( "AREA", "href" );
528         defineBoolean( "BUTTON", "disabled" );
529         defineBoolean( "DIR", "compact" );
530         defineBoolean( "DL", "compact" );
531         defineBoolean( "FRAME", "noresize" );
532         defineBoolean( "HR", "noshade" );
533         defineBoolean( "IMAGE", "ismap" );
534         defineBoolean( "INPUT", new String JavaDoc[] { "defaultchecked", "checked", "readonly", "disabled" } );
535         defineBoolean( "LINK", "link" );
536         defineBoolean( "MENU", "compact" );
537         defineBoolean( "OBJECT", "declare" );
538         defineBoolean( "OL", "compact" );
539         defineBoolean( "OPTGROUP", "disabled" );
540         defineBoolean( "OPTION", new String JavaDoc[] { "default-selected", "selected", "disabled" } );
541         defineBoolean( "SCRIPT", "defer" );
542         defineBoolean( "SELECT", new String JavaDoc[] { "multiple", "disabled" } );
543         defineBoolean( "STYLE", "disabled" );
544         defineBoolean( "TD", "nowrap" );
545         defineBoolean( "TH", "nowrap" );
546         defineBoolean( "TEXTAREA", new String JavaDoc[] { "disabled", "readonly" } );
547         defineBoolean( "UL", "compact" );
548
549         initialize();
550     }
551
552
553
554 }
555
556
Popular Tags