KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > xml > serializer > CharInfo


/*
 * Copyright 1999-2004 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * $Id: CharInfo.java,v 1.11 2004/02/23 10:29:37 aruny Exp $
 */

package org.apache.xml.serializer;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.PropertyResourceBundle;
import java.util.ResourceBundle;

import javax.xml.transform.TransformerException;

import org.apache.xml.res.XMLErrorResources;
import org.apache.xml.res.XMLMessages;
import org.apache.xml.utils.CharKey;
import org.apache.xml.utils.SystemIDResolver;
import org.apache.xml.utils.WrappedRuntimeException;
38
39 /**
40  * This class provides services that tell if a character should have
41  * special treatement, such as entity reference substitution or normalization
42  * of a newline character. It also provides character to entity reference
43  * lookup.
44  *
45  * DEVELOPERS: See Known Issue in the constructor.
46  *
47  * @xsl.usage internal
48  */

49 class CharInfo
50 {
51     /** Lookup table for characters to entity references. */
52     private Hashtable JavaDoc m_charToEntityRef = new Hashtable JavaDoc();
53
54     /**
55      * The name of the HTML entities file.
56      * If specified, the file will be resource loaded with the default class loader.
57      */

58     public static String JavaDoc HTML_ENTITIES_RESOURCE = "org.apache.xml.serializer.HTMLEntities";
59
60     /**
61      * The name of the XML entities file.
62      * If specified, the file will be resource loaded with the default class loader.
63      */

64     public static String JavaDoc XML_ENTITIES_RESOURCE = "org.apache.xml.serializer.XMLEntities";
65
66     /** The horizontal tab character, which the parser should always normalize. */
67     public static final char S_HORIZONAL_TAB = 0x09;
68
69     /** The linefeed character, which the parser should always normalize. */
70     public static final char S_LINEFEED = 0x0A;
71
72     /** The carriage return character, which the parser should always normalize. */
73     public static char S_CARRIAGERETURN = 0x0D;
74     
75     /** This flag is an optimization for HTML entities. It false if entities
76      * other than quot (34), amp (38), lt (60) and gt (62) are defined
77      * in the range 0 to 127.
78      * @xsl.usage internal
79      */

80     final boolean onlyQuotAmpLtGt;
81     
82     /** Copy the first 0,1 ... ASCII_MAX values into an array */
83     private static final int ASCII_MAX = 128;
84     
85     /** Array of values is faster access than a set of bits
86      * to quickly check ASCII characters in attribute values.
87      */

88     private boolean[] isSpecialAttrASCII = new boolean[ASCII_MAX];
89     
90     /** Array of values is faster access than a set of bits
91      * to quickly check ASCII characters in text nodes.
92      */

93     private boolean[] isSpecialTextASCII = new boolean[ASCII_MAX];
94
95     private boolean[] isCleanTextASCII = new boolean[ASCII_MAX];
96
97     /** An array of bits to record if the character is in the set.
98      * Although information in this array is complete, the
99      * isSpecialAttrASCII array is used first because access to its values
100      * is common and faster.
101      */

102     private int array_of_bits[] = createEmptySetOfIntegers(65535);
103      
104     
105     // 5 for 32 bit words, 6 for 64 bit words ...
106
/*
107      * This constant is used to shift an integer to quickly
108      * calculate which element its bit is stored in.
109      * 5 for 32 bit words (int) , 6 for 64 bit words (long)
110      */

111     private static final int SHIFT_PER_WORD = 5;
112     
113     /*
114      * A mask to get the low order bits which are used to
115      * calculate the value of the bit within a given word,
116      * that will represent the presence of the integer in the
117      * set.
118      *
119      * 0x1F for 32 bit words (int),
120      * or 0x3F for 64 bit words (long)
121      */

122     private static final int LOW_ORDER_BITMASK = 0x1f;
123     
124     /*
125      * This is used for optimizing the lookup of bits representing
126      * the integers in the set. It is the index of the first element
127      * in the array array_of_bits[] that is not used.
128      */

129     private int firstWordNotUsed;
130
131
132     /**
133      * Constructor that reads in a resource file that describes the mapping of
134      * characters to entity references.
135      * This constructor is private, just to force the use
136      * of the getCharInfo(entitiesResource) factory
137      *
138      * Resource files must be encoded in UTF-8 and can either be properties
139      * files with a .properties extension assumed. Alternatively, they can
140      * have the following form, with no particular extension assumed:
141      *
142      * <pre>
143      * # First char # is a comment
144      * Entity numericValue
145      * quot 34
146      * amp 38
147      * </pre>
148      *
149      * @param entitiesResource Name of properties or resource file that should
150      * be loaded, which describes that mapping of characters to entity
151      * references.
152      */

153     private CharInfo(String JavaDoc entitiesResource, String JavaDoc method)
154     {
155         this(entitiesResource, method, false);
156     }
157
158     private CharInfo(String JavaDoc entitiesResource, String JavaDoc method, boolean internal)
159     {
160         ResourceBundle JavaDoc entities = null;
161         boolean noExtraEntities = true;
162
163         // Make various attempts to interpret the parameter as a properties
164
// file or resource file, as follows:
165
//
166
// 1) attempt to load .properties file using ResourceBundle
167
// 2) try using the class loader to find the specified file a resource
168
// file
169
// 3) try treating the resource a URI
170

171         if (internal) {
172             try {
173                 // Load entity property files by using PropertyResourceBundle,
174
// cause of security issure for applets
175
entities = PropertyResourceBundle.getBundle(entitiesResource);
176             } catch (Exception JavaDoc e) {}
177         }
178
179         if (entities != null) {
180             Enumeration JavaDoc keys = entities.getKeys();
181             while (keys.hasMoreElements()){
182                 String JavaDoc name = (String JavaDoc) keys.nextElement();
183                 String JavaDoc value = entities.getString(name);
184                 int code = Integer.parseInt(value);
185                 defineEntity(name, (char) code);
186                 if (extraEntity(code))
187                     noExtraEntities = false;
188             }
189             set(S_LINEFEED);
190             set(S_CARRIAGERETURN);
191         } else {
192             InputStream JavaDoc is = null;
193
194             // Load user specified resource file by using URL loading, it
195
// requires a valid URI as parameter
196
try {
197                 if (internal) {
198                     is = CharInfo.class.getResourceAsStream(entitiesResource);
199                 } else {
200                     ClassLoader JavaDoc cl = ObjectFactory.findClassLoader();
201                     if (cl == null) {
202                         is = ClassLoader.getSystemResourceAsStream(entitiesResource);
203                     } else {
204                         is = cl.getResourceAsStream(entitiesResource);
205                     }
206
207                     if (is == null) {
208                         try {
209                             URL JavaDoc url = new URL JavaDoc(entitiesResource);
210                             is = url.openStream();
211                         } catch (Exception JavaDoc e) {}
212                     }
213                 }
214
215                 if (is == null) {
216                     throw new RuntimeException JavaDoc(
217                         XMLMessages.createXMLMessage(
218                             XMLErrorResources.ER_RESOURCE_COULD_NOT_FIND,
219                             new Object JavaDoc[] {entitiesResource, entitiesResource}));
220                 }
221
222                 // Fix Bugzilla#4000: force reading in UTF-8
223
// This creates the de facto standard that Xalan's resource
224
// files must be encoded in UTF-8. This should work in all
225
// JVMs.
226
//
227
// %REVIEW% KNOWN ISSUE: IT FAILS IN MICROSOFT VJ++, which
228
// didn't implement the UTF-8 encoding. Theoretically, we should
229
// simply let it fail in that case, since the JVM is obviously
230
// broken if it doesn't support such a basic standard. But
231
// since there are still some users attempting to use VJ++ for
232
// development, we have dropped in a fallback which makes a
233
// second attempt using the platform's default encoding. In VJ++
234
// this is apparently ASCII, which is subset of UTF-8... and
235
// since the strings we'll be reading here are also primarily
236
// limited to the 7-bit ASCII range (at least, in English
237
// versions of Xalan), this should work well enough to keep us
238
// on the air until we're ready to officially decommit from
239
// VJ++.
240

241                 BufferedReader JavaDoc reader;
242                 try {
243                     reader = new BufferedReader JavaDoc(new InputStreamReader JavaDoc(is, "UTF-8"));
244                 } catch (UnsupportedEncodingException JavaDoc e) {
245                     reader = new BufferedReader JavaDoc(new InputStreamReader JavaDoc(is));
246                 }
247
248                 String JavaDoc line = reader.readLine();
249
250                 while (line != null) {
251                     if (line.length() == 0 || line.charAt(0) == '#') {
252                         line = reader.readLine();
253
254                         continue;
255                     }
256
257                     int index = line.indexOf(' ');
258
259                     if (index > 1) {
260                         String JavaDoc name = line.substring(0, index);
261
262                         ++index;
263
264                         if (index < line.length()) {
265                             String JavaDoc value = line.substring(index);
266                             index = value.indexOf(' ');
267
268                             if (index > 0) {
269                                 value = value.substring(0, index);
270                             }
271
272                             int code = Integer.parseInt(value);
273
274                             defineEntity(name, (char) code);
275                             if (extraEntity(code))
276                                 noExtraEntities = false;
277                         }
278                     }
279
280                     line = reader.readLine();
281                 }
282
283                 is.close();
284                 set(S_LINEFEED);
285                 set(S_CARRIAGERETURN);
286             } catch (Exception JavaDoc e) {
287                 throw new RuntimeException JavaDoc(
288                     XMLMessages.createXMLMessage(
289                         XMLErrorResources.ER_RESOURCE_COULD_NOT_LOAD,
290                         new Object JavaDoc[] { entitiesResource,
291                                        e.toString(),
292                                        entitiesResource,
293                                        e.toString()}));
294             } finally {
295                 if (is != null) {
296                     try {
297                         is.close();
298                     } catch (Exception JavaDoc except) {}
299                 }
300             }
301         }
302           
303         /* initialize the array isCleanTextASCII[] with a cache of values
304          * for use by ToStream.character(char[], int , int)
305          * and the array isSpecialTextASCII[] with the opposite values
306          * (all in the name of performance!)
307          */

308         for (int ch = 0; ch <ASCII_MAX; ch++)
309         if((((0x20 <= ch || (0x0A == ch || 0x0D == ch || 0x09 == ch)))
310              && (!get(ch))) || ('"' == ch))
311         {
312             isCleanTextASCII[ch] = true;
313             isSpecialTextASCII[ch] = false;
314         }
315         else {
316             isCleanTextASCII[ch] = false;
317             isSpecialTextASCII[ch] = true;
318         }
319         
320         /* Now that we've used get(ch) just above to initialize the
321          * two arrays we will change by adding a tab to the set of
322          * special chars for XML (but not HTML!).
323          * We do this because a tab is always a
324          * special character in an XML attribute,
325          * but only a special character in XML text
326          * if it has an entity defined for it.
327          * This is the reason for this delay.
328          */

329         if (Method.XML.equals(method))
330         {
331             set(S_HORIZONAL_TAB);
332         }
333         
334
335         onlyQuotAmpLtGt = noExtraEntities;
336
337         // initialize the array with a cache of the BitSet values
338
for (int i=0; i<ASCII_MAX; i++)
339             isSpecialAttrASCII[i] = get(i);
340
341     }
342
343     /**
344      * Defines a new character reference. The reference's name and value are
345      * supplied. Nothing happens if the character reference is already defined.
346      * <p>Unlike internal entities, character references are a string to single
347      * character mapping. They are used to map non-ASCII characters both on
348      * parsing and printing, primarily for HTML documents. '&lt;amp;' is an
349      * example of a character reference.</p>
350      *
351      * @param name The entity's name
352      * @param value The entity's value
353      */

354     private void defineEntity(String JavaDoc name, char value)
355     {
356         CharKey character = new CharKey(value);
357
358         m_charToEntityRef.put(character, name);
359         set(value);
360     }
361
362     private CharKey m_charKey = new CharKey();
363
364     /**
365      * Resolve a character to an entity reference name.
366      *
367      * This is reusing a stored key object, in an effort to avoid
368      * heap activity. Unfortunately, that introduces a threading risk.
369      * Simplest fix for now is to make it a synchronized method, or to give
370      * up the reuse; I see very little performance difference between them.
371      * Long-term solution would be to replace the hashtable with a sparse array
372      * keyed directly from the character's integer value; see DTM's
373      * string pool for a related solution.
374      *
375      * @param value character value that should be resolved to a name.
376      *
377      * @return name of character entity, or null if not found.
378      * @xsl.usage internal
379      */

380     synchronized public String JavaDoc getEntityNameForChar(char value)
381     {
382         // CharKey m_charKey = new CharKey(); //Alternative to synchronized
383
m_charKey.setChar(value);
384         return (String JavaDoc) m_charToEntityRef.get(m_charKey);
385     }
386     
387     /**
388      * Tell if the character argument that is from
389      * an attribute value should have special treatment.
390      *
391      * @param value the value of a character that is in an attribute value
392      * @return true if the character should have any special treatment,
393      * such as when writing out attribute values,
394      * or entity references.
395      * @xsl.usage internal
396      */

397     public final boolean isSpecialAttrChar(int value)
398     {
399         // for performance try the values in the boolean array first,
400
// this is faster access than the BitSet for common ASCII values
401

402         if (value < ASCII_MAX)
403             return isSpecialAttrASCII[value];
404
405         // rather than java.util.BitSet, our private
406
// implementation is faster (and less general).
407
return get(value);
408     }
409
410     /**
411      * Tell if the character argument that is from a
412      * text node should have special treatment.
413      *
414      * @param value the value of a character that is in a text node
415      * @return true if the character should have any special treatment,
416      * such as when writing out attribute values,
417      * or entity references.
418      * @xsl.usage internal
419      */

420     public final boolean isSpecialTextChar(int value)
421     {
422         // for performance try the values in the boolean array first,
423
// this is faster access than the BitSet for common ASCII values
424

425         if (value < ASCII_MAX)
426             return isSpecialTextASCII[value];
427
428         // rather than java.util.BitSet, our private
429
// implementation is faster (and less general).
430
return get(value);
431     }
432     
433     /**
434      * This method is used to determine if an ASCII character in
435      * a text node (not an attribute value) is "clean".
436      * @param value the character to check (0 to 127).
437      * @return true if the character can go to the writer as-is
438      * @xsl.usage internal
439      */

440     public final boolean isTextASCIIClean(int value)
441     {
442         return isCleanTextASCII[value];
443     }
444     
445 // In the future one might want to use the array directly and avoid
446
// the method call, but I think the JIT alreay inlines this well enough
447
// so don't do it (for now) - bjm
448
// public final boolean[] getASCIIClean()
449
// {
450
// return isCleanTextASCII;
451
// }
452

453
454     /**
455      * Factory that reads in a resource file that describes the mapping of
456      * characters to entity references.
457      *
458      * Resource files must be encoded in UTF-8 and have a format like:
459      * <pre>
460      * # First char # is a comment
461      * Entity numericValue
462      * quot 34
463      * amp 38
464      * </pre>
465      * (Note: Why don't we just switch to .properties files? Oct-01 -sc)
466      *
467      * @param entitiesResource Name of entities resource file that should
468      * be loaded, which describes that mapping of characters to entity references.
469      * @param method the output method type, which should be one of "xml", "html", "text"...
470      *
471      * @xsl.usage internal
472      */

473     public static CharInfo getCharInfo(String JavaDoc entitiesFileName, String JavaDoc method)
474     {
475         CharInfo charInfo = (CharInfo) m_getCharInfoCache.get(entitiesFileName);
476         if (charInfo != null) {
477             return charInfo;
478         }
479
480         // try to load it internally - cache
481
try {
482             charInfo = new CharInfo(entitiesFileName, method, true);
483             m_getCharInfoCache.put(entitiesFileName, charInfo);
484             return charInfo;
485         } catch (Exception JavaDoc e) {}
486
487         // try to load it externally - do not cache
488
try {
489             return new CharInfo(entitiesFileName, method);
490         } catch (Exception JavaDoc e) {}
491
492         String JavaDoc absoluteEntitiesFileName;
493
494         if (entitiesFileName.indexOf(':') < 0) {
495             absoluteEntitiesFileName =
496                 SystemIDResolver.getAbsoluteURIFromRelative(entitiesFileName);
497         } else {
498             try {
499                 absoluteEntitiesFileName =
500                     SystemIDResolver.getAbsoluteURI(entitiesFileName, null);
501             } catch (TransformerException JavaDoc te) {
502                 throw new WrappedRuntimeException(te);
503             }
504         }
505
506         return new CharInfo(absoluteEntitiesFileName, method, false);
507     }
508
509     /** Table of user-specified char infos. */
510     private static Hashtable JavaDoc m_getCharInfoCache = new Hashtable JavaDoc();
511
512     /**
513      * Returns the array element holding the bit value for the
514      * given integer
515      * @param i the integer that might be in the set of integers
516      *
517      */

518     private static int arrayIndex(int i) {
519         return (i >> SHIFT_PER_WORD);
520     }
521
522     /**
523      * For a given integer in the set it returns the single bit
524      * value used within a given word that represents whether
525      * the integer is in the set or not.
526      */

527     private static int bit(int i) {
528         int ret = (1 << (i & LOW_ORDER_BITMASK));
529         return ret;
530     }
531
532     /**
533      * Creates a new empty set of integers (characters)
534      * @param max the maximum integer to be in the set.
535      */

536     private int[] createEmptySetOfIntegers(int max) {
537         firstWordNotUsed = 0; // an optimization
538

539         int[] arr = new int[arrayIndex(max - 1) + 1];
540             return arr;
541  
542     }
543
544     /**
545      * Adds the integer (character) to the set of integers.
546      * @param i the integer to add to the set, valid values are
547      * 0, 1, 2 ... up to the maximum that was specified at
548      * the creation of the set.
549      */

550     private final void set(int i) {
551         int j = (i >> SHIFT_PER_WORD); // this word is used
552
int k = j + 1;
553         
554         if(firstWordNotUsed < k) // for optimization purposes.
555
firstWordNotUsed = k;
556             
557         array_of_bits[j] |= (1 << (i & LOW_ORDER_BITMASK));
558     }
559
560
561     /**
562      * Return true if the integer (character)is in the set of integers.
563      *
564      * This implementation uses an array of integers with 32 bits per
565      * integer. If a bit is set to 1 the corresponding integer is
566      * in the set of integers.
567      *
568      * @param i an integer that is tested to see if it is the
569      * set of integers, or not.
570      */

571     private final boolean get(int i) {
572
573         boolean in_the_set = false;
574         int j = (i >> SHIFT_PER_WORD); // wordIndex(i)
575
// an optimization here, ... a quick test to see
576
// if this integer is beyond any of the words in use
577
if(j < firstWordNotUsed)
578             in_the_set = (array_of_bits[j] &
579                           (1 << (i & LOW_ORDER_BITMASK))
580             ) != 0; // 0L for 64 bit words
581
return in_the_set;
582     }
583     
584     // record if there are any entities other than
585
// quot, amp, lt, gt (probably user defined)
586
/**
587      * @return true if the entity
588      * @param code The value of the character that has an entity defined
589      * for it.
590      */

591     private boolean extraEntity(int entityValue)
592     {
593         boolean extra = false;
594         if (entityValue < 128)
595         {
596             switch (entityValue)
597             {
598                 case 34 : // quot
599
case 38 : // amp
600
case 60 : // lt
601
case 62 : // gt
602
break;
603                 default : // other entity in range 0 to 127
604
extra = true;
605             }
606         }
607         return extra;
608     }
609 }
610
Popular Tags