CharInfo


1   /*
2    * Copyright 1999-2004 The Apache Software Foundation.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  /*
17   * $Id: CharInfo.java,v 1.11 2004/02/23 10:29:37 aruny Exp $
18   */
19  package org.apache.xml.serializer;
20  
21  import java.io.BufferedReader  ;
22  import java.io.InputStream  ;
23  import java.io.InputStreamReader  ;
24  import java.io.UnsupportedEncodingException  ;
25  import java.net.URL  ;
26  import java.util.Hashtable  ;
27  import java.util.PropertyResourceBundle  ;
28  import java.util.Enumeration  ;
29  import java.util.ResourceBundle  ;
30  
31  import javax.xml.transform.TransformerException  ;
32  
33  import org.apache.xml.res.XMLErrorResources;
34  import org.apache.xml.res.XMLMessages;
35  import org.apache.xml.utils.CharKey;
36  import org.apache.xml.utils.SystemIDResolver;
37  import org.apache.xml.utils.WrappedRuntimeException;
38  
39  /**
40   * This class provides services that tell if a character should have
41   * special treatement, such as entity reference substitution or normalization
42   * of a newline character.  It also provides character to entity reference
43   * lookup.
44   *
45   * DEVELOPERS: See Known Issue in the constructor.
46   * 
47   * @xsl.usage internal
48   */
49  class CharInfo
50  {
51      /** Lookup table for characters to entity references. */
52      private Hashtable   m_charToEntityRef = new Hashtable  ();
53  
54      /**
55       * The name of the HTML entities file.
56       * If specified, the file will be resource loaded with the default class loader.
57       */
58      public static String   HTML_ENTITIES_RESOURCE = "org.apache.xml.serializer.HTMLEntities";
59  
60      /**
61       * The name of the XML entities file.
62       * If specified, the file will be resource loaded with the default class loader.
63       */
64      public static String   XML_ENTITIES_RESOURCE = "org.apache.xml.serializer.XMLEntities";
65  
66      /** The horizontal tab character, which the parser should always normalize. */
67      public static final char S_HORIZONAL_TAB = 0x09;
68  
69      /** The linefeed character, which the parser should always normalize. */
70      public static final char S_LINEFEED = 0x0A;
71  
72      /** The carriage return character, which the parser should always normalize. */
73      public static char S_CARRIAGERETURN = 0x0D;
74      
75      /** This flag is an optimization for HTML entities. It false if entities 
76       * other than quot (34), amp (38), lt (60) and gt (62) are defined
77       * in the range 0 to 127.
78       * @xsl.usage internal
79       */    
80      final boolean onlyQuotAmpLtGt;
81      
82      /** Copy the first 0,1 ... ASCII_MAX values into an array */
83      private static final int ASCII_MAX = 128;
84      
85      /** Array of values is faster access than a set of bits 
86       * to quickly check ASCII characters in attribute values. 
87       */
88      private boolean[] isSpecialAttrASCII = new boolean[ASCII_MAX];
89      
90      /** Array of values is faster access than a set of bits 
91       * to quickly check ASCII characters in text nodes. 
92       */
93      private boolean[] isSpecialTextASCII = new boolean[ASCII_MAX];
94  
95      private boolean[] isCleanTextASCII = new boolean[ASCII_MAX];
96  
97      /** An array of bits to record if the character is in the set.
98       * Although information in this array is complete, the
99       * isSpecialAttrASCII array is used first because access to its values
100      * is common and faster.
101      */   
102     private int array_of_bits[] = createEmptySetOfIntegers(65535);
103      
104     
105     // 5 for 32 bit words,  6 for 64 bit words ...
106     /*
107      * This constant is used to shift an integer to quickly
108      * calculate which element its bit is stored in.
109      * 5 for 32 bit words (int) ,  6 for 64 bit words (long)
110      */
111     private static final int SHIFT_PER_WORD = 5;
112     
113     /*
114      * A mask to get the low order bits which are used to
115      * calculate the value of the bit within a given word,
116      * that will represent the presence of the integer in the 
117      * set.
118      * 
119      * 0x1F for 32 bit words (int),
120      * or 0x3F for 64 bit words (long) 
121      */
122     private static final int LOW_ORDER_BITMASK = 0x1f;
123     
124     /*
125      * This is used for optimizing the lookup of bits representing
126      * the integers in the set. It is the index of the first element
127      * in the array array_of_bits[] that is not used.
128      */
129     private int firstWordNotUsed;
130 
131 
132     /**
133      * Constructor that reads in a resource file that describes the mapping of
134      * characters to entity references.
135      * This constructor is private, just to force the use
136      * of the getCharInfo(entitiesResource) factory
137      *
138      * Resource files must be encoded in UTF-8 and can either be properties
139      * files with a .properties extension assumed.  Alternatively, they can
140      * have the following form, with no particular extension assumed:
141      *
142      * <pre>
143      * # First char # is a comment
144      * Entity numericValue
145      * quot 34
146      * amp 38
147      * </pre>
148      *    
149      * @param entitiesResource Name of properties or resource file that should
150      * be loaded, which describes that mapping of characters to entity
151      * references.
152      */
153     private CharInfo(String   entitiesResource, String   method)
154     {
155         this(entitiesResource, method, false);
156     }
157 
158     private CharInfo(String   entitiesResource, String   method, boolean internal)
159     {
160         ResourceBundle   entities = null;
161         boolean noExtraEntities = true;
162 
163         // Make various attempts to interpret the parameter as a properties
164         // file or resource file, as follows:
165         //
166         //   1) attempt to load .properties file using ResourceBundle
167         //   2) try using the class loader to find the specified file a resource
168         //      file
169         //   3) try treating the resource a URI
170 
171         if (internal) { 
172             try {
173                 // Load entity property files by using PropertyResourceBundle,
174                 // cause of security issure for applets
175                 entities = PropertyResourceBundle.getBundle(entitiesResource);
176             } catch (Exception   e) {}
177         }
178 
179         if (entities != null) {
180             Enumeration   keys = entities.getKeys();
181             while (keys.hasMoreElements()){
182                 String   name = (String  ) keys.nextElement();
183                 String   value = entities.getString(name);
184                 int code = Integer.parseInt(value);
185                 defineEntity(name, (char) code);
186                 if (extraEntity(code))
187                     noExtraEntities = false;
188             }
189             set(S_LINEFEED);
190             set(S_CARRIAGERETURN);
191         } else {
192             InputStream   is = null;
193 
194             // Load user specified resource file by using URL loading, it
195             // requires a valid URI as parameter
196             try {
197                 if (internal) {
198                     is = CharInfo.class.getResourceAsStream(entitiesResource);
199                 } else {
200                     ClassLoader   cl = ObjectFactory.findClassLoader();
201                     if (cl == null) {
202                         is = ClassLoader.getSystemResourceAsStream(entitiesResource);
203                     } else {
204                         is = cl.getResourceAsStream(entitiesResource);
205                     }
206 
207                     if (is == null) {
208                         try {
209                             URL   url = new URL  (entitiesResource);
210                             is = url.openStream();
211                         } catch (Exception   e) {}
212                     }
213                 }
214 
215                 if (is == null) {
216                     throw new RuntimeException  (
217                         XMLMessages.createXMLMessage(
218                             XMLErrorResources.ER_RESOURCE_COULD_NOT_FIND,
219                             new Object  [] {entitiesResource, entitiesResource}));
220                 }
221 
222                 // Fix Bugzilla#4000: force reading in UTF-8
223                 //  This creates the de facto standard that Xalan's resource 
224                 //  files must be encoded in UTF-8. This should work in all
225                 // JVMs.
226                 //
227                 // %REVIEW% KNOWN ISSUE: IT FAILS IN MICROSOFT VJ++, which
228                 // didn't implement the UTF-8 encoding. Theoretically, we should
229                 // simply let it fail in that case, since the JVM is obviously
230                 // broken if it doesn't support such a basic standard.  But
231                 // since there are still some users attempting to use VJ++ for
232                 // development, we have dropped in a fallback which makes a
233                 // second attempt using the platform's default encoding. In VJ++
234                 // this is apparently ASCII, which is subset of UTF-8... and
235                 // since the strings we'll be reading here are also primarily
236                 // limited to the 7-bit ASCII range (at least, in English
237                 // versions of Xalan), this should work well enough to keep us
238                 // on the air until we're ready to officially decommit from
239                 // VJ++.
240 
241                 BufferedReader   reader;
242                 try {
243                     reader = new BufferedReader  (new InputStreamReader  (is, "UTF-8"));
244                 } catch (UnsupportedEncodingException   e) {
245                     reader = new BufferedReader  (new InputStreamReader  (is));
246                 }
247 
248                 String   line = reader.readLine();
249 
250                 while (line != null) {
251                     if (line.length() == 0 || line.charAt(0) == '#') {
252                         line = reader.readLine();
253 
254                         continue;
255                     }
256 
257                     int index = line.indexOf(' ');
258 
259                     if (index > 1) {
260                         String   name = line.substring(0, index);
261 
262                         ++index;
263 
264                         if (index < line.length()) {
265                             String   value = line.substring(index);
266                             index = value.indexOf(' ');
267 
268                             if (index > 0) {
269                                 value = value.substring(0, index);
270                             }
271 
272                             int code = Integer.parseInt(value);
273 
274                             defineEntity(name, (char) code);
275                             if (extraEntity(code))
276                                 noExtraEntities = false;
277                         }
278                     }
279 
280                     line = reader.readLine();
281                 }
282 
283                 is.close();
284                 set(S_LINEFEED);
285                 set(S_CARRIAGERETURN);
286             } catch (Exception   e) {
287                 throw new RuntimeException  (
288                     XMLMessages.createXMLMessage(
289                         XMLErrorResources.ER_RESOURCE_COULD_NOT_LOAD,
290                         new Object  [] { entitiesResource,
291                                        e.toString(),
292                                        entitiesResource,
293                                        e.toString()}));
294             } finally {
295                 if (is != null) {
296                     try {
297                         is.close();
298                     } catch (Exception   except) {}
299                 }
300             }
301         }
302           
303         /* initialize the array isCleanTextASCII[] with a cache of values
304          * for use by ToStream.character(char[], int , int)
305          * and the array isSpecialTextASCII[] with the opposite values
306          * (all in the name of performance!)
307          */
308         for (int ch = 0; ch <ASCII_MAX; ch++)
309         if((((0x20 <= ch || (0x0A == ch || 0x0D == ch || 0x09 == ch)))
310              && (!get(ch))) || ('"' == ch))
311         {
312             isCleanTextASCII[ch] = true;
313             isSpecialTextASCII[ch] = false;
314         }
315         else {
316             isCleanTextASCII[ch] = false;
317             isSpecialTextASCII[ch] = true;     
318         }       
319         
320         /* Now that we've used get(ch) just above to initialize the
321          * two arrays we will change by adding a tab to the set of 
322          * special chars for XML (but not HTML!).
323          * We do this because a tab is always a
324          * special character in an XML attribute, 
325          * but only a special character in XML text 
326          * if it has an entity defined for it.
327          * This is the reason for this delay.
328          */
329         if (Method.XML.equals(method)) 
330         {
331             set(S_HORIZONAL_TAB);
332         }
333         
334 
335         onlyQuotAmpLtGt = noExtraEntities;
336 
337         // initialize the array with a cache of the BitSet values
338         for (int i=0; i<ASCII_MAX; i++)
339             isSpecialAttrASCII[i] = get(i);    
340 
341     }
342 
343     /**
344      * Defines a new character reference. The reference's name and value are
345      * supplied. Nothing happens if the character reference is already defined.
346      * <p>Unlike internal entities, character references are a string to single
347      * character mapping. They are used to map non-ASCII characters both on
348      * parsing and printing, primarily for HTML documents. '&lt;amp;' is an
349      * example of a character reference.</p>
350      *
351      * @param name The entity's name
352      * @param value The entity's value
353      */
354     private void defineEntity(String   name, char value)
355     {
356         CharKey character = new CharKey(value);
357 
358         m_charToEntityRef.put(character, name);
359         set(value);
360     }
361 
362     private CharKey m_charKey = new CharKey();
363 
364     /**
365      * Resolve a character to an entity reference name.
366      *
367      * This is reusing a stored key object, in an effort to avoid
368      * heap activity. Unfortunately, that introduces a threading risk.
369      * Simplest fix for now is to make it a synchronized method, or to give
370      * up the reuse; I see very little performance difference between them.
371      * Long-term solution would be to replace the hashtable with a sparse array
372      * keyed directly from the character's integer value; see DTM's
373      * string pool for a related solution.
374      *
375      * @param value character value that should be resolved to a name.
376      *
377      * @return name of character entity, or null if not found.
378      * @xsl.usage internal
379      */
380     synchronized public String   getEntityNameForChar(char value)
381     {
382         // CharKey m_charKey = new CharKey(); //Alternative to synchronized
383         m_charKey.setChar(value);
384         return (String  ) m_charToEntityRef.get(m_charKey);
385     }
386     
387     /**
388      * Tell if the character argument that is from
389      * an attribute value should have special treatment.
390      * 
391      * @param value the value of a character that is in an attribute value
392      * @return true if the character should have any special treatment, 
393      * such as when writing out attribute values, 
394      * or entity references.
395      * @xsl.usage internal
396      */
397     public final boolean isSpecialAttrChar(int value)
398     {
399         // for performance try the values in the boolean array first,
400         // this is faster access than the BitSet for common ASCII values
401 
402         if (value < ASCII_MAX)
403             return isSpecialAttrASCII[value];
404 
405         // rather than java.util.BitSet, our private
406         // implementation is faster (and less general).
407         return get(value);
408     }    
409 
410     /**
411      * Tell if the character argument that is from a 
412      * text node should have special treatment.
413      * 
414      * @param value the value of a character that is in a text node
415      * @return true if the character should have any special treatment, 
416      * such as when writing out attribute values, 
417      * or entity references.
418      * @xsl.usage internal
419      */
420     public final boolean isSpecialTextChar(int value)
421     {
422         // for performance try the values in the boolean array first,
423         // this is faster access than the BitSet for common ASCII values
424 
425         if (value < ASCII_MAX)
426             return isSpecialTextASCII[value];
427 
428         // rather than java.util.BitSet, our private
429         // implementation is faster (and less general).
430         return get(value);
431     }
432     
433     /**
434      * This method is used to determine if an ASCII character in
435      * a text node (not an attribute value) is "clean".
436      * @param value the character to check (0 to 127).
437      * @return true if the character can go to the writer as-is
438      * @xsl.usage internal
439      */
440     public final boolean isTextASCIIClean(int value)
441     {
442         return isCleanTextASCII[value];
443     }
444     
445 //  In the future one might want to use the array directly and avoid
446 //  the method call, but I think the JIT alreay inlines this well enough
447 //  so don't do it (for now) - bjm    
448 //    public final boolean[] getASCIIClean()
449 //    {
450 //        return isCleanTextASCII;
451 //    }
452 
453 
454     /**
455      * Factory that reads in a resource file that describes the mapping of
456      * characters to entity references.
457      *
458      * Resource files must be encoded in UTF-8 and have a format like:
459      * <pre>
460      * # First char # is a comment
461      * Entity numericValue
462      * quot 34
463      * amp 38
464      * </pre>
465      * (Note: Why don't we just switch to .properties files? Oct-01 -sc)
466      *
467      * @param entitiesResource Name of entities resource file that should
468      * be loaded, which describes that mapping of characters to entity references.
469      * @param method the output method type, which should be one of "xml", "html", "text"...
470      * 
471      * @xsl.usage internal
472      */
473     public static CharInfo getCharInfo(String   entitiesFileName, String   method)
474     {
475         CharInfo charInfo = (CharInfo) m_getCharInfoCache.get(entitiesFileName);
476         if (charInfo != null) {
477             return charInfo;
478         }
479 
480         // try to load it internally - cache
481         try {
482             charInfo = new CharInfo(entitiesFileName, method, true);
483             m_getCharInfoCache.put(entitiesFileName, charInfo);
484             return charInfo;
485         } catch (Exception   e) {}
486 
487         // try to load it externally - do not cache
488         try {
489             return new CharInfo(entitiesFileName, method);
490         } catch (Exception   e) {}
491 
492         String   absoluteEntitiesFileName;
493 
494         if (entitiesFileName.indexOf(':') < 0) {
495             absoluteEntitiesFileName =
496                 SystemIDResolver.getAbsoluteURIFromRelative(entitiesFileName);
497         } else {
498             try {
499                 absoluteEntitiesFileName =
500                     SystemIDResolver.getAbsoluteURI(entitiesFileName, null);
501             } catch (TransformerException   te) {
502                 throw new WrappedRuntimeException(te);
503             }
504         }
505 
506         return new CharInfo(absoluteEntitiesFileName, method, false);
507     }
508 
509     /** Table of user-specified char infos. */
510     private static Hashtable   m_getCharInfoCache = new Hashtable  ();
511 
512     /**
513      * Returns the array element holding the bit value for the
514      * given integer
515      * @param i the integer that might be in the set of integers
516      * 
517      */
518     private static int arrayIndex(int i) {
519         return (i >> SHIFT_PER_WORD);
520     }
521 
522     /**
523      * For a given integer in the set it returns the single bit
524      * value used within a given word that represents whether
525      * the integer is in the set or not.
526      */
527     private static int bit(int i) {
528         int ret = (1 << (i & LOW_ORDER_BITMASK));
529         return ret;
530     }
531 
532     /**
533      * Creates a new empty set of integers (characters)
534      * @param max the maximum integer to be in the set.
535      */
536     private int[] createEmptySetOfIntegers(int max) {
537         firstWordNotUsed = 0; // an optimization 
538 
539         int[] arr = new int[arrayIndex(max - 1) + 1];
540             return arr;
541  
542     }
543 
544     /**
545      * Adds the integer (character) to the set of integers.
546      * @param i the integer to add to the set, valid values are 
547      * 0, 1, 2 ... up to the maximum that was specified at
548      * the creation of the set.
549      */
550     private final void set(int i) {        
551         int j = (i >> SHIFT_PER_WORD); // this word is used
552         int k = j + 1;       
553         
554         if(firstWordNotUsed < k) // for optimization purposes.
555             firstWordNotUsed = k;
556             
557         array_of_bits[j] |= (1 << (i & LOW_ORDER_BITMASK));
558     }
559 
560 
561     /**
562      * Return true if the integer (character)is in the set of integers.
563      * 
564      * This implementation uses an array of integers with 32 bits per
565      * integer.  If a bit is set to 1 the corresponding integer is 
566      * in the set of integers.
567      * 
568      * @param i an integer that is tested to see if it is the
569      * set of integers, or not.
570      */
571     private final boolean get(int i) {
572 
573         boolean in_the_set = false;
574         int j = (i >> SHIFT_PER_WORD); // wordIndex(i)
575         // an optimization here, ... a quick test to see
576         // if this integer is beyond any of the words in use
577         if(j < firstWordNotUsed)
578             in_the_set = (array_of_bits[j] & 
579                           (1 << (i & LOW_ORDER_BITMASK))
580             ) != 0;  // 0L for 64 bit words
581         return in_the_set;
582     }
583     
584     // record if there are any entities other than
585     // quot, amp, lt, gt  (probably user defined)
586     /**
587      * @return true if the entity 
588      * @param code The value of the character that has an entity defined
589      * for it.
590      */
591     private boolean extraEntity(int entityValue)
592     {
593         boolean extra = false;
594         if (entityValue < 128)
595         {
596             switch (entityValue)
597             {
598                 case 34 : // quot
599                 case 38 : // amp
600                 case 60 : // lt
601                 case 62 : // gt
602                     break;
603                 default : // other entity in range 0 to 127  
604                     extra = true;
605             }
606         }
607         return extra;
608     }    
609 }
610
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags