KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > sun > imageio > metadata > XmlChars


1 /*
2  * $Id: XmlChars.java,v 1.1.1.1 2000/11/23 01:53:35 edwingo Exp $
3  *
4  * The Apache Software License, Version 1.1
5  *
6  *
7  * Copyright (c) 2000 The Apache Software Foundation. All rights
8  * reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  *
14  * 1. Redistributions of source code must retain the above copyright
15  * notice, this list of conditions and the following disclaimer.
16  *
17  * 2. Redistributions in binary form must reproduce the above copyright
18  * notice, this list of conditions and the following disclaimer in
19  * the documentation and/or other materials provided with the
20  * distribution.
21  *
22  * 3. The end-user documentation included with the redistribution,
23  * if any, must include the following acknowledgment:
24  * "This product includes software developed by the
25  * Apache Software Foundation (http://www.apache.org/)."
26  * Alternately, this acknowledgment may appear in the software itself,
27  * if and wherever such third-party acknowledgments normally appear.
28  *
29  * 4. The names "Crimson" and "Apache Software Foundation" must
30  * not be used to endorse or promote products derived from this
31  * software without prior written permission. For written
32  * permission, please contact apache@apache.org.
33  *
34  * 5. Products derived from this software may not be called "Apache",
35  * nor may "Apache" appear in their name, without prior written
36  * permission of the Apache Software Foundation.
37  *
38  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
39  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
40  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
41  * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
42  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
43  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
44  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
45  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
46  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
47  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
48  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
49  * SUCH DAMAGE.
50  * ====================================================================
51  *
52  * This software consists of voluntary contributions made by many
53  * individuals on behalf of the Apache Software Foundation and was
54  * originally based on software copyright (c) 1999, Sun Microsystems, Inc.,
55  * http://www.sun.com. For more information on the Apache Software
56  * Foundation, please see <http://www.apache.org/>.
57  */

58
59 package com.sun.imageio.metadata;
60
61
/**
 * Methods in this class are used to determine whether characters may
 * appear in certain roles in XML documents.  Such methods are used
 * both to parse and to create such documents.
 *
 * <P> All methods are static and the class cannot be instantiated.
 *
 * @version 1.8
 * @author David Brownell
 */
public final class XmlChars
{
    // can't construct instances
    private XmlChars () { }

    /**
     * Returns true if the argument, a UCS-4 character code, is valid in
     * XML documents.  Unicode characters fit into the low sixteen
     * bits of a UCS-4 character, and pairs of Unicode <em>surrogate
     * characters</em> can be combined to encode UCS-4 characters in
     * documents containing only Unicode.  (The <code>char</code> datatype
     * in the Java Programming Language represents Unicode characters,
     * including unpaired surrogates.)
     *
     * <P> In XML, UCS-4 characters can also be encoded by the use of
     * <em>character references</em> such as <b>&amp;#x12345678;</b>, which
     * happens to refer to a character that is disallowed in XML documents.
     * UCS-4 characters allowed in XML documents can be expressed with
     * one or two Unicode characters.
     *
     * @param ucs4char The 32-bit UCS-4 character being tested.
     * @return true if the code point matches the XML <em>Char</em>
     *  production; false otherwise (including for negative values,
     *  surrogate code points, 0xFFFE and 0xFFFF).
     */
    public static boolean isChar (int ucs4char)
    {
        // [2] Char ::= #x0009 | #x000A | #x000D
        //            | [#x0020-#xD7FF]
        //            ... surrogates excluded!
        //            | [#xE000-#xFFFD]
        //            | [#x10000-#x10ffff]
        return (ucs4char >= 0x0020 && ucs4char <= 0xD7FF)
            || ucs4char == 0x000A || ucs4char == 0x0009
            || ucs4char == 0x000D
            || (ucs4char >= 0xE000 && ucs4char <= 0xFFFD)
            || (ucs4char >= 0x10000 && ucs4char <= 0x10ffff);
    }

    /**
     * Returns true if the character is allowed to be a non-initial
     * character in names according to the XML recommendation.
     *
     * @param c the character being tested
     * @return true if <code>c</code> may appear after the first
     *  character of an XML name
     * @see #isNCNameChar
     * @see #isLetter
     */
    public static boolean isNameChar (char c)
    {
        // [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':'
        //                | CombiningChar | Extender

        // Letter, Digit and CombiningChar are all covered by isLetter2.
        if (isLetter2 (c)) {
            return true;
        }
        // Cheap early exit for '>', which matches none of the tests below.
        if (c == '>') {
            return false;
        }
        return c == '.' || c == '-' || c == '_' || c == ':'
            || isExtender (c);
    }

    /**
     * Returns true if the character is allowed to be a non-initial
     * character in unscoped names according to the rules of the XML
     * Namespaces proposed recommendation.  Except for precluding
     * the colon (used to separate names from their scopes) these
     * characters are just as allowed by the XML recommendation.
     *
     * @param c the character being tested
     * @return true if <code>c</code> is not a colon and satisfies
     *  {@link #isNameChar}
     * @see #isNameChar
     * @see #isLetter
     */
    public static boolean isNCNameChar (char c)
    {
        // [NC 5] NCNameChar ::= Letter | Digit | '.' | '-' | '_'
        //                     | CombiningChar | Extender
        return c != ':' && isNameChar (c);
    }

    /**
     * Returns true if the character is allowed where XML supports
     * whitespace characters, false otherwise.
     *
     * @param c the character being tested
     * @return true for space, tab, line feed and carriage return only
     */
    public static boolean isSpace (char c)
    {
        return c == ' ' || c == '\t' || c == '\n' || c == '\r';
    }


    /*
     * NOTE:  java.lang.Character.getType() values are:
     *
     * UNASSIGNED                    = 0,
     *
     * UPPERCASE_LETTER            = 1,    // Lu
     * LOWERCASE_LETTER            = 2,    // Ll
     * TITLECASE_LETTER            = 3,    // Lt
     * MODIFIER_LETTER             = 4,    // Lm
     * OTHER_LETTER                = 5,    // Lo
     * NON_SPACING_MARK            = 6,    // Mn
     * ENCLOSING_MARK              = 7,    // Me
     * COMBINING_SPACING_MARK      = 8,    // Mc
     * DECIMAL_DIGIT_NUMBER        = 9,    // Nd
     * LETTER_NUMBER               = 10,   // Nl
     * OTHER_NUMBER                = 11,   // No
     * SPACE_SEPARATOR             = 12,   // Zs
     * LINE_SEPARATOR              = 13,   // Zl
     * PARAGRAPH_SEPARATOR         = 14,   // Zp
     * CONTROL                     = 15,   // Cc
     * FORMAT                      = 16,   // Cf
     *                         // 17 reserved for proposed Ci category
     * PRIVATE_USE                 = 18,   // Co
     * SURROGATE                   = 19,   // Cs
     * DASH_PUNCTUATION            = 20,   // Pd
     * START_PUNCTUATION           = 21,   // Ps
     * END_PUNCTUATION             = 22,   // Pe
     * CONNECTOR_PUNCTUATION       = 23,   // Pc
     * OTHER_PUNCTUATION           = 24,   // Po
     * MATH_SYMBOL                 = 25,   // Sm
     * CURRENCY_SYMBOL             = 26,   // Sc
     * MODIFIER_SYMBOL             = 27,   // Sk
     * OTHER_SYMBOL                = 28;   // So
     */

    /**
     * Returns true if the character is an XML "letter".  XML Names must
     * start with Letters or a few other characters, but other characters
     * in names must only satisfy the <em>isNameChar</em> predicate.
     *
     * @param c the character being tested
     * @return true if <code>c</code> is treated as an XML Letter
     * @see #isNameChar
     * @see #isNCNameChar
     */
    public static boolean isLetter (char c)
    {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat

        //
        // Optimize the typical case.
        //
        if (c >= 'a' && c <= 'z') {
            return true;
        }
        if (c == '/') {
            return false;
        }
        if (c >= 'A' && c <= 'Z') {
            return true;
        }

        //
        // Since the tables are too ridiculous to use in code,
        // we're using the footnotes here to drive this test.
        //
        switch (Character.getType (c)) {
            // app. B footnote says these are 'name start'
            // chars' ...
            case Character.LOWERCASE_LETTER:        // Ll
            case Character.UPPERCASE_LETTER:        // Lu
            case Character.OTHER_LETTER:            // Lo
            case Character.TITLECASE_LETTER:        // Lt
            case Character.LETTER_NUMBER:           // Nl

                // OK, here we just have some exceptions to check...
                return !isCompatibilityChar (c)
                    // per "5.14 of Unicode", rule out some combiners
                    && !(c >= 0x20dd && c <= 0x20e0);

            default:
                // check for some exceptions:  these are "alphabetic"
                return (c >= 0x02bb && c <= 0x02c1)
                    || c == 0x0559 || c == 0x06e5 || c == 0x06e6;
        }
    }

    //
    // XML 1.0 discourages "compatibility" characters in names; these
    // were defined to permit passing through some information stored in
    // older non-Unicode character sets.  These always have alternative
    // representations in Unicode, e.g. using combining chars.
    //
    // Returns true if the character is one of the compatibility
    // characters listed below, false otherwise.
    //
    private static boolean isCompatibilityChar (char c)
    {
        // the numerous comparisons here seem unavoidable,
        // but the switch can reduce the number which must
        // actually be executed.

        // dispatch on the high byte of the character
        switch ((c >> 8) & 0x0ff) {
            case 0x00:
                // ISO Latin/1 has a few compatibility characters
                return c == 0x00aa || c == 0x00b5 || c == 0x00ba;

            case 0x01:
                // as do Latin Extended A and (parts of) B
                return (c >= 0x0132 && c <= 0x0133)
                    || (c >= 0x013f && c <= 0x0140)
                    || c == 0x0149
                    || c == 0x017f
                    || (c >= 0x01c4 && c <= 0x01cc)
                    || (c >= 0x01f1 && c <= 0x01f3);

            case 0x02:
                // some spacing modifiers
                return (c >= 0x02b0 && c <= 0x02b8)
                    || (c >= 0x02e0 && c <= 0x02e4);

            case 0x03:
                return c == 0x037a;                 // Greek

            case 0x05:
                return c == 0x0587;                 // Armenian

            case 0x0e:
                return c >= 0x0edc && c <= 0x0edd;  // Laotian

            case 0x11:
                // big chunks of Hangul Jamo are all "compatibility"
                return c == 0x1101
                    || c == 0x1104
                    || c == 0x1108
                    || c == 0x110a
                    || c == 0x110d
                    || (c >= 0x1113 && c <= 0x113b)
                    || c == 0x113d
                    || c == 0x113f
                    || (c >= 0x1141 && c <= 0x114b)
                    || c == 0x114d
                    || c == 0x114f
                    || (c >= 0x1151 && c <= 0x1153)
                    || (c >= 0x1156 && c <= 0x1158)
                    || c == 0x1162
                    || c == 0x1164
                    || c == 0x1166
                    || c == 0x1168
                    || (c >= 0x116a && c <= 0x116c)
                    || (c >= 0x116f && c <= 0x1171)
                    || c == 0x1174
                    || (c >= 0x1176 && c <= 0x119d)
                    || (c >= 0x119f && c <= 0x11a2)
                    || (c >= 0x11a9 && c <= 0x11aa)
                    || (c >= 0x11ac && c <= 0x11ad)
                    || (c >= 0x11b0 && c <= 0x11b6)
                    || c == 0x11b9
                    || c == 0x11bb
                    || (c >= 0x11c3 && c <= 0x11ea)
                    || (c >= 0x11ec && c <= 0x11ef)
                    || (c >= 0x11f1 && c <= 0x11f8);

            case 0x20:
                return c == 0x207f;                 // superscript

            case 0x21:
                return
                    // various letterlike symbols
                    c == 0x2102
                    || c == 0x2107
                    || (c >= 0x210a && c <= 0x2113)
                    || c == 0x2115
                    || (c >= 0x2118 && c <= 0x211d)
                    || c == 0x2124
                    || c == 0x2128
                    || (c >= 0x212c && c <= 0x212d)
                    || (c >= 0x212f && c <= 0x2138)

                    // most Roman numerals (less 1K, 5K, 10K)
                    || (c >= 0x2160 && c <= 0x217f);

            case 0x30:
                // some Hiragana
                return c >= 0x309b && c <= 0x309c;

            case 0x31:
                // all Hangul Compatibility Jamo
                return c >= 0x3131 && c <= 0x318e;

            case 0xf9:
            case 0xfa:
            case 0xfb:
            case 0xfc:
            case 0xfd:
            case 0xfe:
            case 0xff:
                // the whole "compatibility" area is for that purpose!
                return true;

            default:
                // most of Unicode isn't flagged as being for compatibility
                return false;
        }
    }

    // guts of isNameChar/isNCNameChar:  accepts the letter, digit and
    // combining-mark categories, minus the exceptions checked below
    private static boolean isLetter2 (char c)
    {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat
        // [87] CombiningChar ::= ... too much to repeat

        //
        // Optimize the typical case.
        //
        if (c >= 'a' && c <= 'z') {
            return true;
        }
        if (c == '>') {
            return false;
        }
        if (c >= 'A' && c <= 'Z') {
            return true;
        }

        //
        // Since the tables are too ridiculous to use in code,
        // we're using the footnotes here to drive this test.
        //
        switch (Character.getType (c)) {
            // app. B footnote says these are 'name start'
            // chars' ...
            case Character.LOWERCASE_LETTER:        // Ll
            case Character.UPPERCASE_LETTER:        // Lu
            case Character.OTHER_LETTER:            // Lo
            case Character.TITLECASE_LETTER:        // Lt
            case Character.LETTER_NUMBER:           // Nl
            // ... and these are name characters 'other
            // than name start characters'
            case Character.COMBINING_SPACING_MARK:  // Mc
            case Character.ENCLOSING_MARK:          // Me
            case Character.NON_SPACING_MARK:        // Mn
            case Character.MODIFIER_LETTER:         // Lm
            case Character.DECIMAL_DIGIT_NUMBER:    // Nd

                // OK, here we just have some exceptions to check...
                return !isCompatibilityChar (c)
                    // per "5.14 of Unicode", rule out some combiners
                    && !(c >= 0x20dd && c <= 0x20e0);

            default:
                // added a character ...
                return c == 0x0387;
        }
    }

    // NOTE: not called from anywhere within this class; retained as-is.
    private static boolean isDigit (char c)
    {
        // [88] Digit ::= ...

        //
        // java.lang.Character.isDigit is correct from the XML point
        // of view except that it allows "fullwidth" digits.
        //
        return Character.isDigit (c)
            && !(c >= 0xff10 && c <= 0xff19);
    }

    // Returns true if the character is one of the code points listed
    // for the Extender production.
    private static boolean isExtender (char c)
    {
        // [89] Extender ::= ...
        return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
            || c == 0x0640 || c == 0x0e46 || c == 0x0ec6
            || c == 0x3005 || (c >= 0x3031 && c <= 0x3035)
            || (c >= 0x309d && c <= 0x309e)
            || (c >= 0x30fc && c <= 0x30fe);
    }
}
428
Popular Tags