KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > JSX > ParseUtilities


1 /** (c) Brendan Macmillan 2001, licensed under GNU's GPL 2
2         (If your project is not GPL, then a fee is due - see license on website)
3     Website: http://www.csse.monash.edu.au/~bren/JSX
4     Website: http://freshmeat.net/projects/jsx
5     List (can read without subscribing, need to join to post):
6       http://groups.yahoo.com/group/JSX-ideas/messages
7         Commercial licensing enquiries only:
8             bren@mail.csse.monash.edu.au */

9
10
/** ===========================================================================
    * ParseUtilities
    * ==============
    * Parsing helpers for boolean, float and double values; and
    * encoding and decoding of XML Strings (both parsing and writing code).
    *
    * Correctly parses NaN, Infinity and -Infinity of floating point numbers
    * (float and double). I wonder why the standard versions do not do this
    * already?
    *
    * Based on a code patch submitted by Mark Collette, 11 Jul 2001
    **/

23 package JSX;
24
public class ParseUtilities {

    /**
     * Parses a boolean, ignoring case and surrounding whitespace.
     *
     * @param in text expected to read "true" or "false"
     * @return the parsed value
     * @throws IllegalArgumentException (the superclass of
     *         NumberFormatException) if the trimmed text is neither
     */
    public static boolean parseBoolean(String in) {
        String s = in.trim();
        if (s.equalsIgnoreCase("false")) {
            return false;
        }
        if (s.equalsIgnoreCase("true")) {
            return true;
        }
        throw new IllegalArgumentException(
            "Boolean must be 'true' or 'false'; not '" + s + "'");
    }

    /**
     * Parses a float, with explicit handling of "NaN", "Infinity" and
     * "-Infinity". Surrounding whitespace is ignored.
     *
     * @param in decimal text or one of the three special names
     * @return the parsed value
     * @throws NumberFormatException if the text is not a valid float
     */
    public static float parseFloat(String in) {
        String s = in.trim();
        if (s.equals("NaN")) {
            return Float.NaN;
        }
        if (s.equals("Infinity")) {
            return Float.POSITIVE_INFINITY;
        }
        if (s.equals("-Infinity")) {
            return Float.NEGATIVE_INFINITY;
        }
        return Float.parseFloat(s);
    }

    /**
     * Parses a double, with explicit handling of "NaN", "Infinity" and
     * "-Infinity". Surrounding whitespace is ignored, as in
     * {@link #parseFloat(String)}.
     *
     * @param s decimal text or one of the three special names
     * @return the parsed value
     * @throws NumberFormatException if the text is not a valid double
     */
    public static double parseDouble(String s) {
        String t = s.trim();
        if (t.equals("NaN")) {
            return Double.NaN;
        }
        if (t.equals("Infinity")) {
            return Double.POSITIVE_INFINITY;
        }
        if (t.equals("-Infinity")) {
            return Double.NEGATIVE_INFINITY;
        }
        return Double.parseDouble(t);
    }

    /**
     * Encodes a String so that {@link #decodeXML(String)} restores it exactly.
     * <ul>
     * <li>The five XML special characters are written as the predefined
     *     entities (amp, lt, gt, apos, quot).</li>
     * <li>Tab, line feed and carriage return are copied unchanged.</li>
     * <li>Any other character at or below 0x1F, or at or above 0x80, is
     *     written as a backslash, the letter 'u' and exactly four lower-case
     *     hex digits.</li>
     * <li>A backslash is encoded as a digraph together with the character
     *     that follows it: the backslash is doubled when the follower is a
     *     backslash, a 'u', or a character that is itself hex-escaped, and
     *     also when there is no follower at all (so the decoder always finds
     *     a character after a backslash). Otherwise it is written once.</li>
     * </ul>
     *
     * @param in the raw text
     * @return the encoded text
     */
    public static String encodeXML(String in) {
        int len = in.length();
        StringBuffer out = new StringBuffer(len); //capacity: at least this long
        for (int i = 0; i < len; i++) {
            char c = in.charAt(i);
            switch (c) {
                case '&': out.append("&amp;"); break;
                case '<': out.append("&lt;"); break;
                case '>': out.append("&gt;"); break;
                case '\'': out.append("&apos;"); break;
                case '"': out.append("&quot;"); break;
                case '\\':
                    if (i + 1 == len) {
                        //no follower: emit an escaped backslash
                        out.append("\\\\");
                    } else {
                        char next = in.charAt(i + 1);
                        if (next == '\\') {
                            out.append("\\\\\\\\"); //two escaped backslashes
                            i++; //the follower has been encoded too
                        } else if (next == 'u') {
                            out.append("\\\\u"); //escaped backslash, then u
                            i++; //the follower has been encoded too
                        } else if (isOutsidePlainRange(next)) {
                            //The follower is (or may be) emitted with its own
                            //leading backslash on the next iteration; without
                            //doubling here the decoder would pair the two
                            //backslashes and leave the hex digits as text.
                            out.append("\\\\");
                        } else {
                            //the follower is encoded as usual next time round
                            out.append('\\');
                        }
                    }
                    break;
                case '\t': case '\n': case '\r':
                    out.append(c);
                    break;
                default:
                    if (isOutsidePlainRange(c)) {
                        //adding 0x10000 and dropping the leading "1" pads
                        //the hex form to exactly four digits
                        out.append("\\u");
                        out.append(Integer.toHexString(c + 0x10000).substring(1));
                    } else {
                        out.append(c);
                    }
            }
        }
        return out.toString();
    }

    /** True for control characters (at or below 0x1F) and all non-ASCII. */
    private static boolean isOutsidePlainRange(char c) {
        return c <= 0x1F || c >= 0x80;
    }

    /**
     * JSX2 uses SAX, which already does the &amp;-based decoding - dangerous
     * to do it twice! But the backslash encoding still has to be undone, so
     * this variant decodes only the backslash escapes.
     *
     * @param in text whose entities have already been resolved
     * @return the decoded text; if a malformed backslash escape is met, a
     *         message is printed to System.err and the text decoded up to
     *         that point is returned
     */
    public static String decodeXML_JSX2(String in) {
        return decode(in, false);
    }

    /**
     * Reverses {@link #encodeXML(String)}: resolves the five predefined
     * entities and the backslash escapes.
     * To do: this also needs to be applied to char (not just Strings).
     *
     * @param in encoded text
     * @return the decoded text; if a malformed backslash escape is met, a
     *         message is printed to System.err and the text decoded up to
     *         that point is returned
     * @throws IllegalArgumentException if an ampersand does not start one
     *         of the five predefined entities
     */
    public static String decodeXML(String in) {
        return decode(in, true);
    }

    /**
     * Shared decoder behind decodeXML and decodeXML_JSX2.
     *
     * @param in             encoded text
     * @param decodeEntities whether ampersand entities are resolved
     */
    private static String decode(String in, boolean decodeEntities) {
        int len = in.length();
        StringBuffer out = new StringBuffer(len); //capacity: at least this long
        int i = 0;
        while (i < len) {
            char c = in.charAt(i);
            if (decodeEntities && c == '&') {
                if (in.startsWith("&amp;", i)) {
                    out.append('&');
                    i += 5;
                } else if (in.startsWith("&lt;", i)) {
                    out.append('<');
                    i += 4;
                } else if (in.startsWith("&gt;", i)) {
                    out.append('>');
                    i += 4;
                } else if (in.startsWith("&apos;", i)) {
                    out.append('\'');
                    i += 6;
                } else if (in.startsWith("&quot;", i)) {
                    out.append('"');
                    i += 6;
                } else {
                    throw new IllegalArgumentException(
                        "malformed XML: " + in.substring(i));
                }
            } else if (c == '\\') {
                if (i + 1 >= len) {
                    //a lone trailing backslash: the encoder never writes one
                    System.err.println(
                        "Seems to be an incomplete escaped entity or \\unnnn: "
                        + in.substring(i));
                    break;
                }
                char next = in.charAt(i + 1);
                if (next == 'u') {
                    if (i + 6 > len) {
                        System.err.println(
                            "Seems to be an incomplete escaped entity or \\unnnn: "
                            + in.substring(i));
                        break;
                    }
                    try {
                        out.append((char) Integer.parseInt(
                            in.substring(i + 2, i + 6), 16));
                    } catch (NumberFormatException e) {
                        System.err.println(
                            "Seems to be a faulty \\unnnn escaped control char: "
                            + in.substring(i));
                        break;
                    }
                    i += 6;
                } else if (next == '\\') {
                    out.append('\\'); //an escaped backslash
                    i += 2;
                } else {
                    //any other follower: the backslash stands for itself and
                    //the follower is decoded on the next iteration
                    out.append('\\');
                    i += 1;
                }
            } else {
                out.append(c);
                i++;
            }
        }
        return out.toString();
    }

    /** Test class: escapes and un-escapes each argument (default "$a$a$"). */
    static class Dollar {
        public static void main(String a[]) {
            if (a.length == 0) {
                a = new String[] {"$a$a$"};
            }
            for (int i = 0; i < a.length; i++) {
                String t;
                System.err.println(a[i] + " -> "
                    + (t = escapeDollar(a[i])) + " -> "
                    + descapeDollar(t));
            }
        }
    }

    /*
     * Replacement text for "$". Earlier candidates were "-DOLLAR-", "D-" and
     * ".-" (as released in JSX0.8.20.2).
     * PROBLEM: a leading $ (as in "class $SomeClass {}") was mapped to a
     * leading "." (ie <.-SomeClass), which is not well-formed XML. It was
     * changed to "_-" instead: this marks the unusual mapping fairly clearly.
     * It is very important that the "-" is trailing, so it cannot be confused
     * with "ArrayOf-". Consider "-DOLLAR" and "ArrayOf-DOLLAR": a fixed
     * parsing order would disambiguate, but better not to need to do so.
     */
    static final String DOLLAR = "_-";

    /**
     * Converts every "$" to DOLLAR. $ is legal *anywhere* in Java package,
     * class and field names (at the beginning and at the end - not just for
     * inner classes!), but not in XML tags and attributes.
     *
     * The "-" is illegal in Java package, class and field names, so it is
     * safe to use in the replacement.
     *
     * NB: converting "$" to ".." doesn't work: both "a.$a" and "a$.a" map to
     * "a...a", so the decoding is ambiguous.
     *
     * @param in a Java name
     * @return the name with each "$" replaced by DOLLAR
     */
    public static String escapeDollar(String in) {
        StringBuffer out = new StringBuffer(in.length());
        for (int i = 0; i < in.length(); i++) {
            char c = in.charAt(i);
            if (c == '$') {
                out.append(DOLLAR);
            } else {
                out.append(c);
            }
        }
        return out.toString();
    }

    /**
     * Converts every DOLLAR back to "$" (see escapeDollar for a fuller
     * explanation). Scans for DOLLAR from a marker, copies the text before
     * each match followed by "$", then moves the marker past the match.
     *
     * @param in an escaped name
     * @return the name with each DOLLAR replaced by "$"
     */
    public static String descapeDollar(String in) {
        StringBuffer out = new StringBuffer(in.length());
        int marker = 0;
        int found = in.indexOf(DOLLAR, marker);
        while (found != -1) {
            out.append(in.substring(marker, found));
            out.append('$');
            marker = found + DOLLAR.length();
            found = in.indexOf(DOLLAR, marker);
        }
        out.append(in.substring(marker)); //the rest of it (could be 0 length)
        return out.toString();
    }

    /**
     * Test class:
     * java JSX.ParseUtilities$Hex
     */
    static class Hex {
        public static void main(String a[]) {
            if (a.length > 0) {
                byte[] b = decodeHex(a[0]);
                System.err.print(print(b));
                System.err.print(encodeHex(b));
            } else {
                byte[] b = new byte[0x200];
                byte j = 0;
                for (int i = 0; i < b.length; i++) {
                    b[i] = j++; //j overflows silently at 127++
                }
                String t;
                System.err.print(print(b) + " -> ");
                System.err.print((t = encodeHex(b)) + " -> ");
                System.err.print(print(decodeHex(t)));
                System.err.println();
            }
        }
    }

    /**
     * Renders the bytes as signed decimal values, each followed by ", ".
     *
     * @param in bytes to render
     * @return the rendering; the empty String for an empty array
     */
    public static String print(byte[] in) {
        StringBuffer out = new StringBuffer(in.length * 4);
        for (int i = 0; i < in.length; i++) {
            out.append(in[i]); //widened to int: decimal digits, not a char
            out.append(", ");
        }
        return out.toString();
    }

    /**
     * Hex-encodes with pretty printing switched on; see
     * {@link #encodeHex(byte[], boolean)}.
     *
     * @param in bytes to encode
     * @return the formatted hex text
     */
    public static String encodeHex(byte[] in) {
        return encodeHex(in, true);
    }

    /**
     * One byte -> two upper-case hex chars.
     *
     * When formatted, the output starts and ends with a line separator,
     * every byte is followed by a space, every 16th byte by a line separator
     * instead, and every 256th byte by two line separators. Some users may
     * want to switch off the pretty printing for a more compact rendering
     * (it can take up a lot of room in deeply nested binary arrays).
     *
     * @param in        bytes to encode
     * @param formatted whether to insert the separators described above
     * @return the hex text
     */
    public static String encodeHex(byte[] in, boolean formatted) {
        StringBuffer out = new StringBuffer(in.length * 3);
        if (formatted) {
            out.append(EOL_SEP);
        }
        for (int i = 0; i < in.length; i++) {
            out.append(toHexChar((in[i] & 0xF0) >>> 4)); //hi 4 bits, shifted
            out.append(toHexChar(in[i] & 0x0F));         //low 4 bits
            if (formatted) {
                int written = i + 1;
                if (written % 256 == 0) {
                    out.append(EOL_SEP + EOL_SEP);
                } else if (written % 16 == 0) {
                    out.append(EOL_SEP);
                } else {
                    //NOTE(review): there used to be a separate branch for
                    //every 4th byte that also appended one space; possibly a
                    //wider group separator was intended - confirm.
                    out.append(' ');
                }
            }
        }
        if (formatted) {
            out.append(EOL_SEP);
        }
        return out.toString();
    }

    /** Line separator of the running platform, for platform independence. */
    static final String EOL_SEP = System.getProperty("line.separator");

    /**
     * Maps 0..9 to '0'..'9' and 10..15 to 'A'..'F'.
     * Does not check that the value is within range.
     */
    static char toHexChar(int b) {
        if (b >= 10) {
            return (char) (b - 10 + 'A');
        }
        return (char) (b + '0');
    }

    /**
     * Two hex chars -> one byte. Whitespace before a pair of hex characters
     * is skipped, so the formatted output of encodeHex can be read back;
     * whitespace between the two characters of a pair is not allowed.
     *
     * @param in hex text, upper or lower case
     * @return the decoded bytes
     * @throws NumberFormatException if a character is not a hex digit, or if
     *         the text ends after the first character of a pair
     */
    public static byte[] decodeHex(String in) {
        int len = in.length();
        //upper bound only: whitespace makes the true size unknown in advance
        byte[] out = new byte[len / 2];
        int count = 0;
        int i = 0;
        while (i < len) {
            char hi = in.charAt(i++);
            if (Character.isWhitespace(hi)) {
                continue;
            }
            if (i >= len) {
                throw new NumberFormatException(
                    "odd number of hex characters: nothing follows '" + hi + "'");
            }
            char lo = in.charAt(i++);
            out[count++] = (byte) ((fromHexChar(hi) << 4) | fromHexChar(lo));
        }
        if (count < out.length) {
            byte[] trimmed = new byte[count];
            System.arraycopy(out, 0, trimmed, 0, count);
            out = trimmed;
        }
        return out;
    }

    /**
     * Value of one hex digit. Handles both upper and lower case.
     *
     * @param a a character in 0-9, A-F or a-f
     * @return its value, 0..15
     * @throws NumberFormatException for any other character
     */
    static int fromHexChar(char a) {
        if (a >= '0' && a <= '9') {
            return a - '0';
        }
        if (a >= 'A' && a <= 'F') {
            return a - 'A' + 10;
        }
        if (a >= 'a' && a <= 'f') {
            return a - 'a' + 10;
        }
        throw new NumberFormatException("'" + a + "' is not a valid hex character");
    }

}
440
Popular Tags