ParseUtilities


1   /** (c) Brendan Macmillan 2001, licensed under GNU's GPL 2 
2           (If your project is not GPL, then a fee is due - see license on website)
3       Website: http://www.csse.monash.edu.au/~bren/JSX
4       Website: http://freshmeat.net/projects/jsx
5       List (can read without subscribing, need to join to post):
6         http://groups.yahoo.com/group/JSX-ideas/messages
7           Commercial licensing enquiries only:
8               bren@mail.csse.monash.edu.au */
9   
10  
11  /** ===========================================================================
12      * ParseUtilities
13      * ==============
14      * Parsing helpers for boolean, float and double values; and
15      * encoding/decoding of XML Strings (both parsing and writing code)
16      *
17      * Correctly parses NaN, Infinity and -Infinity of floating point numbers
18      * (float and double).  I wonder why the standard versions do not do this
19      * already?
20      *
21      * Based on a code patch submitted by Mark Collette, 11 Jul 2001
22      **/
23  package JSX;
24  
/**
 * Static helpers used by JSX for converting between Java values and their
 * textual (XML) representation:
 * <ul>
 *   <li>parsing of boolean, float and double (including NaN and the
 *       infinities);</li>
 *   <li>escaping/unescaping of XML text, with a backslash based escape for
 *       control and non-ASCII characters;</li>
 *   <li>mapping of '$' in Java names to something legal in XML names;</li>
 *   <li>hex encoding/decoding of byte arrays.</li>
 * </ul>
 * The nested classes {@code Dollar} and {@code Hex} are small command line
 * test drivers (eg {@code java JSX.ParseUtilities$Hex}).
 */
public class ParseUtilities {

    /** Platform line separator, used by the formatted hex output. */
    static final String EOL_SEP = System.getProperty("line.separator");

    /**
     * Replacement for '$' in XML names.  '-' is illegal in Java package,
     * class and field names, so the token cannot clash with a real name.
     * It must END in '-' so that it cannot be confused with "ArrayOf-".
     * A leading '.' was used in JSX0.8.20.2, but a name starting with '.'
     * is not well-formed XML, hence the leading '_'.
     */
    static final String DOLLAR = "_-";

    // ------------------------------------------------------------------
    // primitives
    // ------------------------------------------------------------------

    /**
     * Parses "true" or "false", ignoring case and surrounding whitespace.
     *
     * @param in the text to parse; must not be null
     * @return the parsed value
     * @throws IllegalArgumentException if the trimmed text is neither
     *         "true" nor "false"
     */
    public static boolean parseBoolean(String in) {
        String s = in.trim();
        if (s.equalsIgnoreCase("false")) {
            return false;
        }
        if (s.equalsIgnoreCase("true")) {
            return true;
        }
        // IllegalArgumentException is the superclass of NumberFormatException
        throw new IllegalArgumentException(
            "Boolean must be 'true' or 'false'; not '" + s + "'");
    }

    /**
     * Parses a float, explicitly accepting "NaN", "Infinity" and
     * "-Infinity".  Surrounding whitespace is ignored.
     *
     * @param in the text to parse; must not be null
     * @return the parsed value
     * @throws NumberFormatException if the text is not a valid float
     */
    public static float parseFloat(String in) {
        String s = in.trim();
        if (s.equals("NaN")) {
            return Float.NaN;
        }
        if (s.equals("Infinity")) {
            return Float.POSITIVE_INFINITY;
        }
        if (s.equals("-Infinity")) {
            return Float.NEGATIVE_INFINITY;
        }
        return Float.parseFloat(s);
    }

    /**
     * Parses a double, explicitly accepting "NaN", "Infinity" and
     * "-Infinity".  Surrounding whitespace is ignored, exactly as in
     * {@link #parseFloat(String)}.
     *
     * @param s the text to parse; must not be null
     * @return the parsed value
     * @throws NumberFormatException if the text is not a valid double
     */
    public static double parseDouble(String s) {
        String t = s.trim();
        if (t.equals("NaN")) {
            return Double.NaN;
        }
        if (t.equals("Infinity")) {
            return Double.POSITIVE_INFINITY;
        }
        if (t.equals("-Infinity")) {
            return Double.NEGATIVE_INFINITY;
        }
        return Double.parseDouble(t);
    }

    // ------------------------------------------------------------------
    // XML text
    // ------------------------------------------------------------------

    /**
     * True for the characters that encodeXML writes as a backslash, a 'u'
     * and four hex digits when they reach its default branch: everything
     * up to 0x1F and everything from 0x80 upwards.  (Tab, newline and
     * carriage return also satisfy this test, but encodeXML copies them
     * through unchanged before the test is consulted.)
     */
    private static boolean isOutsidePrintableAscii(char c) {
        return c <= 0x1F || c >= 0x80;
    }

    /**
     * Encodes a String so that it can be written as XML text or as an
     * attribute value.
     * <ul>
     *   <li>The five XML special characters become entity references.</li>
     *   <li>Tab, newline and carriage return are copied unchanged.</li>
     *   <li>Every other character up to 0x1F or from 0x80 upwards becomes a
     *       backslash, a 'u' and exactly four hex digits.</li>
     *   <li>A backslash is encoded together with the character after it (a
     *       digraph), so that the decoder can tell a literal backslash from
     *       the start of an escape: it is doubled when it is the last
     *       character, or when it is followed by another backslash, by 'u',
     *       or by a character that will itself be written as an escape.
     *       Otherwise it is copied as a single backslash.</li>
     * </ul>
     *
     * @param in the raw text; must not be null
     * @return the encoded text, which decodeXML turns back into {@code in}
     */
    public static String encodeXML(String in) {
        int len = in.length();
        StringBuffer out = new StringBuffer(len);   // will be at least this long
        for (int i = 0; i < len; i++) {
            char c = in.charAt(i);
            switch (c) {
                case '&':  out.append("&amp;");  break;
                case '<':  out.append("&lt;");   break;
                case '>':  out.append("&gt;");   break;
                case '\'': out.append("&apos;"); break;
                case '"':  out.append("&quot;"); break;
                case '\\':
                    if (i + 1 == len) {
                        // nothing follows: double it, so that the decoder
                        // always finds a character after a backslash
                        out.append("\\\\");
                    } else {
                        char next = in.charAt(i + 1);
                        if (next == '\\') {
                            out.append("\\\\\\\\"); // both backslashes, each doubled
                            i++;                    // the next one is encoded too
                        } else if (next == 'u') {
                            out.append("\\\\u");    // doubled backslash, then the u
                            i++;                    // the next one is encoded too
                        } else if (isOutsidePrintableAscii(next)) {
                            // The next char is about to be written as an
                            // escape that starts with a backslash, so this
                            // one must be doubled.  The range has to match
                            // the default branch below; it used to stop at
                            // 0x9F, which corrupted a backslash followed by
                            // any character from 0xA0 upwards.
                            out.append("\\\\");
                        } else {
                            out.append('\\');       // next char is encoded as usual
                        }
                    }
                    break;
                case '\t':
                case '\n':
                case '\r':
                    out.append(c);
                    break;
                default:
                    if (isOutsidePrintableAscii(c)) {
                        // adding 0x10000 and dropping the leading "1" pads
                        // the hex form with zeros to exactly four digits
                        out.append("\\u").append(
                            Integer.toHexString(c + 0x10000).substring(1));
                    } else {
                        out.append(c);
                    }
            }
        }
        return out.toString();
    }

    /**
     * Decodes only the backslash escapes written by encodeXML, leaving
     * entity references untouched.  JSX2 reads through SAX, which has
     * already resolved the entities; resolving them a second time would
     * corrupt the data.
     * <p>
     * A truncated or malformed backslash escape is reported on System.err
     * and the text decoded so far is returned.
     *
     * @param in the encoded text; must not be null
     * @return the decoded text
     */
    public static String decodeXML_JSX2(String in) {
        return decode(in, false);
    }

    /**
     * Reverses encodeXML: resolves the five entity references and the
     * backslash escapes.
     * <p>
     * A truncated or malformed backslash escape is reported on System.err
     * and the text decoded so far is returned.
     *
     * @param in the encoded text; must not be null
     * @return the decoded text
     * @throws IllegalArgumentException if an ampersand does not start one
     *         of the five known entity references
     */
    public static String decodeXML(String in) {
        return decode(in, true);
    }

    /** Shared decoder; entity references are resolved only if asked for. */
    private static String decode(String in, boolean resolveEntities) {
        StringBuffer out = new StringBuffer(in.length());
        int i = 0;  // declared outside the loop: the error reports need it
        try {
            for (i = 0; i < in.length(); i++) {
                char c = in.charAt(i);
                if (resolveEntities && c == '&') {
                    i++;
                    // each branch leaves i on the ';' - the loop's i++
                    // then steps past it
                    if (in.startsWith("amp;", i)) {
                        out.append('&');
                        i += 3;
                    } else if (in.startsWith("lt;", i)) {
                        out.append('<');
                        i += 2;
                    } else if (in.startsWith("gt;", i)) {
                        out.append('>');
                        i += 2;
                    } else if (in.startsWith("apos;", i)) {
                        out.append('\'');
                        i += 4;
                    } else if (in.startsWith("quot;", i)) {
                        out.append('"');
                        i += 4;
                    } else {
                        throw new IllegalArgumentException(
                            "malformed XML: " + in.substring(i - 1));
                    }
                } else if (c == '\\') {
                    i++;
                    char next = in.charAt(i);   // throws if nothing follows
                    if (next == 'u') {
                        i++;
                        out.append(
                            (char) Integer.parseInt(in.substring(i, i + 4), 16));
                        i += 3; // on the last hex digit; loop's i++ steps past
                    } else if (next == '\\') {
                        out.append('\\');       // a doubled backslash is one backslash
                    } else {
                        out.append('\\');       // a lone backslash stands for itself
                        i--;                    // unread: next char is decoded as usual
                    }
                } else {
                    out.append(c);
                }
            }
        } catch (IndexOutOfBoundsException e) {
            // best effort: report, and return what has been decoded so far
            System.err.println( "Seems to be an incomplete escaped entity or \\unnnn: "
                + in.substring(i) );
        } catch (NumberFormatException e) {
            System.err.println( "Seems to be a faulty \\unnnn escaped control char: "
                + in.substring(i) );
        }
        return out.toString();
    }

    // ------------------------------------------------------------------
    // '$' in names
    // ------------------------------------------------------------------

    /** Test driver: escapes and unescapes each argument (default "$a$a$"). */
    static class Dollar {
        public static void main(String a[]) {
            if (a.length == 0) {
                a = new String[] {"$a$a$"};
            }
            for (int i = 0; i < a.length; i++) {
                String t = escapeDollar(a[i]);
                System.err.println(a[i] + " -> " + t + " -> " + descapeDollar(t));
            }
        }
    }

    /**
     * Replaces every '$' by DOLLAR.  '$' is legal anywhere in Java package,
     * class and field names (not only for inner classes), but not in XML
     * tag and attribute names.
     * <p>
     * NB: mapping '$' to ".." would not work: both "a.$a" and "a$.a" would
     * map to "a...a", which cannot be decoded unambiguously.
     *
     * @param in a Java name; must not be null
     * @return the name with every '$' replaced
     */
    public static String escapeDollar(String in) {
        StringBuffer out = new StringBuffer(in.length());
        for (int i = 0; i < in.length(); i++) {
            char c = in.charAt(i);
            if (c == '$') {
                out.append(DOLLAR);
            } else {
                out.append(c);
            }
        }
        return out.toString();
    }

    /**
     * Reverses escapeDollar: replaces every occurrence of DOLLAR by '$'.
     *
     * @param in an escaped name; must not be null
     * @return the name with every DOLLAR replaced by '$'
     */
    public static String descapeDollar(String in) {
        StringBuffer out = new StringBuffer(in.length());
        int marker = 0; // start of the text not yet copied
        int hit;
        while ((hit = in.indexOf(DOLLAR, marker)) != -1) {
            out.append(in.substring(marker, hit)).append('$');
            marker = hit + DOLLAR.length();
        }
        out.append(in.substring(marker));   // the rest (could be 0 length)
        return out.toString();
    }

    // ------------------------------------------------------------------
    // hex
    // ------------------------------------------------------------------

    /**
     * Test driver.  With an argument: decodes it as hex, then prints the
     * bytes and their re-encoding.  Without: round-trips 0x200 bytes.
     * Run as: java JSX.ParseUtilities$Hex
     */
    static class Hex {
        public static void main(String a[]) {
            if (a.length > 0) {
                byte[] b = decodeHex(a[0]);
                System.err.print( print(b) );
                System.err.print( encodeHex(b) );
            } else {
                byte[] b = new byte[0x200];
                byte j = 0;
                for (int i = 0; i < b.length; i++) {
                    b[i] = j++;     // j wraps around silently at 127++
                }
                String t = encodeHex(b);
                System.err.print(print(b) + " -> ");
                System.err.print(t + " -> ");
                System.err.print( print(decodeHex(t)) );
                System.err.println();
            }
        }
    }

    /**
     * Renders the bytes as signed decimals, each one followed by ", ".
     *
     * @param in the bytes to show; must not be null
     * @return eg "1, -1, " for the bytes 0x01 and 0xFF
     */
    public static String print(byte[] in) {
        StringBuffer out = new StringBuffer(in.length * 4);
        for (int i = 0; i < in.length; i++) {
            out.append(in[i]).append(", ");
        }
        return out.toString();
    }

    /**
     * Hex-encodes with pretty printing switched on; see
     * {@link #encodeHex(byte[], boolean)}.
     *
     * @param in the bytes to encode; must not be null
     * @return the formatted hex text
     */
    public static String encodeHex(byte[] in) {
        return encodeHex(in, true);
    }

    /**
     * Hex-encodes a byte array: one byte becomes two upper case hex digits.
     * <p>
     * When {@code formatted} is true the output starts and ends with a
     * line separator, and each byte is followed by a separator chosen by
     * the number of bytes written so far: a blank line after every 256,
     * otherwise a line break after every 16, otherwise two spaces after
     * every 4, otherwise a single space.  When it is false the digits are
     * written with no separators at all.
     *
     * @param in the bytes to encode; must not be null
     * @param formatted whether to insert the whitespace described above
     * @return the hex text
     */
    public static String encodeHex(byte[] in, boolean formatted) {
        StringBuffer out = new StringBuffer(in.length * 3);
        if (formatted) {
            out.append(EOL_SEP);
        }
        for (int i = 0; i < in.length;) {   // NB: i++ is done in the body
            out.append( toHexChar((in[i] & 0xF0) >>> 4) );  // high 4 bits
            out.append( toHexChar(in[i] & 0x0F) );          // low 4 bits
            i++;    // from here on, i is the number of bytes written
            if (formatted) {
                if (i % 256 == 0) {
                    out.append(EOL_SEP + EOL_SEP);
                } else if (i % 16 == 0) {
                    out.append(EOL_SEP);
                } else if (i % 4 == 0) {
                    out.append("  ");
                } else {
                    out.append(' ');
                }
            }
        }
        if (formatted) {
            out.append(EOL_SEP);
        }
        return out.toString();
    }

    /** Maps 0..15 to '0'..'9', 'A'..'F'.  The argument is not range checked. */
    static char toHexChar(int b) {
        if (b >= 10) {
            return (char) (b - 10 + 'A');
        }
        return (char) (b + '0');
    }

    /**
     * Decodes hex text: two hex digits become one byte.  Whitespace before
     * a pair of digits is skipped; the two digits of one pair must be
     * adjacent.
     *
     * @param in the hex text, upper or lower case; must not be null
     * @return the decoded bytes
     * @throws NumberFormatException if a character in digit position is
     *         not one of 0-9, A-F or a-f
     * @throws StringIndexOutOfBoundsException if the text ends after the
     *         first digit of a pair
     */
    public static byte[] decodeHex(String in) {
        // upper bound only: whitespace makes the real result shorter
        byte[] out = new byte[in.length() / 2];
        int count = 0;
        for (int i = 0; i < in.length();) {
            char first = in.charAt(i++);
            if (Character.isWhitespace(first)) {
                continue;
            }
            int hi = fromHexChar(first);
            int lo = fromHexChar(in.charAt(i++));
            out[count++] = (byte) ((hi << 4) | lo);
        }
        if (count < out.length) {
            byte[] trimmed = new byte[count];
            System.arraycopy(out, 0, trimmed, 0, count);
            out = trimmed;
        }
        return out;
    }

    /**
     * Converts one hex digit, upper or lower case, to its value 0..15.
     *
     * @throws NumberFormatException for any other character.  Letters
     *         beyond 'F' and non-ASCII digits used to be accepted here and
     *         were then silently masked to four bits by decodeHex.
     */
    static int fromHexChar(char a) {
        if (a >= '0' && a <= '9') {
            return a - '0';
        }
        if (a >= 'A' && a <= 'F') {
            return a - 'A' + 10;
        }
        if (a >= 'a' && a <= 'f') {
            return a - 'a' + 10;
        }
        throw new NumberFormatException("'" + a + "' is not a valid hex character");
    }

}
440
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Free Books Free Magazines
Popular Tags