KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > sun > corba > se > impl > encoding > CodeSetConversion


1 /*
2  * @(#)CodeSetConversion.java 1.19 04/03/01
3  *
4  * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
5  * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
6  */

7 package com.sun.corba.se.impl.encoding;
8
9 import java.util.Map JavaDoc;
10 import java.util.HashMap JavaDoc;
11 import java.nio.ByteBuffer JavaDoc;
12 import java.nio.CharBuffer JavaDoc;
13 import java.nio.charset.Charset JavaDoc;
14 import java.nio.charset.CharsetEncoder JavaDoc;
15 import java.nio.charset.CharsetDecoder JavaDoc;
16 import java.nio.charset.CharacterCodingException JavaDoc;
17 import java.nio.charset.IllegalCharsetNameException JavaDoc;
18 import java.nio.charset.MalformedInputException JavaDoc;
19 import java.nio.charset.UnsupportedCharsetException JavaDoc;
20 import java.nio.charset.UnmappableCharacterException JavaDoc;
21 import com.sun.corba.se.impl.logging.ORBUtilSystemException;
22 import com.sun.corba.se.impl.logging.OMGSystemException;
23 import com.sun.corba.se.spi.logging.CORBALogDomains;
24
/**
 * Collection of classes, interfaces, and factory methods for
 * CORBA code set conversion.
 *
 * This is mainly used to shield other code from the underlying
 * character converters (java.nio.charset encoders and decoders;
 * older comments call them the sun.io converters), which might
 * change, as well as to provide some basic translation from
 * conversion errors to CORBA exceptions. Some extra work is
 * required here to accommodate the way CORBA says it uses UTF-16
 * as of the 00-11-03 spec.
 *
 * REVISIT - Since nio.Charset and its CharsetEncoder/CharsetDecoder
 * use NIO ByteBuffer and NIO CharBuffer, the interaction
 * and interface between this class and the CDR streams
 * should be looked at more closely for optimizations to
 * avoid unnecessary copying of data between char[] and
 * CharBuffer and between byte[] and ByteBuffer, especially
 * DirectByteBuffers.
 */

44 public class CodeSetConversion
45 {
46     /**
47      * Abstraction for char to byte conversion.
48      *
49      * Must be used in the proper sequence:
50      *
51      * 1) convert
52      * 2) Optional getNumBytes and/or getAlignment (if necessary)
53      * 3) getBytes (see warning)
54      */

55     public abstract static class CTBConverter
56     {
57         // Perform the conversion of the provided char or String,
58
// allowing the caller to query for more information
59
// before writing.
60
public abstract void convert(char chToConvert);
61         public abstract void convert(String JavaDoc strToConvert);
62
63         // How many bytes resulted from the conversion?
64
public abstract int getNumBytes();
65
66         // What's the maximum number of bytes per character?
67
public abstract float getMaxBytesPerChar();
68
69         public abstract boolean isFixedWidthEncoding();
70
71         // What byte boundary should the stream align to before
72
// calling writeBytes? For instance, a fixed width
73
// encoding with 2 bytes per char in a stream which
74
// doesn't encapsulate the char's bytes should align
75
// on a 2 byte boundary. (Ex: UTF16 in GIOP1.1)
76
//
77
// Note: This has no effect on the converted bytes. It
78
// is just information available to the caller.
79
public abstract int getAlignment();
80
81         // Get the resulting bytes. Warning: You must use getNumBytes()
82
// to determine the end of the data in the byte array instead
83
// of array.length! The array may be used internally, so don't
84
// save references.
85
public abstract byte[] getBytes();
86     }
87     
88     /**
89      * Abstraction for byte to char conversion.
90      */

91     public abstract static class BTCConverter
92     {
93         // In GIOP 1.1, interoperability can only be achieved with
94
// fixed width encodings like UTF-16. This is because wstrings
95
// specified how many code points follow rather than specifying
96
// the length in octets.
97
public abstract boolean isFixedWidthEncoding();
98         public abstract int getFixedCharWidth();
99
100         // Called after getChars to determine the true size of the
101
// converted array.
102
public abstract int getNumChars();
103
104         // Perform the conversion using length bytes from the given
105
// input stream. Warning: You must use getNumChars() to
106
// determine the correct length of the resulting array.
107
// The same array may be used internally over multiple
108
// calls.
109
public abstract char[] getChars(byte[] bytes, int offset, int length);
110     }
111
112     /**
113      * Implementation of CTBConverter which uses a nio.Charset.CharsetEncoder
114      * to do the real work. Handles translation of exceptions to the
115      * appropriate CORBA versions.
116      */

117     private class JavaCTBConverter extends CTBConverter
118     {
119     private ORBUtilSystemException wrapper = ORBUtilSystemException.get(
120         CORBALogDomains.RPC_ENCODING ) ;
121
122     private OMGSystemException omgWrapper = OMGSystemException.get(
123         CORBALogDomains.RPC_ENCODING ) ;
124
125         // nio.Charset.CharsetEncoder actually does the work here
126
// have to use it directly rather than through String's interface
127
// because we want to know when errors occur during the conversion.
128
private CharsetEncoder JavaDoc ctb;
129
130         // Proper alignment for this type of converter. For instance,
131
// ASCII has alignment of 1 (1 byte per char) but UTF16 has
132
// alignment of 2 (2 bytes per char)
133
private int alignment;
134
135         // Char buffer to hold the input.
136
private char[] chars = null;
137
138         // How many bytes are generated from the conversion?
139
private int numBytes = 0;
140
141         // How many characters were converted (temporary variable
142
// for cross method communication)
143
private int numChars = 0;
144
145         // ByteBuffer holding the converted input. This is necessary
146
// since we have to do calculations that require the conversion
147
// before writing the array to the stream.
148
private ByteBuffer JavaDoc buffer;
149
150         // What code set are we using?
151
private OSFCodeSetRegistry.Entry codeset;
152
153         public JavaCTBConverter(OSFCodeSetRegistry.Entry codeset,
154                                 int alignmentForEncoding) {
155
156             try {
157                 ctb = cache.getCharToByteConverter(codeset.getName());
158                 if (ctb == null) {
159                     Charset JavaDoc tmpCharset = Charset.forName(codeset.getName());
160                     ctb = tmpCharset.newEncoder();
161                     cache.setConverter(codeset.getName(), ctb);
162                 }
163             } catch(IllegalCharsetNameException JavaDoc icne) {
164
165                 // This can only happen if one of our Entries has
166
// an invalid name.
167
throw wrapper.invalidCtbConverterName(icne,codeset.getName());
168             } catch(UnsupportedCharsetException JavaDoc ucne) {
169
170                 // This can only happen if one of our Entries has
171
// an unsupported name.
172
throw wrapper.invalidCtbConverterName(ucne,codeset.getName());
173             }
174
175             this.codeset = codeset;
176             alignment = alignmentForEncoding;
177         }
178
179         public final float getMaxBytesPerChar() {
180             return ctb.maxBytesPerChar();
181         }
182
183         public void convert(char chToConvert) {
184             if (chars == null)
185                 chars = new char[1];
186             
187             // The CharToByteConverter only takes a char[]
188
chars[0] = chToConvert;
189             numChars = 1;
190
191             convertCharArray();
192         }
193
194         public void convert(String JavaDoc strToConvert) {
195             // Try to save a memory allocation if possible. Usual
196
// space/time trade off. If we could get the char[] out of
197
// the String without copying, that would be great, but
198
// it's forbidden since String is immutable.
199
if (chars == null || chars.length < strToConvert.length())
200                 chars = new char[strToConvert.length()];
201
202             numChars = strToConvert.length();
203             
204             strToConvert.getChars(0, numChars, chars, 0);
205
206             convertCharArray();
207         }
208         
209         public final int getNumBytes() {
210             return numBytes;
211         }
212         
213         public final int getAlignment() {
214             return alignment;
215         }
216
217         public final boolean isFixedWidthEncoding() {
218             return codeset.isFixedWidth();
219         }
220
221         public byte[] getBytes() {
222             // Note that you can't use buffer.length since the buffer might
223
// be larger than the actual number of converted bytes depending
224
// on the encoding.
225
return buffer.array();
226         }
227
228         private void convertCharArray() {
229             try {
230                 
231                 // Possible optimization of directly converting into the CDR buffer.
232
// However, that means the CDR code would have to reserve
233
// a 4 byte string length ahead of time, and we'd need a
234
// confusing partial conversion scheme for when we couldn't
235
// fit everything in the buffer but needed to know the
236
// converted length before proceeding due to fragmentation.
237
// Then there's the issue of the chunking code.
238
//
239
// For right now, this is less messy and basic tests don't
240
// show more than a 1 ms penalty worst case. Less than a
241
// factor of 2 increase.
242

243                 // Convert the characters
244
buffer = ctb.encode(CharBuffer.wrap(chars,0,numChars));
245
246                 // ByteBuffer returned by the encoder will set its limit
247
// to byte immediately after the last written byte.
248
numBytes = buffer.limit();
249
250             } catch (IllegalStateException JavaDoc ise) {
251                 // an encoding operation is already in progress
252
throw wrapper.ctbConverterFailure( ise ) ;
253             } catch (MalformedInputException JavaDoc mie) {
254                 // There were illegal Unicode char pairs
255
throw wrapper.badUnicodePair( mie ) ;
256             } catch (UnmappableCharacterException JavaDoc uce) {
257                 // A character doesn't map to the desired code set
258
// CORBA formal 00-11-03.
259
throw omgWrapper.charNotInCodeset( uce ) ;
260             } catch (CharacterCodingException JavaDoc cce) {
261                 // If this happens, then some other encoding error occured
262
throw wrapper.ctbConverterFailure( cce ) ;
263             }
264         }
265     }
266
267     /**
268      * Special UTF16 converter which can either always write a BOM
269      * or use a specified byte order without one.
270      */

271     private class UTF16CTBConverter extends JavaCTBConverter
272     {
273         // Using this constructor, we will always write a BOM
274
public UTF16CTBConverter() {
275             super(OSFCodeSetRegistry.UTF_16, 2);
276         }
277
278         // Using this constructor, we don't use a BOM and use the
279
// byte order specified
280
public UTF16CTBConverter(boolean littleEndian) {
281             super(littleEndian ?
282                   OSFCodeSetRegistry.UTF_16LE :
283                   OSFCodeSetRegistry.UTF_16BE,
284                   2);
285         }
286     }
287
288     /**
289      * Implementation of BTCConverter which uses a sun.io.ByteToCharConverter
290      * for the real work. Handles translation of exceptions to the
291      * appropriate CORBA versions.
292      */

293     private class JavaBTCConverter extends BTCConverter
294     {
295     private ORBUtilSystemException wrapper = ORBUtilSystemException.get(
296         CORBALogDomains.RPC_ENCODING ) ;
297
298     private OMGSystemException omgWrapper = OMGSystemException.get(
299         CORBALogDomains.RPC_ENCODING ) ;
300
301         protected CharsetDecoder JavaDoc btc;
302         private char[] buffer;
303         private int resultingNumChars;
304         private OSFCodeSetRegistry.Entry codeset;
305
306         public JavaBTCConverter(OSFCodeSetRegistry.Entry codeset) {
307             
308             // Obtain a Decoder
309
btc = this.getConverter(codeset.getName());
310
311             this.codeset = codeset;
312         }
313
314         public final boolean isFixedWidthEncoding() {
315             return codeset.isFixedWidth();
316         }
317
318         // Should only be called if isFixedWidthEncoding is true
319
// IMPORTANT: This calls OSFCodeSetRegistry.Entry, not
320
// CharsetDecoder.maxCharsPerByte().
321
public final int getFixedCharWidth() {
322             return codeset.getMaxBytesPerChar();
323         }
324
325         public final int getNumChars() {
326             return resultingNumChars;
327         }
328
329         public char[] getChars(byte[] bytes, int offset, int numBytes) {
330
331             // Possible optimization of reading directly from the CDR
332
// byte buffer. The sun.io converter supposedly can handle
333
// incremental conversions in which a char is broken across
334
// two convert calls.
335
//
336
// Basic tests didn't show more than a 1 ms increase
337
// worst case. It's less than a factor of 2 increase.
338
// Also makes the interface more difficult.
339

340
341             try {
342
343                 ByteBuffer JavaDoc byteBuf = ByteBuffer.wrap(bytes, offset, numBytes);
344                 CharBuffer JavaDoc charBuf = btc.decode(byteBuf);
345
346                 // CharBuffer returned by the decoder will set its limit
347
// to byte immediately after the last written byte.
348
resultingNumChars = charBuf.limit();
349
350                 // IMPORTANT - It's possible the underlying char[] in the
351
// CharBuffer returned by btc.decode(byteBuf)
352
// is longer in length than the number of characters
353
// decoded. Hence, the check below to ensure the
354
// char[] returned contains all the chars that have
355
// been decoded and no more.
356
if (charBuf.limit() == charBuf.capacity()) {
357                     buffer = charBuf.array();
358                 } else {
359                     buffer = new char[charBuf.limit()];
360                     charBuf.get(buffer, 0, charBuf.limit()).position(0);
361                 }
362
363                 return buffer;
364
365             } catch (IllegalStateException JavaDoc ile) {
366                 // There were a decoding operation already in progress
367
throw wrapper.btcConverterFailure( ile ) ;
368             } catch (MalformedInputException JavaDoc mie) {
369                 // There were illegal Unicode char pairs
370
throw wrapper.badUnicodePair( mie ) ;
371             } catch (UnmappableCharacterException JavaDoc uce) {
372                 // A character doesn't map to the desired code set.
373
// CORBA formal 00-11-03.
374
throw omgWrapper.charNotInCodeset( uce ) ;
375             } catch (CharacterCodingException JavaDoc cce) {
376                 // If this happens, then a character decoding error occured.
377
throw wrapper.btcConverterFailure( cce ) ;
378             }
379         }
380
381         /**
382          * Utility method to find a CharsetDecoder in the
383          * cache or create a new one if necessary. Throws an
384          * INTERNAL if the code set is unknown.
385          */

386         protected CharsetDecoder JavaDoc getConverter(String JavaDoc javaCodeSetName) {
387
388             CharsetDecoder JavaDoc result = null;
389             try {
390                 result = cache.getByteToCharConverter(javaCodeSetName);
391
392                 if (result == null) {
393                     Charset JavaDoc tmpCharset = Charset.forName(javaCodeSetName);
394                     result = tmpCharset.newDecoder();
395                     cache.setConverter(javaCodeSetName, result);
396                 }
397
398             } catch(IllegalCharsetNameException JavaDoc icne) {
399                 // This can only happen if one of our charset entries has
400
// an illegal name.
401
throw wrapper.invalidBtcConverterName( icne, javaCodeSetName ) ;
402             }
403
404             return result;
405         }
406     }
407
408     /**
409      * Special converter for UTF16 since it's required to optionally
410      * support a byte order marker while the internal Java converters
411      * either require it or require that it isn't there.
412      *
413      * The solution is to check for the byte order marker, and if we
414      * need to do something differently, switch internal converters.
415      */

416     private class UTF16BTCConverter extends JavaBTCConverter
417     {
418         private boolean defaultToLittleEndian;
419         private boolean converterUsesBOM = true;
420
421         private static final char UTF16_BE_MARKER = (char) 0xfeff;
422         private static final char UTF16_LE_MARKER = (char) 0xfffe;
423
424         // When there isn't a byte order marker, used the byte
425
// order specified.
426
public UTF16BTCConverter(boolean defaultToLittleEndian) {
427             super(OSFCodeSetRegistry.UTF_16);
428
429             this.defaultToLittleEndian = defaultToLittleEndian;
430         }
431
432         public char[] getChars(byte[] bytes, int offset, int numBytes) {
433
434             if (hasUTF16ByteOrderMarker(bytes, offset, numBytes)) {
435                 if (!converterUsesBOM)
436                     switchToConverter(OSFCodeSetRegistry.UTF_16);
437
438                 converterUsesBOM = true;
439
440                 return super.getChars(bytes, offset, numBytes);
441             } else {
442                 if (converterUsesBOM) {
443                     if (defaultToLittleEndian)
444                         switchToConverter(OSFCodeSetRegistry.UTF_16LE);
445                     else
446                         switchToConverter(OSFCodeSetRegistry.UTF_16BE);
447
448                     converterUsesBOM = false;
449                 }
450
451                 return super.getChars(bytes, offset, numBytes);
452             }
453         }
454
455         /**
456          * Utility method for determining if a UTF-16 byte order marker is present.
457          */

458         private boolean hasUTF16ByteOrderMarker(byte[] array, int offset, int length) {
459             // If there aren't enough bytes to represent the marker and data,
460
// return false.
461
if (length >= 4) {
462
463                 int b1 = array[offset] & 0x00FF;
464                 int b2 = array[offset + 1] & 0x00FF;
465
466                 char marker = (char)((b1 << 8) | (b2 << 0));
467                 
468                 return (marker == UTF16_BE_MARKER || marker == UTF16_LE_MARKER);
469             } else
470                 return false;
471         }
472
473         /**
474          * The current solution for dealing with UTF-16 in CORBA
475          * is that if our sun.io converter requires byte order markers,
476          * and then we see a CORBA wstring/wchar without them, we
477          * switch to the sun.io converter that doesn't require them.
478          */

479         private void switchToConverter(OSFCodeSetRegistry.Entry newCodeSet) {
480
481             // Use the getConverter method from our superclass.
482
btc = super.getConverter(newCodeSet.getName());
483         }
484     }
485
486     /**
487      * CTB converter factory for single byte or variable length encodings.
488      */

489     public CTBConverter getCTBConverter(OSFCodeSetRegistry.Entry codeset) {
490         int alignment = (!codeset.isFixedWidth() ?
491                          1 :
492                          codeset.getMaxBytesPerChar());
493             
494         return new JavaCTBConverter(codeset, alignment);
495     }
496
497     /**
498      * CTB converter factory for multibyte (mainly fixed) encodings.
499      *
500      * Because of the awkwardness with byte order markers and the possibility of
501      * using UCS-2, you must specify both the endianness of the stream as well as
502      * whether or not to use byte order markers if applicable. UCS-2 has no byte
503      * order markers. UTF-16 has optional markers.
504      *
505      * If you select useByteOrderMarkers, there is no guarantee that the encoding
506      * will use the endianness specified.
507      *
508      */

509     public CTBConverter getCTBConverter(OSFCodeSetRegistry.Entry codeset,
510                                         boolean littleEndian,
511                                         boolean useByteOrderMarkers) {
512
513         // UCS2 doesn't have byte order markers, and we're encoding it
514
// as UTF-16 since UCS2 isn't available in all Java platforms.
515
// They should be identical with only minor differences in
516
// negative cases.
517
if (codeset == OSFCodeSetRegistry.UCS_2)
518             return new UTF16CTBConverter(littleEndian);
519
520         // We can write UTF-16 with or without a byte order marker.
521
if (codeset == OSFCodeSetRegistry.UTF_16) {
522             if (useByteOrderMarkers)
523                 return new UTF16CTBConverter();
524             else
525                 return new UTF16CTBConverter(littleEndian);
526         }
527
528         // Everything else uses the generic JavaCTBConverter.
529
//
530
// Variable width encodings are aligned on 1 byte boundaries.
531
// A fixed width encoding with a max. of 4 bytes/char should
532
// align on a 4 byte boundary. Note that UTF-16 is a special
533
// case because of the optional byte order marker, so it's
534
// handled above.
535
//
536
// This doesn't matter for GIOP 1.2 wchars and wstrings
537
// since the encoded bytes are treated as an encapsulation.
538
int alignment = (!codeset.isFixedWidth() ?
539                          1 :
540                          codeset.getMaxBytesPerChar());
541         
542         return new JavaCTBConverter(codeset, alignment);
543     }
544
545     /**
546      * BTCConverter factory for single byte or variable width encodings.
547      */

548     public BTCConverter getBTCConverter(OSFCodeSetRegistry.Entry codeset) {
549         return new JavaBTCConverter(codeset);
550     }
551
552     /**
553      * BTCConverter factory for fixed width multibyte encodings.
554      */

555     public BTCConverter getBTCConverter(OSFCodeSetRegistry.Entry codeset,
556                                         boolean defaultToLittleEndian) {
557
558         if (codeset == OSFCodeSetRegistry.UTF_16 ||
559             codeset == OSFCodeSetRegistry.UCS_2) {
560
561             return new UTF16BTCConverter(defaultToLittleEndian);
562         } else {
563             return new JavaBTCConverter(codeset);
564         }
565     }
566
567     /**
568      * Follows the code set negotiation algorithm in CORBA formal 99-10-07 13.7.2.
569      *
570      * Returns the proper negotiated OSF character encoding number or
571      * CodeSetConversion.FALLBACK_CODESET.
572      */

573     private int selectEncoding(CodeSetComponentInfo.CodeSetComponent client,
574                                CodeSetComponentInfo.CodeSetComponent server) {
575
576         // A "null" value for the server's nativeCodeSet means that
577
// the server desired not to indicate one. We'll take that
578
// to mean that it wants the first thing in its conversion list.
579
// If it's conversion list is empty, too, then use the fallback
580
// codeset.
581
int serverNative = server.nativeCodeSet;
582
583         if (serverNative == 0) {
584             if (server.conversionCodeSets.length > 0)
585                 serverNative = server.conversionCodeSets[0];
586             else
587                 return CodeSetConversion.FALLBACK_CODESET;
588         }
589
590         if (client.nativeCodeSet == serverNative) {
591             // Best case -- client and server don't have to convert
592
return serverNative;
593         }
594
595         // Is this client capable of converting to the server's
596
// native code set?
597
for (int i = 0; i < client.conversionCodeSets.length; i++) {
598             if (serverNative == client.conversionCodeSets[i]) {
599                 // The client will convert to the server's
600
// native code set.
601
return serverNative;
602             }
603         }
604
605         // Is the server capable of converting to the client's
606
// native code set?
607
for (int i = 0; i < server.conversionCodeSets.length; i++) {
608             if (client.nativeCodeSet == server.conversionCodeSets[i]) {
609                 // The server will convert to the client's
610
// native code set.
611
return client.nativeCodeSet;
612             }
613         }
614
615         // See if there are any code sets that both the server and client
616
// support (giving preference to the server). The order
617
// of conversion sets is from most to least desired.
618
for (int i = 0; i < server.conversionCodeSets.length; i++) {
619             for (int y = 0; y < client.conversionCodeSets.length; y++) {
620                 if (server.conversionCodeSets[i] == client.conversionCodeSets[y]) {
621                     return server.conversionCodeSets[i];
622                 }
623             }
624         }
625
626         // Before using the fallback codesets, the spec calls for a
627
// compatibility check on the native code sets. It doesn't make
628
// sense because loss free communication is always possible with
629
// UTF8 and UTF16, the fall back code sets. It's also a lot
630
// of work to implement. In the case of incompatibility, the
631
// spec says to throw a CODESET_INCOMPATIBLE exception.
632

633         // Use the fallback
634
return CodeSetConversion.FALLBACK_CODESET;
635     }
636
637     /**
638      * Perform the code set negotiation algorithm and come up with
639      * the two encodings to use.
640      */

641     public CodeSetComponentInfo.CodeSetContext negotiate(CodeSetComponentInfo client,
642                                                          CodeSetComponentInfo server) {
643         int charData
644             = selectEncoding(client.getCharComponent(),
645                              server.getCharComponent());
646
647         if (charData == CodeSetConversion.FALLBACK_CODESET) {
648             charData = OSFCodeSetRegistry.UTF_8.getNumber();
649         }
650
651         int wcharData
652             = selectEncoding(client.getWCharComponent(),
653                              server.getWCharComponent());
654
655         if (wcharData == CodeSetConversion.FALLBACK_CODESET) {
656             wcharData = OSFCodeSetRegistry.UTF_16.getNumber();
657         }
658
659         return new CodeSetComponentInfo.CodeSetContext(charData,
660                                                        wcharData);
661     }
662
// No one should instantiate a CodeSetConversion but the singleton
// instance method
private CodeSetConversion() {}

// initialize-on-demand holder
private static class CodeSetConversionHolder {
    static final CodeSetConversion csc = new CodeSetConversion() ;
}

/**
 * CodeSetConversion is a singleton, and this is the access point.
 */
public final static CodeSetConversion impl() {
    return CodeSetConversionHolder.csc ;
}
678
679     // Singleton instance
680
private static CodeSetConversion implementation;
681
682     // Number used internally to indicate the fallback code
683
// set.
684
private static final int FALLBACK_CODESET = 0;
685
686     // Provides a thread local cache for the sun.io
687
// converters.
688
private CodeSetCache cache = new CodeSetCache();
689 }
690
Popular Tags