KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > w3c > tidy > EncodingUtils


1 /*
2  * Java HTML Tidy - JTidy
3  * HTML parser and pretty printer
4  *
5  * Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
6  * Institute of Technology, Institut National de Recherche en
7  * Informatique et en Automatique, Keio University). All Rights
8  * Reserved.
9  *
10  * Contributing Author(s):
11  *
12  * Dave Raggett <dsr@w3.org>
13  * Andy Quick <ac.quick@sympatico.ca> (translation to Java)
14  * Gary L Peskin <garyp@firstech.com> (Java development)
15  * Sami Lempinen <sami@lempinen.net> (release management)
16  * Fabrizio Giustina <fgiust at users.sourceforge.net>
17  *
18  * The contributing author(s) would like to thank all those who
19  * helped with testing, bug fixes, and patience. This wouldn't
20  * have been possible without all of you.
21  *
22  * COPYRIGHT NOTICE:
23  *
24  * This software and documentation is provided "as is," and
25  * the copyright holders and contributing author(s) make no
26  * representations or warranties, express or implied, including
27  * but not limited to, warranties of merchantability or fitness
28  * for any particular purpose or that the use of the software or
29  * documentation will not infringe any third party patents,
30  * copyrights, trademarks or other rights.
31  *
32  * The copyright holders and contributing author(s) will not be
33  * liable for any direct, indirect, special or consequential damages
34  * arising out of any use of the software or documentation, even if
35  * advised of the possibility of such damage.
36  *
37  * Permission is hereby granted to use, copy, modify, and distribute
38  * this source code, or portions hereof, documentation and executables,
39  * for any purpose, without fee, subject to the following restrictions:
40  *
41  * 1. The origin of this source code must not be misrepresented.
42  * 2. Altered versions must be plainly marked as such and must
43  * not be misrepresented as being the original source.
44  * 3. This Copyright notice may not be removed or altered from any
45  * source or altered source distribution.
46  *
47  * The copyright holders and contributing author(s) specifically
48  * permit, without fee, and encourage the use of this source code
49  * as a component for supporting the Hypertext Markup Language in
50  * commercial products. If you use this source code in a product,
51  * acknowledgment is not required but would be appreciated.
52  *
53  */

54 package org.w3c.tidy;
55
56 /**
57  * @author Fabrizio Giustina
58  * @version $Revision: 1.7 $ ($Author: fgiust $)
59  */

60 public final class EncodingUtils
61 {
62
63     /**
64      * the big-endian (default) UNICODE BOM.
65      */

66     public static final int UNICODE_BOM_BE = 0xFEFF;
67
68     /**
69      * the default (big-endian) UNICODE BOM.
70      */

71     public static final int UNICODE_BOM = UNICODE_BOM_BE;
72
73     /**
74      * the little-endian UNICODE BOM.
75      */

76     public static final int UNICODE_BOM_LE = 0xFFFE;
77
78     /**
79      * the UTF-8 UNICODE BOM.
80      */

81     public static final int UNICODE_BOM_UTF8 = 0xEFBBBF;
82
83     /**
84      * states for ISO 2022 A document in ISO-2022 based encoding uses some ESC sequences called "designator" to switch
85      * character sets. The designators defined and used in ISO-2022-JP are: "ESC" + "(" + ? for ISO646 variants "ESC" +
86      * "$" + ? and "ESC" + "$" + "(" + ? for multibyte character sets. State ASCII.
87      */

88     public static final int FSM_ASCII = 0;
89
90     /**
91      * state ESC.
92      */

93     public static final int FSM_ESC = 1;
94
95     /**
96      * state ESCD.
97      */

98     public static final int FSM_ESCD = 2;
99
100     /**
101      * state ESCDP.
102      */

103     public static final int FSM_ESCDP = 3;
104
105     /**
106      * state ESCP.
107      */

108     public static final int FSM_ESCP = 4;
109
110     /**
111      * state NONASCII.
112      */

113     public static final int FSM_NONASCII = 5;
114
115     /**
116      * Max UTF-88 valid char value.
117      */

118     public static final int MAX_UTF8_FROM_UCS4 = 0x10FFFF;
119
120     /**
121      * Max UTF-16 value.
122      */

123     public static final int MAX_UTF16_FROM_UCS4 = 0x10FFFF;
124
125     /**
126      * utf16 low surrogate.
127      */

128     public static final int LOW_UTF16_SURROGATE = 0xD800;
129
130     /**
131      * UTF-16 surrogates begin.
132      */

133     public static final int UTF16_SURROGATES_BEGIN = 0x10000;
134
135     /**
136      * UTF-16 surrogate pair areas: low surrogates begin.
137      */

138     public static final int UTF16_LOW_SURROGATE_BEGIN = 0xD800;
139
140     /**
141      * UTF-16 surrogate pair areas: low surrogates end.
142      */

143     public static final int UTF16_LOW_SURROGATE_END = 0xDBFF;
144
145     /**
146      * UTF-16 surrogate pair areas: high surrogates begin.
147      */

148     public static final int UTF16_HIGH_SURROGATE_BEGIN = 0xDC00;
149
150     /**
151      * UTF-16 surrogate pair areas: high surrogates end.
152      */

153     public static final int UTF16_HIGH_SURROGATE_END = 0xDFFF;
154
155     /**
156      * UTF-16 high surrogate.
157      */

158     public static final int HIGH_UTF16_SURROGATE = 0xDFFF;
159
160     /**
161      * UTF-8 bye swap: invalid char.
162      */

163     private static final int UTF8_BYTE_SWAP_NOT_A_CHAR = 0xFFFE;
164
165     /**
166      * UTF-8 invalid char.
167      */

168     private static final int UTF8_NOT_A_CHAR = 0xFFFF;
169
170     /**
171      * Mapping for Windows Western character set (128-159) to Unicode.
172      */

173     private static final int[] WIN2UNICODE = {
174         0x20AC,
175         0x0000,
176         0x201A,
177         0x0192,
178         0x201E,
179         0x2026,
180         0x2020,
181         0x2021,
182         0x02C6,
183         0x2030,
184         0x0160,
185         0x2039,
186         0x0152,
187         0x0000,
188         0x017D,
189         0x0000,
190         0x0000,
191         0x2018,
192         0x2019,
193         0x201C,
194         0x201D,
195         0x2022,
196         0x2013,
197         0x2014,
198         0x02DC,
199         0x2122,
200         0x0161,
201         0x203A,
202         0x0153,
203         0x0000,
204         0x017E,
205         0x0178};
206
207     /**
208      * John Love-Jensen contributed this table for mapping MacRoman character set to Unicode.
209      */

210     private static final int[] MAC2UNICODE = { // modified to only need chars 128-255/U+0080-U+00FF Terry T 19 Aug 01
211
// x7F = DEL
212
0x00C4,
213         0x00C5,
214         0x00C7,
215         0x00C9,
216         0x00D1,
217         0x00D6,
218         0x00DC,
219         0x00E1,
220         0x00E0,
221         0x00E2,
222         0x00E4,
223         0x00E3,
224         0x00E5,
225         0x00E7,
226         0x00E9,
227         0x00E8,
228         0x00EA,
229         0x00EB,
230         0x00ED,
231         0x00EC,
232         0x00EE,
233         0x00EF,
234         0x00F1,
235         0x00F3,
236         0x00F2,
237         0x00F4,
238         0x00F6,
239         0x00F5,
240         0x00FA,
241         0x00F9,
242         0x00FB,
243         0x00FC,
244         0x2020,
245         0x00B0,
246         0x00A2,
247         0x00A3,
248         0x00A7,
249         0x2022,
250         0x00B6,
251         0x00DF,
252         0x00AE,
253         0x00A9,
254         0x2122,
255         0x00B4,
256         0x00A8,
257         0x2260,
258         0x00C6,
259         0x00D8,
260         0x221E,
261         0x00B1,
262         0x2264,
263         0x2265,
264         0x00A5,
265         0x00B5,
266         0x2202,
267         0x2211,
268         // =BD U+2126 OHM SIGN
269
0x220F,
270         0x03C0,
271         0x222B,
272         0x00AA,
273         0x00BA,
274         0x03A9,
275         0x00E6,
276         0x00F8,
277         0x00BF,
278         0x00A1,
279         0x00AC,
280         0x221A,
281         0x0192,
282         0x2248,
283         0x2206,
284         0x00AB,
285         0x00BB,
286         0x2026,
287         0x00A0,
288         0x00C0,
289         0x00C3,
290         0x00D5,
291         0x0152,
292         0x0153,
293         0x2013,
294         0x2014,
295         0x201C,
296         0x201D,
297         0x2018,
298         0x2019,
299         0x00F7,
300         0x25CA,
301         // =DB U+00A4 CURRENCY SIGN
302
0x00FF,
303         0x0178,
304         0x2044,
305         0x20AC,
306         0x2039,
307         0x203A,
308         0xFB01,
309         0xFB02,
310         0x2021,
311         0x00B7,
312         0x201A,
313         0x201E,
314         0x2030,
315         0x00C2,
316         0x00CA,
317         0x00C1,
318         0x00CB,
319         0x00C8,
320         0x00CD,
321         0x00CE,
322         0x00CF,
323         0x00CC,
324         0x00D3,
325         0x00D4,
326         // xF0 = Apple Logo
327
// =F0 U+2665 BLACK HEART SUIT
328
0xF8FF,
329         0x00D2,
330         0x00DA,
331         0x00DB,
332         0x00D9,
333         0x0131,
334         0x02C6,
335         0x02DC,
336         0x00AF,
337         0x02D8,
338         0x02D9,
339         0x02DA,
340         0x00B8,
341         0x02DD,
342         0x02DB,
343         0x02C7};
344
345     /**
346      * table to map symbol font characters to Unicode; undefined characters are mapped to 0x0000 and characters without
347      * any unicode equivalent are mapped to '?'. Is this appropriate?
348      */

349     private static final int[] SYMBOL2UNICODE = {
350         0x0000,
351         0x0001,
352         0x0002,
353         0x0003,
354         0x0004,
355         0x0005,
356         0x0006,
357         0x0007,
358         0x0008,
359         0x0009,
360         0x000A,
361         0x000B,
362         0x000C,
363         0x000D,
364         0x000E,
365         0x000F,
366
367         0x0010,
368         0x0011,
369         0x0012,
370         0x0013,
371         0x0014,
372         0x0015,
373         0x0016,
374         0x0017,
375         0x0018,
376         0x0019,
377         0x001A,
378         0x001B,
379         0x001C,
380         0x001D,
381         0x001E,
382         0x001F,
383
384         0x0020,
385         0x0021,
386         0x2200,
387         0x0023,
388         0x2203,
389         0x0025,
390         0x0026,
391         0x220D,
392         0x0028,
393         0x0029,
394         0x2217,
395         0x002B,
396         0x002C,
397         0x2212,
398         0x002E,
399         0x002F,
400
401         0x0030,
402         0x0031,
403         0x0032,
404         0x0033,
405         0x0034,
406         0x0035,
407         0x0036,
408         0x0037,
409         0x0038,
410         0x0039,
411         0x003A,
412         0x003B,
413         0x003C,
414         0x003D,
415         0x003E,
416         0x003F,
417
418         0x2245,
419         0x0391,
420         0x0392,
421         0x03A7,
422         0x0394,
423         0x0395,
424         0x03A6,
425         0x0393,
426         0x0397,
427         0x0399,
428         0x03D1,
429         0x039A,
430         0x039B,
431         0x039C,
432         0x039D,
433         0x039F,
434
435         0x03A0,
436         0x0398,
437         0x03A1,
438         0x03A3,
439         0x03A4,
440         0x03A5,
441         0x03C2,
442         0x03A9,
443         0x039E,
444         0x03A8,
445         0x0396,
446         0x005B,
447         0x2234,
448         0x005D,
449         0x22A5,
450         0x005F,
451
452         0x00AF,
453         0x03B1,
454         0x03B2,
455         0x03C7,
456         0x03B4,
457         0x03B5,
458         0x03C6,
459         0x03B3,
460         0x03B7,
461         0x03B9,
462         0x03D5,
463         0x03BA,
464         0x03BB,
465         0x03BC,
466         0x03BD,
467         0x03BF,
468
469         0x03C0,
470         0x03B8,
471         0x03C1,
472         0x03C3,
473         0x03C4,
474         0x03C5,
475         0x03D6,
476         0x03C9,
477         0x03BE,
478         0x03C8,
479         0x03B6,
480         0x007B,
481         0x007C,
482         0x007D,
483         0x223C,
484         0x003F,
485
486         0x0000,
487         0x0000,
488         0x0000,
489         0x0000,
490         0x0000,
491         0x0000,
492         0x0000,
493         0x0000,
494         0x0000,
495         0x0000,
496         0x0000,
497         0x0000,
498         0x0000,
499         0x0000,
500         0x0000,
501         0x0000,
502
503         0x0000,
504         0x0000,
505         0x0000,
506         0x0000,
507         0x0000,
508         0x0000,
509         0x0000,
510         0x0000,
511         0x0000,
512         0x0000,
513         0x0000,
514         0x0000,
515         0x0000,
516         0x0000,
517         0x0000,
518         0x0000,
519
520         0x00A0,
521         0x03D2,
522         0x2032,
523         0x2264,
524         0x2044,
525         0x221E,
526         0x0192,
527         0x2663,
528         0x2666,
529         0x2665,
530         0x2660,
531         0x2194,
532         0x2190,
533         0x2191,
534         0x2192,
535         0x2193,
536
537         0x00B0,
538         0x00B1,
539         0x2033,
540         0x2265,
541         0x00D7,
542         0x221D,
543         0x2202,
544         0x00B7,
545         0x00F7,
546         0x2260,
547         0x2261,
548         0x2248,
549         0x2026,
550         0x003F,
551         0x003F,
552         0x21B5,
553
554         0x2135,
555         0x2111,
556         0x211C,
557         0x2118,
558         0x2297,
559         0x2295,
560         0x2205,
561         0x2229,
562         0x222A,
563         0x2283,
564         0x2287,
565         0x2284,
566         0x2282,
567         0x2286,
568         0x2208,
569         0x2209,
570
571         0x2220,
572         0x2207,
573         0x00AE,
574         0x00A9,
575         0x2122,
576         0x220F,
577         0x221A,
578         0x22C5,
579         0x00AC,
580         0x2227,
581         0x2228,
582         0x21D4,
583         0x21D0,
584         0x21D1,
585         0x21D2,
586         0x21D3,
587
588         0x25CA,
589         0x2329,
590         0x00AE,
591         0x00A9,
592         0x2122,
593         0x2211,
594         0x003F,
595         0x003F,
596         0x003F,
597         0x003F,
598         0x003F,
599         0x003F,
600         0x003F,
601         0x003F,
602         0x003F,
603         0x003F,
604
605         0x20AC,
606         0x232A,
607         0x222B,
608         0x2320,
609         0x003F,
610         0x2321,
611         0x003F,
612         0x003F,
613         0x003F,
614         0x003F,
615         0x003F,
616         0x003F,
617         0x003F,
618         0x003F,
619         0x003F,
620         0x003F};
621
622     /**
623      * Array of valid UTF8 sequences.
624      */

625     private static final ValidUTF8Sequence[] VALID_UTF8 = {
626         new ValidUTF8Sequence(0x0000, 0x007F, 1, new char[]{0x00, 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}),
627         new ValidUTF8Sequence(0x0080, 0x07FF, 2, new char[]{0xC2, 0xDF, 0x80, 0xBF, 0x00, 0x00, 0x00, 0x00}),
628         new ValidUTF8Sequence(0x0800, 0x0FFF, 3, new char[]{0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF, 0x00, 0x00}),
629         new ValidUTF8Sequence(0x1000, 0xFFFF, 3, new char[]{0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF, 0x00, 0x00}),
630         new ValidUTF8Sequence(0x10000, 0x3FFFF, 4, new char[]{0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}),
631         new ValidUTF8Sequence(0x40000, 0xFFFFF, 4, new char[]{0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}),
632         new ValidUTF8Sequence(0x100000, 0x10FFFF, 4, new char[]{0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF})};
633
634     /**
635      * number of valid utf8 sequances.
636      */

637     private static final int NUM_UTF8_SEQUENCES = VALID_UTF8.length;
638
639     /**
640      * Offset for utf8 sequences.
641      */

642     private static final int[] OFFSET_UTF8_SEQUENCES = {0, // 1 byte
643
1, // 2 bytes
644
2, // 3 bytes
645
4, // 4 bytes
646
NUM_UTF8_SEQUENCES}; // must be last
647

648     /**
649      * don't instantiate.
650      */

651     private EncodingUtils()
652     {
653         // unused
654
}
655
656     /**
657      * Function for conversion from Windows-1252 to Unicode.
658      * @param c char to decode
659      * @return decoded char
660      */

661     protected static int decodeWin1252(int c)
662     {
663         return WIN2UNICODE[c - 128];
664     }
665
666     /**
667      * Function to convert from MacRoman to Unicode.
668      * @param c char to decode
669      * @return decoded char
670      */

671     protected static int decodeMacRoman(int c)
672     {
673         if (127 < c)
674         {
675             c = MAC2UNICODE[c - 128];
676         }
677         return c;
678     }
679
680     /**
681      * Function to convert from Symbol Font chars to Unicode.
682      * @param c char to decode
683      * @return decoded char
684      */

685     static int decodeSymbolFont(int c)
686     {
687         if (c > 255)
688         {
689             return c;
690         }
691
692         return SYMBOL2UNICODE[c];
693     }
694
695     /**
696      * Decodes an array of bytes to a char.
697      * @param c will contain the decoded char
698      * @param firstByte first input byte
699      * @param successorBytes array containing successor bytes (can be null if a getter is provided).
700      * @param getter callback used to get new bytes if successorBytes doesn't contain enough bytes
701      * @param count will contain the number of bytes read
702      * @param startInSuccessorBytesArray starting offset for bytes in successorBytes
703      * @return <code>true</code> if error
704      */

705     static boolean decodeUTF8BytesToChar(int[] c, int firstByte, byte[] successorBytes, GetBytes getter, int[] count,
706         int startInSuccessorBytesArray)
707     {
708         byte[] buf = new byte[10];
709
710         int ch = 0;
711         int n = 0;
712         int i, bytes = 0;
713         boolean hasError = false;
714
715         if (successorBytes.length != 0)
716         {
717             buf = successorBytes;
718         }
719
720         // special check if we have been passed an EOF char
721
if (firstByte == StreamIn.END_OF_STREAM) //uint
722
{
723             // at present
724
c[0] = firstByte;
725             count[0] = 1;
726             return false;
727         }
728
729         ch = TidyUtils.toUnsigned(firstByte); // first byte is passed in separately
730

731         if (ch <= 0x7F) // 0XXX XXXX one byte
732
{
733             n = ch;
734             bytes = 1;
735         }
736         else if ((ch & 0xE0) == 0xC0) /* 110X XXXX two bytes */
737         {
738             n = ch & 31;
739             bytes = 2;
740         }
741         else if ((ch & 0xF0) == 0xE0) /* 1110 XXXX three bytes */
742         {
743             n = ch & 15;
744             bytes = 3;
745         }
746         else if ((ch & 0xF8) == 0xF0) /* 1111 0XXX four bytes */
747         {
748             n = ch & 7;
749             bytes = 4;
750         }
751         else if ((ch & 0xFC) == 0xF8) /* 1111 10XX five bytes */
752         {
753             n = ch & 3;
754             bytes = 5;
755             hasError = true;
756         }
757         else if ((ch & 0xFE) == 0xFC) /* 1111 110X six bytes */
758         {
759             n = ch & 1;
760             bytes = 6;
761             hasError = true;
762         }
763         else
764         {
765             // not a valid first byte of a UTF-8 sequence
766
n = ch;
767             bytes = 1;
768             hasError = true;
769         }
770
771         for (i = 1; i < bytes; ++i)
772         {
773             int[] tempCount = new int[1]; // no. of additional bytes to get
774

775             // successor bytes should have the form 10XX XXXX
776
if (getter != null && (bytes - i > 0))
777             {
778                 tempCount[0] = 1; // to simplify things, get 1 byte at a time
779
int[] buftocopy = new int[]{buf[startInSuccessorBytesArray + i - 1]};
780
781                 getter.doGet(buftocopy, tempCount, false);
782                 //readRawBytesFromStream(buftocopy, tempCount, false);
783
if (tempCount[0] <= 0) // EOF
784
{
785                     hasError = true;
786                     bytes = i;
787                     break;
788                 }
789             }
790
791             if ((buf[startInSuccessorBytesArray + i - 1] & 0xC0) != 0x80)
792             {
793                 // illegal successor byte value
794
hasError = true;
795                 bytes = i;
796                 if (getter != null)
797                 {
798                     int[] buftocopy = new int[]{buf[startInSuccessorBytesArray + i - 1]};
799                     tempCount[0] = 1; // to simplify things, unget 1 byte at a time
800
getter.doGet(buftocopy, tempCount, true);
801                 }
802                 break;
803             }
804
805             n = (n << 6) | (buf[startInSuccessorBytesArray + i - 1] & 0x3F);
806         }
807
808         if (!hasError && ((n == UTF8_BYTE_SWAP_NOT_A_CHAR) || (n == UTF8_NOT_A_CHAR)))
809         {
810             hasError = true;
811         }
812
813         if (!hasError && (n > MAX_UTF8_FROM_UCS4))
814         {
815             hasError = true;
816         }
817
818         if (!hasError && (n >= UTF16_LOW_SURROGATE_BEGIN) && (n <= UTF16_HIGH_SURROGATE_END))
819         {
820             // unpaired surrogates not allowed
821
hasError = true;
822         }
823
824         if (!hasError)
825         {
826             int lo = OFFSET_UTF8_SEQUENCES[bytes - 1];
827             int hi = OFFSET_UTF8_SEQUENCES[bytes] - 1;
828
829             // check for overlong sequences
830
if ((n < VALID_UTF8[lo].lowChar) || (n > VALID_UTF8[hi].highChar))
831             {
832                 hasError = true;
833             }
834             else
835             {
836                 hasError = true; // assume error until proven otherwise
837

838                 for (i = lo; i <= hi; i++)
839                 {
840                     int tempCount;
841                     char theByte; //unsigned
842

843                     for (tempCount = 0; tempCount < bytes; tempCount++)
844                     {
845                         if (!TidyUtils.toBoolean(tempCount))
846                         {
847                             theByte = (char) firstByte;
848                         }
849                         else
850                         {
851                             theByte = (char) buf[startInSuccessorBytesArray + tempCount - 1];
852                         }
853                         if ((theByte >= VALID_UTF8[i].validBytes[(tempCount * 2)])
854                             && (theByte <= VALID_UTF8[i].validBytes[(tempCount * 2) + 1]))
855                         {
856                             hasError = false;
857                         }
858                         if (hasError)
859                         {
860                             break;
861                         }
862                     }
863                 }
864             }
865         }
866
867         count[0] = bytes;
868
869         c[0] = n;
870
871         // n = 0xFFFD;
872
// replacement char - do this in the caller
873
return hasError;
874
875     }
876
877     /**
878      * Encode a char to an array of bytes.
879      * @param c char to encode
880      * @param encodebuf will contain the decoded bytes
881      * @param putter if not null it will be called to write bytes to out
882      * @param count number of bytes written
883      * @return <code>false</code>= ok, <code>true</code>= error
884      */

885     static boolean encodeCharToUTF8Bytes(int c, byte[] encodebuf, PutBytes putter, int[] count)
886     {
887         int bytes = 0;
888
889         byte[] buf = new byte[10];
890
891         if (encodebuf != null)
892         {
893             buf = encodebuf;
894         }
895
896         boolean hasError = false;
897
898         if (c <= 0x7F) // 0XXX XXXX one byte
899
{
900             buf[0] = (byte) c;
901             bytes = 1;
902         }
903         else if (c <= 0x7FF) // 110X XXXX two bytes
904
{
905             buf[0] = (byte) (0xC0 | (c >> 6));
906             buf[1] = (byte) (0x80 | (c & 0x3F));
907             bytes = 2;
908         }
909         else if (c <= 0xFFFF) // 1110 XXXX three bytes
910
{
911             buf[0] = (byte) (0xE0 | (c >> 12));
912             buf[1] = (byte) (0x80 | ((c >> 6) & 0x3F));
913             buf[2] = (byte) (0x80 | (c & 0x3F));
914             bytes = 3;
915             if ((c == UTF8_BYTE_SWAP_NOT_A_CHAR) || (c == UTF8_NOT_A_CHAR))
916             {
917                 hasError = true;
918             }
919             else if ((c >= UTF16_LOW_SURROGATE_BEGIN) && (c <= UTF16_HIGH_SURROGATE_END))
920             {
921                 // unpaired surrogates not allowed
922
hasError = true;
923             }
924         }
925         else if (c <= 0x1FFFFF) // 1111 0XXX four bytes
926
{
927             buf[0] = (byte) (0xF0 | (c >> 18));
928             buf[1] = (byte) (0x80 | ((c >> 12) & 0x3F));
929             buf[2] = (byte) (0x80 | ((c >> 6) & 0x3F));
930             buf[3] = (byte) (0x80 | (c & 0x3F));
931             bytes = 4;
932             if (c > MAX_UTF8_FROM_UCS4)
933             {
934                 hasError = true;
935             }
936         }
937         else if (c <= 0x3FFFFFF) // 1111 10XX five bytes
938
{
939             buf[0] = (byte) (0xF8 | (c >> 24));
940             buf[1] = (byte) (0x80 | (c >> 18));
941             buf[2] = (byte) (0x80 | ((c >> 12) & 0x3F));
942             buf[3] = (byte) (0x80 | ((c >> 6) & 0x3F));
943             buf[4] = (byte) (0x80 | (c & 0x3F));
944             bytes = 5;
945             hasError = true;
946         }
947         else if (c <= 0x7FFFFFFF) // 1111 110X six bytes
948
{
949             buf[0] = (byte) (0xFC | (c >> 30));
950             buf[1] = (byte) (0x80 | ((c >> 24) & 0x3F));
951             buf[2] = (byte) (0x80 | ((c >> 18) & 0x3F));
952             buf[3] = (byte) (0x80 | ((c >> 12) & 0x3F));
953             buf[4] = (byte) (0x80 | ((c >> 6) & 0x3F));
954             buf[5] = (byte) (0x80 | (c & 0x3F));
955             bytes = 6;
956             hasError = true;
957         }
958         else
959         {
960             hasError = true;
961         }
962
963         if (!hasError && putter != null) // don't output invalid UTF-8 byte sequence to a stream
964
{
965             int[] tempCount = new int[]{bytes};
966             putter.doPut(buf, tempCount);
967
968             if (tempCount[0] < bytes)
969             {
970                 hasError = true;
971             }
972         }
973
974         count[0] = bytes;
975         return hasError;
976     }
977
/**
 * Getter callback: called to retrieve 1 or more additional UTF-8 bytes. The Getter callback can also unget if
 * necessary to re-synchronize the input stream.
 */
interface GetBytes
{

    /**
     * Get (or unget) one or more bytes.
     * @param buf when getting, will contain the bytes; when ungetting, holds the bytes to give back
     * @param count on entry the number of bytes asked for (the decoder in this class always asks for 1); on
     * return the number of bytes actually stored in "buf". &lt;= 0 if error or EOF
     * @param unget <code>true</code> to unget the bytes in "buf" instead of getting new ones
     */
    void doGet(int[] buf, int[] count, boolean unget);
}
993
/**
 * Putter callback: called to store 1 or more additional UTF-8 bytes.
 */
interface PutBytes
{

    /**
     * Store one or more bytes.
     * @param buf contains the bytes to store.
     * @param count on entry the number of bytes in "buf" to store; on return the number of bytes actually
     * stored. The encoder in this class treats a smaller number than it asked for as an error
     */
    void doPut(byte[] buf, int[] count);
}
1007}
1008
Popular Tags