KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > mmbase > util > transformers > CP1252Surrogator


/*

This software is OSI Certified Open Source Software.
OSI Certified is a certification mark of the Open Source Initiative.

The license (Mozilla version 1.0) can be read at the MMBase site.
See http://www.MMBase.org/license

*/

10 package org.mmbase.util.transformers;
11
12 import java.io.Reader JavaDoc;
13 import java.io.Writer JavaDoc;
14 import java.util.*;
15
16 import org.mmbase.util.logging.*;
17
18 /**
19  * Surrogates the Windows CP1252 characters which are not valid ISO-8859-1. It can also repair
20  * wrongly encoded Strings (byte arrays which were actually CP1252, but were considered ISO-8859-1
21  * when they were made to a Java String).
22  *
23  * @author Michiel Meeuwissen
24  * @since MMBase-1.7.2
25  * @version $Id: CP1252Surrogator.java,v 1.5 2005/02/02 10:12:37 michiel Exp $
26  */

27
28 public class CP1252Surrogator extends ConfigurableReaderTransformer implements CharTransformer {
29     private static final Logger log = Logging.getLoggerInstance(CP1252Surrogator.class);
30
31
32     public static final int WELL_ENCODED = 0;
33     public static final int WRONG_ENCODED = 1;
34
35
36     public CP1252Surrogator() {
37         this(WELL_ENCODED);
38     }
39     public CP1252Surrogator(int conf) {
40         super(conf);
41     }
42
43
44     public Writer JavaDoc transform(Reader JavaDoc r, Writer JavaDoc w) {
45         try {
46             while (true) {
47                 int c = r.read();
48                 if (c == -1) break;
49                 int cp;
50                 if (to == WELL_ENCODED) { // CP1252 chars appear all over the place in the unicode set, this makes a nice an clear int of it, with the ISO-8859-1 values (0-255)
51
cp = ("" + (char) c).getBytes("CP1252")[0] & 0xff; // should this really be done by a String?
52
} else {
53                     cp = c;
54                     
55                 }
56                 switch (cp) {
57                 case 128: w.write("EURO"); break; // EURO SIGN
58
case 129: w.write('?'); break; //
59
case 130: w.write(','); break; // SINGLE LOW-9 QUOTATION MARK
60
case 131: w.write('f'); break; // LATIN SMALL LETTER F WITH HOOK
61
case 132: w.write(",,"); break; // DOUBLE LOW-9 QUOTATION MARK
62
case 133: w.write("..."); break; // HORIZONTAL ELLIPSIS
63
case 134: w.write('+'); break; // DAGGER
64
case 135: w.write("++"); break; // DOUBLE DAGGER
65
case 136: w.write('^'); break; // MODIFIER LETTER CIRCUMFLEX ACCENT
66
case 137: w.write("0/00"); break; // PER MILLE SIGN
67
case 138: w.write('S'); break; // LATIN CAPITAL LETTER S WITH CARON
68
case 139: w.write('<'); break; // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
69
case 140: w.write("OE"); break; // LATIN CAPITAL LIGATURE OE
70
case 141: w.write('?'); break; //
71
case 142: w.write('Z'); break; // LATIN CAPITAL LETTER Z WITH CARON
72
case 143: w.write('?'); break; //
73
case 144: w.write('?'); break; //
74
case 145: w.write('\''); break; // LEFT SINGLE QUOTATION MARK
75
case 146: w.write('\''); break; // RIGHT SINGLE QUOTATION MARK
76
case 147: w.write('\"'); break; // LEFT DOUBLE QUOTATION MARK
77
case 148: w.write('\"'); break; // RIGHT DOUBLE QUOTATION MARK
78
case 149: w.write('-'); break; // BULLET
79
case 150: w.write('-'); break; // EN DASH
80
case 151: w.write('-'); break; // EM DASH
81
case 152: w.write('~'); break; // SMALL TILDE
82
case 153: w.write("(TM)"); break; // TRADE MARK SIGN
83
case 154: w.write('s'); break; // LATIN SMALL LETTER S WITH CARON
84
case 155: w.write('>'); break; // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
85
case 156: w.write("oe"); break; // LATIN SMALL LIGATURE OE
86
case 157: w.write('?'); break; //
87
case 158: w.write('z'); break; // LATIN SMALL LETTER Z WITH CARON
88
case 159: w.write('Y'); break; // LATIN CAPITAL LETTER Y WITH DIAERESIS
89
default: w.write(c);
90                 }
91             }
92         } catch (Exception JavaDoc e) {
93             log.error(e.toString());
94         }
95         return w;
96     }
97
98
99     public Map transformers() {
100         Map h = new HashMap();
101         h.put("CP1252_SURROGATOR", new Config(CP1252Surrogator.class, WELL_ENCODED, "Takes the java String, and surrogates the 32 characters of it which are in CP1252 but not in ISO-8859-1"));
102         h.put("CP1252_WRONG_SURROGATOR", new Config(CP1252Surrogator.class, WRONG_ENCODED, "Also surrogates the characters specific to CP1252, but supposed the String originally wrong encoded (it was suppoed to be ISO-8859-1, but actually was CP1252)"));
103         return h;
104     }
105
106
107     public String JavaDoc getEncoding() {
108         switch (to) {
109         case WELL_ENCODED:
110             return "CP1252_SURROGATOR";
111         case WRONG_ENCODED:
112             return "CP1252_WRONG_SURROGATOR";
113         default :
114             throw new UnknownCodingException(getClass(), to);
115         }
116     }
117
118
119     public static byte[] getTestBytes() {
120         byte[] testBytes = new byte[32];
121         for (int i = 0; i < 32; i++) {
122             testBytes[i] = (byte) (-128 + i);
123         }
124         return testBytes;
125     }
126
127     public static String JavaDoc getTestString() {
128         try {
129             return new String JavaDoc(getTestBytes(), "CP1252");
130         } catch (Exception JavaDoc e) {
131             return e.toString();
132         }
133     }
134
135     /**
136      * For testing only.
137      *
138      * Use on a UTF-8 terminal:
139      * java -Dfile.encoding=UTF-8 org.mmbase.util.transformers.CP1252Surrogator
140      * Or, on a ISO-8859-1 terminal: (you will see question marks, for the CP1252 chars)
141      * java -Dfile.encoding=ISO-8859-1 org.mmbase.util.transformers.CP1252Surrogator
142      * Or, if - may God forbid - you have a CP1252 terminal:
143      * java -Dfile.encoding=CP1252 org.mmbase.util.transformers.CP1252Surrogator
144      *
145      * This last thing you may simulate with something like this:
146      * java -Dfile.encoding=CP1252 org.mmbase.util.transformers.CP1252Surrogator | konwert cp1252-utf8
147      *
148      */

149     public static void main(String JavaDoc[] args) {
150
151         // construct a String with all specific CP1252 charachters.
152
String JavaDoc testStringCP1252 = "bla bla " + getTestString();
153         String JavaDoc testStringISO1 = "";
154         try {
155             testStringISO1 = "bla bla " + new String JavaDoc(getTestBytes(), "ISO-8859-1"); /// it's a lie, but try it anyway.
156
} catch (Exception JavaDoc e) {
157             log.error("", e);
158         }
159
160         CharTransformer transOk = new CP1252Surrogator();
161         CharTransformer transNok = new CP1252Surrogator(WRONG_ENCODED);
162         CharTransformer unicode = new UnicodeEscaper();
163         
164         System.out.println("Test-string (CP1252): " + testStringCP1252);
165         // System.out.println("Test-string (ISO-1) : " + testStringISO1); _DOES NOT MAKE SENSE_.
166

167         System.out.println("Java-escaped (CP1252): " + unicode.transform(testStringCP1252));
168         System.out.println("Java-escaped (ISO-1) : " + unicode.transform(testStringISO1));
169         System.out.println("Surrogated test-string (CP1252): " + transOk.transform(testStringCP1252));
170         System.out.println("Surrogated test-string (ISO-1) : " + transNok.transform(testStringISO1)); // fixe the non-sensical string.
171

172          
173     }
174     
175         
176 }
177
Popular Tags