KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > enhydra > xml > io > MkEncodingsTable


1 /*
2  * Enhydra Java Application Server Project
3  *
4  * The contents of this file are subject to the Enhydra Public License
5  * Version 1.1 (the "License"); you may not use this file except in
6  * compliance with the License. You may obtain a copy of the License on
7  * the Enhydra web site ( http://www.enhydra.org/ ).
8  *
9  * Software distributed under the License is distributed on an "AS IS"
10  * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
11  * the License for the specific terms governing rights and limitations
12  * under the License.
13  *
14  * The Initial Developer of the Enhydra Application Server is Lutris
15  * Technologies, Inc. The Enhydra Application Server and portions created
16  * by Lutris Technologies, Inc. are Copyright Lutris Technologies, Inc.
17  * All Rights Reserved.
18  *
19  * Contributor(s):
20  *
21  * $Id: MkEncodingsTable.java,v 1.2 2005/01/26 08:29:24 jkjome Exp $
22  */

23 package org.enhydra.xml.io;
24
25 import java.io.BufferedReader JavaDoc;
26 import java.io.BufferedWriter JavaDoc;
27 import java.io.FileReader JavaDoc;
28 import java.io.FileWriter JavaDoc;
29 import java.io.IOException JavaDoc;
30 import java.io.PrintWriter JavaDoc;
31 import java.util.ArrayList JavaDoc;
32 import java.util.HashSet JavaDoc;
33
34 // FIXME: the next step is to convert the generated file to XML.
35

36 /**
37  * Generate a file contain character encodings by parsing
38  * the IANA Charset Registry, obtained from:
39  * <br>
40  * <a HREF="ftp://ftp.isi.edu/in-notes/iana/assignments/character-sets">
41  * <tt>ftp://ftp.isi.edu/in-notes/iana/assignments/character-sets</tt></a>
42  * <br>
43  * This is a development-time tool, with special hacks to make up for
44  * various java encoding names not in the table.
45  * The resulting file has the format:
46  * <pre>
47  * name bits mime-name alias1 alias2 ...
48  * </pre>
49  * Where bits is 7, 8 or 16 and mime-name can be "null" if there is none
50  * defined. This file will be converted to XML in a future release.
51  */

52 public final class MkEncodingsTable {
53     /**
54      * Resource name of IANA Charset Registry file.
55      */

56     private final String JavaDoc CHAR_SET_REGISTRY = "character-sets";
57
58     /**
59      * Character set table that is created
60      */

61     private final String JavaDoc CHAR_SET_TABLE = "character-sets.tbl";
62
63     /**
64      * Labels in registry file.
65      */

66     private final String JavaDoc REG_NAME_FIELD = "Name:";
67     private final String JavaDoc REG_ALIAS_FIELD = "Alias:";
68
69     //FIXME: these encodings lists are not complete..
70
/**
71      * 7 bit encoding names.
72      */

73     private static final String JavaDoc[] ENCODINGS_7BIT = {
74         "ANSI_X3.4-1968",
75         "T.61-7bit"
76     };
77
78     /**
79      * 8 bit encoding names.
80      */

81     private static final String JavaDoc[] ENCODINGS_8BIT = {
82         "T.61-8bit",
83         "UNKNOWN-8BIT",
84         "PC8-Danish-Norwegian",
85         "PC8-Turkish",
86         "ISO_8859-1:1987",
87         "ISO_8859-2:1987",
88         "ISO_8859-3:1988",
89         "ISO_8859-4:1988",
90         "ISO_8859-6:1987",
91         "ISO_8859-6-E",
92         "ISO_8859-6-I",
93         "ISO_8859-7:1987",
94         "ISO_8859-8:1988",
95         "ISO_8859-8-E",
96         "ISO_8859-8-I",
97         "ISO_8859-5:1988",
98         "ISO_8859-9:1989",
99         "ISO_8859-supp",
100         "ISO-8859-10",
101         "ISO-8859-15",
102         "ISO-8859-1-Windows-3.0-Latin-1",
103         "ISO-8859-1-Windows-3.1-Latin-1",
104         "ISO-8859-2-Windows-Latin-2",
105         "ISO-8859-9-Windows-Latin-5",
106         "latin-greek",
107         "Latin-greek-1"
108     };
109
110     /**
111      * Tables of known 7 & 8 bit encodings.
112      */

113     private static final HashSet JavaDoc f7BitEncodings = new HashSet JavaDoc();
114     private static final HashSet JavaDoc f8BitEncodings = new HashSet JavaDoc();
115
116     /**
117      * Pattern indicating the preferred MIME name.
118      */

119     private static final String JavaDoc REG_MIME_PREFERRED = "preferred MIME name";
120
121     /**
122      * Table of aliases to add.
123      */

124     private static final String JavaDoc[][] HACKED_ALIASES = {
125         {"UTF-8", "UTF8"},
126         {"ANSI_X3.4-1968", "646"},
127     };
128
129     /**
130      * Table of prefix conversions. An alias is created for names/aliases
131      * that match the first prefix, with the second subsitutued.
132      */

133     private static final String JavaDoc[][] HACKED_PREFIXES = {
134         {"windows-", "Cp"},
135         {"ISO_8859-", "ISO8859-"},
136         {"ISO-8859-", "ISO8859_"}
137     };
138
139     /**
140      * Extra entries to output, with no other hacks available to get them.
141      */

142     private static String JavaDoc[] EXTRA_ENTRIES = {
143         "UnicodeBig 16 null",
144         "UnicodeBigUnmarked 16 null",
145         "UnicodeLittle 16 null",
146         "UnicodeLittleUnmarked 16 null",
147         "UTF-16 16 null UTF16"
148     };
149
150     /**
151      * Class initializer.
152      */

153     static {
154         for (int idx = 0; idx < ENCODINGS_7BIT.length; idx++) {
155             f7BitEncodings.add(ENCODINGS_7BIT[idx]);
156         }
157         for (int idx = 0; idx < ENCODINGS_8BIT.length; idx++) {
158             f8BitEncodings.add(ENCODINGS_8BIT[idx]);
159         }
160     }
161
162     /**
163      * Generate an error about parsing a line in registry.
164      */

165     private void ianaParseError(String JavaDoc msg,
166                                 String JavaDoc line) {
167         throw new XMLIOError(msg + "; parsing line in " + CHAR_SET_REGISTRY
168                              + "\"" + line + "\"");
169     }
170
171
172     /**
173      * Get the encoding size. Returning 7, 8, or 16. This makes a guess
174      * based on some encoded knowledge. If not known, returns 16.
175      */

176     private int getCharSize(String JavaDoc encoding) {
177         if (f7BitEncodings.contains(encoding)) {
178             return 7;
179         } else if (f8BitEncodings.contains(encoding)) {
180             return 8;
181         } else {
182             return 16;
183         }
184     }
185
186     /**
187      * Extract a encoding name out of a Name: or Alias: line. Returns
188      * null if empty.
189      */

190     private String JavaDoc parseName(String JavaDoc line) {
191         int len = line.length();
192
193         // Get next char after index.
194
int startIdx = line.indexOf(':');
195         if (startIdx < 0) {
196             ianaParseError("no `:' found", line);
197         }
198         startIdx++;
199
200         // Skip spaces
201
while ((startIdx < len) && (line.charAt(startIdx) == ' ')) {
202             startIdx++;
203         }
204         
205         // Find end
206
int endIdx = startIdx;
207         while ((endIdx < len) && (line.charAt(endIdx) != ' ')) {
208             endIdx++;
209         }
210         if (endIdx <= startIdx) {
211             return null;
212         } else {
213             return line.substring(startIdx, endIdx).intern();
214         }
215     }
216
217     /**
218      * Determine if a line contains the preferred MIME encoding.
219      */

220     private boolean isMimePreferredEntry(String JavaDoc line) {
221         return (line.indexOf(REG_MIME_PREFERRED) >= 0);
222     }
223
224     /**
225      * Add a alias to the list of aliases, if its not null or not already
226      * there.
227      */

228     private void addAlias(ArrayList JavaDoc aliases,
229                           String JavaDoc alias) {
230         if ((alias != null) && !aliases.contains(alias)) {
231             aliases.add(alias);
232         }
233     }
234
235     /**
236      * Do special hacked mapping of name/aliases to other aliases. This
237      * handles alisas not in registry
238      */

239     private void makeHackedAliases(ArrayList JavaDoc aliases,
240                                    String JavaDoc name) {
241         // Hacks based on alias.
242
for (int idx = 0; idx < HACKED_ALIASES.length; idx++) {
243             String JavaDoc[] mapping = HACKED_ALIASES[idx];
244             if (name.equals(mapping[0])) {
245                 addAlias(aliases, mapping[1]);
246             }
247         }
248
249         // Hacks based on prefix.
250
for (int idx = 0; idx < HACKED_PREFIXES.length; idx++) {
251             String JavaDoc[] mapping = HACKED_PREFIXES[idx];
252             if (name.startsWith(mapping[0])) {
253                 addAlias(aliases,
254                          mapping[1] + name.substring(mapping[0].length()));
255             }
256         }
257     }
258
259     /**
260      * Scan the input stream for the next encoding entry and parse that
261      * entry and write a record.
262      */

263     private boolean parseCharSetEntry(BufferedReader JavaDoc in,
264                                       PrintWriter JavaDoc out) throws IOException JavaDoc {
265         ArrayList JavaDoc aliases = new ArrayList JavaDoc();
266         String JavaDoc mimePreferred = null;
267         String JavaDoc line = null;
268
269         // Scan for next Name: entry
270
while ((line = in.readLine()) != null) {
271             if (line.startsWith(REG_NAME_FIELD)) {
272                 break;
273             }
274         }
275         if (line == null) {
276             return false; // EOF
277
}
278         String JavaDoc name = parseName(line);
279         if (name == null) {
280             ianaParseError("no name parsed", line);
281         }
282         if (isMimePreferredEntry(line)){
283             mimePreferred = name;
284         }
285
286         // Handle stuff missing from registry
287
makeHackedAliases(aliases, name);
288         
289         // Parse Alias: entries, scanning until a blank line or EOF.
290
while (((line = in.readLine()) != null)
291                && (line.trim().length() > 0)) {
292             if (line.startsWith(REG_ALIAS_FIELD)) {
293                 String JavaDoc alias = parseName(line);
294                 if (alias != null) {
295                     addAlias(aliases, alias);
296                     makeHackedAliases(aliases, alias);
297                     if (isMimePreferredEntry(line)){
298                         mimePreferred = alias;
299                     }
300                 }
301             }
302         }
303         
304         // output entry
305
out.print(name);
306         out.print(' ');
307         out.print(getCharSize(name));
308         out.print(' ');
309         out.print(mimePreferred);
310         int len = aliases.size();
311         for (int idx = 0; idx < len; idx++) {
312             out.print(' ');
313             out.print(aliases.get(idx));
314         }
315         out.println();
316         return true;
317     }
318
319     /**
320      * Parse the registry file.
321      */

322     private void parseIanaRegistry(BufferedReader JavaDoc in,
323                                    PrintWriter JavaDoc out) throws IOException JavaDoc {
324         while (parseCharSetEntry(in, out)) {
325             // Looping till eof
326
}
327     }
328
329     /**
330      * Parse the registry file.
331      */

332     private void parseIanaRegistry() throws IOException JavaDoc {
333         BufferedReader JavaDoc in = new BufferedReader JavaDoc(new FileReader JavaDoc(CHAR_SET_REGISTRY));
334         PrintWriter JavaDoc out = new PrintWriter JavaDoc(new BufferedWriter JavaDoc(new FileWriter JavaDoc(CHAR_SET_TABLE)));
335         parseIanaRegistry(in, out);
336
337         for (int i = 0; i < EXTRA_ENTRIES.length; i++) {
338             out.println(EXTRA_ENTRIES[i]);
339         }
340         out.close();
341         in.close();
342     }
343
344     /**
345      * Entry
346      */

347     public static void main(String JavaDoc[] args) throws IOException JavaDoc {
348         new MkEncodingsTable().parseIanaRegistry();
349     }
350 }
351
Popular Tags