EncodingHelper


1   /*
2    * The contents of this file are subject to the terms of the Common Development
3    * and Distribution License (the License). You may not use this file except in
4    * compliance with the License.
5    *
6    * You can obtain a copy of the License at http://www.netbeans.org/cddl.html
7    * or http://www.netbeans.org/cddl.txt.
8    *
9    * When distributing Covered Code, include this CDDL Header Notice in each file
10   * and include the License file at http://www.netbeans.org/cddl.txt.
11   * If applicable, add the following below the CDDL Header, with the fields
12   * enclosed by brackets [] replaced by your own identifying information:
13   * "Portions Copyrighted [year] [name of copyright owner]"
14   *
15   * The Original Software is NetBeans. The Initial Developer of the Original
16   * Software is Sun Microsystems, Inc. Portions Copyright 1997-2006 Sun
17   * Microsystems, Inc. All Rights Reserved.
18   */
19  package org.netbeans.modules.xml.core.lib;
20  
21  import java.io.*;
22  import javax.swing.text.*;
23  
24  /**
25   * XML uses inband encoding detection - this class obtains it.
26   *
27   * @author  Petr Kuzel
28   * @version 1.0
29   */
30  public class EncodingHelper extends Object   {
31  
32      // heuristic constant guessing max prolog length
33      private static final int EXPECTED_PROLOG_LENGTH = 1000;
34  
35      /** Detect input stream encoding.
36      * The stream stays intact.
37      * @return java encoding names ("UTF8", "ASCII", etc.) or null
38      * if the stream is not markable or enoding cannot be detected.
39      */
40      public static String   detectEncoding(InputStream in) throws IOException {
41  
42          if (! in.markSupported()) {
43              if ( Util.THIS.isLoggable() ) /* then */ Util.THIS.debug("EncodingHelper got unmarkable stream: " + in.getClass()); // NOI18N
44              return null;
45          }
46  
47          try {
48              in.mark(EXPECTED_PROLOG_LENGTH);
49  
50              byte[] bytes = new byte[EXPECTED_PROLOG_LENGTH];
51              for (int i = 0; i<bytes.length; i++) {
52                  try {
53                      int datum = in.read();
54                      if (datum == -1) break;
55                      bytes[i] = (byte) datum;
56                  } catch (EOFException ex) {
57                  }
58              }
59  
60              String   enc = autoDetectEncoding(bytes);
61              if (enc == null) return null;
62              
63              enc = detectDeclaredEncoding(bytes, enc);
64              if (enc == null) return null;
65              
66              return Convertors.iana2java (enc);
67          } finally {
68              in.reset();
69          }
70      }
71  
72          
73      /**
74       * @return Java encoding family identifier or <tt>null</tt> for unrecognized
75       */
76      static String   autoDetectEncoding(byte[] buf) throws IOException {
77          
78  
79          if (buf.length >= 4) {
80              switch (buf[0]) {
81                  case 0:  
82                      // byte order mark of (1234-big endian) or (2143) USC-4
83                      // or '<' encoded as UCS-4 (1234, 2143, 3412) or UTF-16BE 
84                      if (buf[1] == (byte)0x3c && buf[2] == (byte)0x00 && buf[3] == (byte)0x3f) {
85                          return "UnicodeBigUnmarked";
86                      }
87                      // else it's probably UCS-4
88                      break;
89  
90                  case 0x3c:
91                      switch (buf[1]) {
92                          // First character is '<'; could be XML without
93                          // an XML directive such as "<hello>", "<!-- ...", // NOI18N
94                          // and so on.
95                          
96                          // 3c 00 3f 00 UTF-16 little endian
97                          case 0x00:
98                              if (buf [2] == (byte)0x3f && buf [3] == (byte)0x00) {
99                                  return  "UnicodeLittleUnmarked";
100                             }                            
101                             break;
102 
103                         // 3c 3f 78 6d == ASCII and supersets '<?xm'
104                         case '?':
105                             if (buf [2] == 'x' && buf [3] == 'm') {
106                                 return  "UTF8"; // NOI18N
107                             }
108                             break;
109                     }
110                     break;
111 
112                 // 4c 6f a7 94 ... some EBCDIC code page
113                 case 0x4c:
114                     if (buf[1] == (byte)0x6f && buf[2] == (byte)0xa7 && buf[3] == (byte)0x94) {
115                         return "Cp037"; // NOI18N
116                     }                     
117                     break;
118 
119                 // UTF-16 big-endian marked
120                 case (byte)0xfe:
121                     if (buf[1] == (byte)0xff && (buf[2] != 0 || buf[3] != 0)) {
122                         return  "UnicodeBig"; // NOI18N
123                     }
124                     break;
125 
126                 // UTF-16 little-endian marked
127                 case (byte)0xff:
128                     if (buf[1] == (byte)0xfe && (buf[2] != 0 || buf[3] != 0)) {                        
129                         return "UnicodeLittle"; // NOI18N
130                     }
131                     break;
132                     
133                 // UTF-8 byte order mark
134                 case (byte)0xef:
135                     if (buf[1] == (byte)0xbb && buf[2] == (byte)0xbf) {
136                         return "UTF8";  //NOI18N
137                     }
138                     break;
139                     
140             }
141         }
142 
143         return null;
144     }
145 
146     /**
147      * Look for encoding='' anyway stop at <tt>?></tt>
148      * @return found encoding or null if none declared
149      */
150     static String   detectDeclaredEncoding(byte[] data, String   baseEncoding) throws IOException {
151 
152         StringBuffer   buf = new StringBuffer  ();
153         Reader r;
154         char delimiter = '"';
155 
156         r = new InputStreamReader(new ByteArrayInputStream(data), baseEncoding);
157         try {
158             for (int c = r.read(); c != -1; c = r.read()) {
159                 buf.append((char)c);
160             }
161         } catch (IOException ex) {
162             // EOF of data out of boundary
163             // dont care try to guess from given data
164         }
165         
166         String   s = buf.toString();
167         
168         int iend = s.indexOf("?>");
169         iend = iend == -1 ? s.length() : iend;
170         
171         int iestart = s.indexOf("encoding");
172         if (iestart == -1 || iestart > iend) return null;
173         
174         char[] chars = s.toCharArray();
175         
176         int i = iestart;
177         
178         for (; i<iend; i++) {
179             if (chars[i] == '=') break;
180         }
181         
182         for (; i<iend; i++) {
183             if (chars[i] == '\'' || chars[i] == '"') {
184                 delimiter = chars[i];
185                 break;
186             }
187                 
188         }
189 
190         i++;
191         
192         int ivalstart = i;
193         for (; i<iend; i++) {
194             if (chars[i] == delimiter) {
195                 return new String  (chars, ivalstart, i - ivalstart);
196             }
197         }
198         
199         return null;
200     }
201     
202     /**
203      * Parse MIME content type for attributes. 
204      */
205     static String   parseMIMECharSet(String   mime) {
206         
207         final String   CHARSET = "charset";
208         
209         if (mime != null) {
210             int i;
211 
212             mime = mime.toLowerCase ();
213             i = mime.indexOf (';');
214             if (i != -1) {
215                 String    attributes;
216 
217                 attributes = mime.substring (i + 1);
218                 mime = mime.substring (0, i);
219 
220                 // use "charset=..." if it's available // NOI18N
221                 i = attributes.indexOf (CHARSET); // NOI18N
222                 if (i != -1) {
223                     attributes = attributes.substring (i + CHARSET.length());
224                     // strip out subsequent attributes
225                     if ((i = attributes.indexOf (';')) != -1)
226                         attributes = attributes.substring (0, i);
227                     // find start of value
228                     if ((i = attributes.indexOf ('=')) != -1) {
229                         attributes = attributes.substring (i + 1);
230                         // strip out rfc822 comments
231                         if ((i = attributes.indexOf ('(')) != -1)
232                             attributes = attributes.substring (0, i);
233                         // double quotes are optional
234                         if ((i = attributes.indexOf ('"')) != -1) {
235                             attributes = attributes.substring (i + 1);
236                             attributes = attributes.substring (0,
237                                                                attributes.indexOf ('"'));
238                         }
239                         return attributes.trim();
240                         // XXX "\;", "\)" etc were mishandled above // NOI18N
241                     }
242                 }
243             }
244         } 
245         
246         return null;        
247     }
248 
249     
250     
251     /** Document itself is encoded as Unicode, but in
252     * the document prolog is an encoding attribute.
253     * @return java encoding names ("UTF8", "ASCII", etc.) or null if no guess
254     */
255     public static String   detectEncoding(Document doc) throws IOException {
256 
257         if (doc == null) return null;
258 
259         try {
260 
261             String   text = doc.getText(0,
262                                       doc.getLength() > EXPECTED_PROLOG_LENGTH ?
263                                       EXPECTED_PROLOG_LENGTH : doc.getLength()
264                                      );
265             InputStream in = new ByteArrayInputStream(text.getBytes());
266             return detectEncoding(in);
267 
268         } catch (BadLocationException ex) {
269             throw new RuntimeException  (ex.toString());
270         }
271 
272     }
273 
274 }
275
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags