MimeUtility


1   /*
2    * The contents of this file are subject to the terms 
3    * of the Common Development and Distribution License 
4    * (the "License").  You may not use this file except 
5    * in compliance with the License.
6    * 
7    * You can obtain a copy of the license at 
8    * glassfish/bootstrap/legal/CDDLv1.0.txt or 
9    * https://glassfish.dev.java.net/public/CDDLv1.0.html. 
10   * See the License for the specific language governing 
11   * permissions and limitations under the License.
12   * 
13   * When distributing Covered Code, include this CDDL 
14   * HEADER in each file and include the License file at 
15   * glassfish/bootstrap/legal/CDDLv1.0.txt.  If applicable, 
16   * add the following below this CDDL HEADER, with the 
17   * fields enclosed by brackets "[]" replaced with your 
18   * own identifying information: Portions Copyright [yyyy] 
19   * [name of copyright owner]
20   */
21  
22  /*
23   * @(#)MimeUtility.java 1.54 05/08/29
24   *
25   * Copyright 1997-2005 Sun Microsystems, Inc. All Rights Reserved.
26   */
27  
28  package javax.mail.internet;
29  
30  import javax.mail.MessagingException  ;
31  import javax.activation.*;
32  import java.util.*;
33  import java.io.*;
34  import com.sun.mail.util.*;
35  
36  /**
37   * This is a utility class that provides various MIME related
38   * functionality. <p>
39   *
40   * There is a set of methods to encode and decode MIME headers as 
41   * per RFC 2047. A brief description on handling such headers is
42   * given below: <p>
43   *
44   * RFC 822 mail headers <strong>must</strong> contain only US-ASCII
45   * characters. Headers that contain non US-ASCII characters must be
46   * encoded so that they contain only US-ASCII characters. Basically,
47   * this process involves using either BASE64 or QP to encode certain
48   * characters. RFC 2047 describes this in detail. <p>
49   *
50   * In Java, Strings contain (16 bit) Unicode characters. ASCII is a
51   * subset of Unicode (and occupies the range 0 - 127). A String
52   * that contains only ASCII characters is already mail-safe. If the
53   * String contains non US-ASCII characters, it must be encoded. An
54   * additional complexity in this step is that since Unicode is not
55   * yet a widely used charset, one might want to first charset-encode
56   * the String into another charset and then do the transfer-encoding.
57   * <p>
58   * Note that to get the actual bytes of a mail-safe String (say,
59   * for sending over SMTP), one must do 
60   * <p><blockquote><pre>
61   *
62   *  byte[] bytes = string.getBytes("iso-8859-1");   
63   *
64   * </pre></blockquote><p>
65   * 
66   * The <code>setHeader</code> and <code>addHeader</code> methods
67   * on MimeMessage and MimeBodyPart assume that the given header values
68   * are Unicode strings that contain only US-ASCII characters. Hence
69   * the callers of those methods must ensure that the values they pass
70   * do not contain non US-ASCII characters. The methods in this class 
71   * help do this. <p>
72   *
73   * The <code>getHeader</code> family of methods on MimeMessage and
74   * MimeBodyPart return the raw header value. These might be encoded
75   * as per RFC 2047, and if so, must be decoded into Unicode Strings.
76   * The methods in this class help to do this. <p>
77   *
78   * Several System properties control strict conformance to the MIME
79   * spec.  Note that these are not session properties but must be set
80   * globally as System properties. <p>
81   *
82   * The <code>mail.mime.decodetext.strict</code> property controls
83   * decoding of MIME encoded words.  The MIME spec requires that encoded
84   * words start at the beginning of a whitespace separated word.  Some
85   * mailers incorrectly include encoded words in the middle of a word.
86   * If the <code>mail.mime.decodetext.strict</code> System property is
87   * set to <code>"false"</code>, an attempt will be made to decode these
88   * illegal encoded words. The default is true. <p>
89   *
90   * The <code>mail.mime.encodeeol.strict</code> property controls the
91   * choice of Content-Transfer-Encoding for MIME parts that are not of
92   * type "text".  Often such parts will contain textual data for which
93   * an encoding that allows normal end of line conventions is appropriate.
94   * In rare cases, such a part will appear to contain entirely textual
95   * data, but will require an encoding that preserves CR and LF characters
96   * without change.  If the <code>mail.mime.encodeeol.strict</code>
97   * System property is set to <code>"true"</code>, such an encoding will
98   * be used when necessary.  The default is false. <p>
99   *
100  * In addition, the <code>mail.mime.charset</code> System property can
101  * be used to specify the default MIME charset to use for encoded words
102  * and text parts that don't otherwise specify a charset.  Normally, the
103  * default MIME charset is derived from the default Java charset, as
104  * specified in the <code>file.encoding</code> System property.  Most
105  * applications will have no need to explicitly set the default MIME
106  * charset.  In cases where the default MIME charset to be used for
107  * mail messages is different than the charset used for files stored on
108  * the system, this property should be set.
109  *
110  * @version 1.54, 05/08/29
111  * @author  John Mani
112  * @author  Bill Shannon
113  */
114 
115 public class MimeUtility {
116 
117     // This class cannot be instantiated
118     private MimeUtility() { }
119 
120     public static final int ALL = -1;
121 
122     private static boolean decodeStrict = true;
123     private static boolean encodeEolStrict = false;
124     private static boolean foldEncodedWords = false;
125     private static boolean foldText = true;
126 
127     static {
128     try {
129         String   s = System.getProperty("mail.mime.decodetext.strict");
130         // default to true
131         decodeStrict = s == null || !s.equalsIgnoreCase("false");
132         s = System.getProperty("mail.mime.encodeeol.strict");
133         // default to false
134         encodeEolStrict = s != null && s.equalsIgnoreCase("true");
135         s = System.getProperty("mail.mime.foldencodedwords");
136         // default to false
137         foldEncodedWords = s != null && s.equalsIgnoreCase("true");
138         s = System.getProperty("mail.mime.foldtext");
139         // default to true
140         foldText = s == null || !s.equalsIgnoreCase("false");
141     } catch (SecurityException   sex) {
142         // ignore it
143     }
144     }
145         
146 
147     /**
148      * Get the content-transfer-encoding that should be applied
149      * to the input stream of this datasource, to make it mailsafe. <p>
150      *
151      * The algorithm used here is: <br>
152      * <ul>
153      * <li>
154      * If the primary type of this datasource is "text" and if all
155      * the bytes in its input stream are US-ASCII, then the encoding
156      * is "7bit". If more than half of the bytes are non-US-ASCII, then
157      * the encoding is "base64". If less than half of the bytes are
158      * non-US-ASCII, then the encoding is "quoted-printable".
159      * <li>
160      * If the primary type of this datasource is not "text", then if
161      * all the bytes of its input stream are US-ASCII, the encoding
162      * is "7bit". If there is even one non-US-ASCII character, the
163      * encoding is "base64".
164      * </ul>
165      *
166      * @param   ds  DataSource
167      * @return      the encoding. This is either "7bit",
168      *          "quoted-printable" or "base64"
169      */ 
170     public static String   getEncoding(DataSource ds) {
171     ContentType   cType = null;
172     InputStream is = null;
173     String   encoding = null;
174 
175     try {
176         cType = new ContentType  (ds.getContentType());
177         is = ds.getInputStream();
178     } catch (Exception   ex) {
179         return "base64"; // what else ?!
180     }
181 
182     boolean isText = cType.match("text/*");
183     // if not text, stop processing when we see non-ASCII
184     int i = checkAscii(is, ALL, !isText);
185     switch (i) {
186     case ALL_ASCII:
187         encoding = "7bit"; // all ascii
188         break;
189     case MOSTLY_ASCII:
190         encoding = "quoted-printable"; // mostly ascii
191         break;
192     default:
193         encoding = "base64"; // mostly binary
194         break;
195     }
196 
197     // Close the input stream
198     try {
199         is.close();
200     } catch (IOException ioex) { }
201 
202     return encoding;
203     }
204 
205     /**
206      * Same as <code>getEncoding(DataSource)</code> except that instead
207      * of reading the data from an <code>InputStream</code> it uses the
208      * <code>writeTo</code> method to examine the data.  This is more
209      * efficient in the common case of a <code>DataHandler</code>
210      * created with an object and a MIME type (for example, a
211      * "text/plain" String) because all the I/O is done in this
212      * thread.  In the case requiring an <code>InputStream</code> the
213      * <code>DataHandler</code> uses a thread, a pair of pipe streams,
214      * and the <code>writeTo</code> method to produce the data. <p>
215      *
216      * @since   JavaMail 1.2
217      */
218     public static String   getEncoding(DataHandler dh) {
219     ContentType   cType = null;
220     String   encoding = null;
221 
222     /*
223      * Try to pick the most efficient means of determining the
224      * encoding.  If this DataHandler was created using a DataSource,
225      * the getEncoding(DataSource) method is typically faster.  If
226      * the DataHandler was created with an object, this method is
227      * much faster.  To distinguish the two cases, we use a heuristic.
228      * A DataHandler created with an object will always have a null name.
229      * A DataHandler created with a DataSource will usually have a
230      * non-null name.
231      *
232      * XXX - This is actually quite a disgusting hack, but it makes
233      *   a common case run over twice as fast.
234      */
235     if (dh.getName() != null)
236         return getEncoding(dh.getDataSource());
237 
238     try {
239         cType = new ContentType  (dh.getContentType());
240     } catch (Exception   ex) {
241         return "base64"; // what else ?!
242     }
243 
244     if (cType.match("text/*")) {
245         // Check all of the available bytes
246         AsciiOutputStream aos = new AsciiOutputStream(false, false);
247         try {
248         dh.writeTo(aos);
249         } catch (IOException ex) { }    // ignore it
250         switch (aos.getAscii()) {
251         case ALL_ASCII:
252         encoding = "7bit"; // all ascii
253         break;
254         case MOSTLY_ASCII:
255         encoding = "quoted-printable"; // mostly ascii
256         break;
257         default:
258         encoding = "base64"; // mostly binary
259         break;
260         }
261     } else { // not "text"
262         // Check all of available bytes, break out if we find
263         // at least one non-US-ASCII character
264         AsciiOutputStream aos =
265             new AsciiOutputStream(true, encodeEolStrict);
266         try {
267         dh.writeTo(aos);
268         } catch (IOException ex) { }    // ignore it
269         if (aos.getAscii() == ALL_ASCII) // all ascii
270         encoding = "7bit";
271         else // found atleast one non-ascii character, use b64 
272         encoding = "base64";
273     }
274 
275     return encoding;
276     }
277 
278     /**
279      * Decode the given input stream. The Input stream returned is
280      * the decoded input stream. All the encodings defined in RFC 2045
281      * are supported here. They include "base64", "quoted-printable",
282      * "7bit", "8bit", and "binary". In addition, "uuencode" is also
283      * supported.
284      *
285      * @param   is      input stream
286      * @param   encoding    the encoding of the stream.
287      * @return          decoded input stream.
288      */
289     public static InputStream decode(InputStream is, String   encoding)
290         throws MessagingException   {
291     if (encoding.equalsIgnoreCase("base64"))
292         return new BASE64DecoderStream(is);
293     else if (encoding.equalsIgnoreCase("quoted-printable"))
294         return new QPDecoderStream(is);
295     else if (encoding.equalsIgnoreCase("uuencode") ||
296          encoding.equalsIgnoreCase("x-uuencode") ||
297          encoding.equalsIgnoreCase("x-uue"))
298         return new UUDecoderStream(is);
299     else if (encoding.equalsIgnoreCase("binary") ||
300          encoding.equalsIgnoreCase("7bit") ||
301          encoding.equalsIgnoreCase("8bit"))
302         return is;
303     else
304         throw new MessagingException  ("Unknown encoding: " + encoding);
305     }
306 
307     /**
308      * Wrap an encoder around the given output stream. 
309      * All the encodings defined in RFC 2045 are supported here. 
310      * They include "base64", "quoted-printable", "7bit", "8bit" and
311      * "binary". In addition, "uuencode" is also supported.
312      *
313      * @param   os      output stream
314      * @param   encoding    the encoding of the stream. 
315      * @return          output stream that applies the
316      *              specified encoding.
317      */
318     public static OutputStream encode(OutputStream os, String   encoding)
319         throws MessagingException   {
320         if (encoding == null)
321         return os;
322     else if (encoding.equalsIgnoreCase("base64"))
323         return new BASE64EncoderStream(os);
324     else if (encoding.equalsIgnoreCase("quoted-printable"))
325         return new QPEncoderStream(os);
326     else if (encoding.equalsIgnoreCase("uuencode") ||
327          encoding.equalsIgnoreCase("x-uuencode") ||
328          encoding.equalsIgnoreCase("x-uue"))
329         return new UUEncoderStream(os);
330     else if (encoding.equalsIgnoreCase("binary") ||
331          encoding.equalsIgnoreCase("7bit") ||
332          encoding.equalsIgnoreCase("8bit"))
333         return os;
334     else
335         throw new MessagingException  ("Unknown encoding: " +encoding);
336     }
337 
338     /**
339      * Wrap an encoder around the given output stream.
340      * All the encodings defined in RFC 2045 are supported here.
341      * They include "base64", "quoted-printable", "7bit", "8bit" and
342      * "binary". In addition, "uuencode" is also supported.
343      * The <code>filename</code> parameter is used with the "uuencode"
344      * encoding and is included in the encoded output.
345      *
346      * @param   os              output stream
347      * @param   encoding        the encoding of the stream.
348      * @param   filename        name for the file being encoded (only used
349      *                          with uuencode)
350      * @return                  output stream that applies the
351      *                          specified encoding.
352      * @since                   JavaMail 1.2
353      */
354     public static OutputStream encode(OutputStream os, String   encoding,
355                                       String   filename)
356                 throws MessagingException   {
357         if (encoding == null)
358             return os;
359         else if (encoding.equalsIgnoreCase("base64"))
360             return new BASE64EncoderStream(os);
361         else if (encoding.equalsIgnoreCase("quoted-printable"))
362             return new QPEncoderStream(os);
363         else if (encoding.equalsIgnoreCase("uuencode") ||
364                  encoding.equalsIgnoreCase("x-uuencode") ||
365                  encoding.equalsIgnoreCase("x-uue"))
366             return new UUEncoderStream(os, filename);
367         else if (encoding.equalsIgnoreCase("binary") ||
368                  encoding.equalsIgnoreCase("7bit") ||
369                  encoding.equalsIgnoreCase("8bit"))
370             return os;
371         else
372             throw new MessagingException  ("Unknown encoding: " +encoding);
373     }
374 
375     /**
376      * Encode a RFC 822 "text" token into mail-safe form as per
377      * RFC 2047. <p>
378      *
379      * The given Unicode string is examined for non US-ASCII
380      * characters. If the string contains only US-ASCII characters,
381      * it is returned as-is.  If the string contains non US-ASCII
382      * characters, it is first character-encoded using the platform's
383      * default charset, then transfer-encoded using either the B or 
384      * Q encoding. The resulting bytes are then returned as a Unicode 
385      * string containing only ASCII  characters. <p>
386      *
387      * Note that this method should be used to encode only 
388      * "unstructured" RFC 822 headers. <p>
389      *
390      * Example of usage:
391      * <p><blockquote><pre>
392      *
393      *  MimePart part = ...
394      *  String rawvalue = "FooBar Mailer, Japanese version 1.1"
395      *  try {
396      *    // If we know for sure that rawvalue contains only US-ASCII 
397      *    // characters, we can skip the encoding part
398      *    part.setHeader("X-mailer", MimeUtility.encodeText(rawvalue));
399      *  } catch (UnsupportedEncodingException e) {
400      *    // encoding failure
401      *  } catch (MessagingException me) {
402      *   // setHeader() failure
403      *  }
404      *
405      * </pre></blockquote><p>
406      * 
407      * @param   text    Unicode string
408      * @return  Unicode string containing only US-ASCII characters
409      * @exception UnsupportedEncodingException if the encoding fails
410      */
411     public static String   encodeText(String   text)
412             throws UnsupportedEncodingException {
413     return encodeText(text, null, null);
414     }
415 
416     /**
417      * Encode a RFC 822 "text" token into mail-safe form as per
418      * RFC 2047. <p>
419      *
420      * The given Unicode string is examined for non US-ASCII
421      * characters. If the string contains only US-ASCII characters,
422      * it is returned as-is.  If the string contains non US-ASCII
423      * characters, it is first character-encoded using the specified
424      * charset, then transfer-encoded using either the B or Q encoding.
425      * The resulting bytes are then returned as a Unicode string 
426      * containing only ASCII characters. <p>
427      *
428      * Note that this method should be used to encode only 
429      * "unstructured" RFC 822 headers. 
430      * 
431      * @param   text    the header value
432      * @param   charset the charset. If this parameter is null, the
433      *      platform's default charset is used.
434      * @param   encoding the encoding to be used. Currently supported
435      *      values are "B" and "Q". If this parameter is null, then
436      *      the "Q" encoding is used if most of characters to be
437      *      encoded are in the ASCII charset, otherwise "B" encoding
438      *      is used.
439      * @return  Unicode string containing only US-ASCII characters
440      */
441     public static String   encodeText(String   text, String   charset,
442                     String   encoding)
443             throws UnsupportedEncodingException {
444     return encodeWord(text, charset, encoding, false);
445     }
446 
447     /**
448      * Decode "unstructured" headers, that is, headers that are defined
449      * as '*text' as per RFC 822. <p>
450      *
451      * The string is decoded using the algorithm specified in
452      * RFC 2047, Section 6.1.1. If the charset-conversion fails
453      * for any sequence, an UnsupportedEncodingException is thrown.
454      * If the String is not an RFC 2047 style encoded header, it is
455      * returned as-is <p>
456      *
457      * Example of usage:
458      * <p><blockquote><pre>
459      *
460      *  MimePart part = ...
461      *  String rawvalue = null;
462      *  String  value = null;
463      *  try {
464      *    if ((rawvalue = part.getHeader("X-mailer")[0]) != null)
465      *      value = MimeUtility.decodeText(rawvalue);
466      *  } catch (UnsupportedEncodingException e) {
467      *      // Don't care
468      *      value = rawvalue;
469      *  } catch (MessagingException me) { }
470      *
471      *  return value;
472      *
473      * </pre></blockquote><p>
474      *
475      * @param   etext   the possibly encoded value
476      * @exception       UnsupportedEncodingException if the charset
477      *          conversion failed.
478      */
479     public static String   decodeText(String   etext)
480         throws UnsupportedEncodingException {
481     /*
482      * We look for sequences separated by "linear-white-space".
483      * (as per RFC 2047, Section 6.1.1)
484      * RFC 822 defines "linear-white-space" as SPACE | HT | CR | NL.
485      */
486     String   lwsp = " \t\n\r";
487     StringTokenizer st;
488 
489     /*
490      * First, lets do a quick run thru the string and check
491      * whether the sequence "=?"  exists at all. If none exists,
492      * we know there are no encoded-words in here and we can just
493      * return the string as-is, without suffering thru the later 
494      * decoding logic. 
495      * This handles the most common case of unencoded headers 
496      * efficiently.
497      */
498     if (etext.indexOf("=?") == -1)
499         return etext;
500 
501     // Encoded words found. Start decoding ...
502 
503     st = new StringTokenizer(etext, lwsp, true);
504     StringBuffer   sb = new StringBuffer  ();  // decode buffer
505     StringBuffer   wsb = new StringBuffer  (); // white space buffer
506     boolean prevWasEncoded = false;
507 
508     while (st.hasMoreTokens()) {
509         char c;
510         String   s = st.nextToken();
511         // If whitespace, append it to the whitespace buffer
512         if (((c = s.charAt(0)) == ' ') || (c == '\t') ||
513         (c == '\r') || (c == '\n'))
514         wsb.append(c);
515         else {
516         // Check if token is an 'encoded-word' ..
517         String   word;
518         try {
519             word = decodeWord(s);
520             // Yes, this IS an 'encoded-word'.
521             if (!prevWasEncoded && wsb.length() > 0) {
522             // if the previous word was also encoded, we
523             // should ignore the collected whitespace. Else
524             // we include the whitespace as well.
525             sb.append(wsb);
526             }
527             prevWasEncoded = true;
528         } catch (ParseException   pex) {
529             // This is NOT an 'encoded-word'.
530             word = s;
531             // possibly decode inner encoded words
532             if (!decodeStrict)
533             word = decodeInnerWords(word);
534             // include colleced whitespace ..
535             if (wsb.length() > 0)
536             sb.append(wsb);
537             prevWasEncoded = false;
538         }
539         sb.append(word); // append the actual word
540         wsb.setLength(0); // reset wsb for reuse
541         }
542     }
543     return sb.toString();
544     }
545 
546     /**
547      * Encode a RFC 822 "word" token into mail-safe form as per
548      * RFC 2047. <p>
549      *
550      * The given Unicode string is examined for non US-ASCII
551      * characters. If the string contains only US-ASCII characters,
552      * it is returned as-is.  If the string contains non US-ASCII
553      * characters, it is first character-encoded using the platform's
554      * default charset, then transfer-encoded using either the B or 
555      * Q encoding. The resulting bytes are then returned as a Unicode 
556      * string containing only ASCII  characters. <p>
557      * 
558      * This method is meant to be used when creating RFC 822 "phrases".
559      * The InternetAddress class, for example, uses this to encode
560      * its 'phrase' component.
561      *
562      * @param   word    Unicode string
563      * @return  Unicode string containing only US-ASCII 
564      *      characters.
565      * @exception UnsupportedEncodingException if the encoding fails
566      */
567     public static String   encodeWord(String   word) 
568             throws UnsupportedEncodingException {
569     return encodeWord(word, null, null);
570     }
571 
572     /**
573      * Encode a RFC 822 "word" token into mail-safe form as per
574      * RFC 2047. <p>
575      *
576      * The given Unicode string is examined for non US-ASCII
577      * characters. If the string contains only US-ASCII characters,
578      * it is returned as-is.  If the string contains non US-ASCII
579      * characters, it is first character-encoded using the specified
580      * charset, then transfer-encoded using either the B or Q encoding.
581      * The resulting bytes are then returned as a Unicode string 
582      * containing only ASCII characters. <p>
583      * 
584      * @param   word    Unicode string
585      * @param   charset the MIME charset
586      * @param   encoding the encoding to be used. Currently supported
587      *      values are "B" and "Q". If this parameter is null, then
588      *      the "Q" encoding is used if most of characters to be
589      *      encoded are in the ASCII charset, otherwise "B" encoding
590      *      is used.
591      * @return  Unicode string containing only US-ASCII characters
592      * @exception UnsupportedEncodingException if the encoding fails
593      */
594     public static String   encodeWord(String   word, String   charset, 
595                     String   encoding)
596                 throws UnsupportedEncodingException {
597     return encodeWord(word, charset, encoding, true);
598     }
599 
600     /*
601      * Encode the given string. The parameter 'encodingWord' should
602      * be true if a RFC 822 "word" token is being encoded and false if a
603      * RFC 822 "text" token is being encoded. This is because the 
604      * "Q" encoding defined in RFC 2047 has more restrictions when
605      * encoding "word" tokens. (Sigh)
606      */ 
607     private static String   encodeWord(String   string, String   charset,
608                      String   encoding, boolean encodingWord)
609             throws UnsupportedEncodingException {
610 
611     // If 'string' contains only US-ASCII characters, just
612     // return it.
613     int ascii = checkAscii(string);
614     if (ascii == ALL_ASCII)
615         return string;
616 
617     // Else, apply the specified charset conversion.
618     String   jcharset;
619     if (charset == null) { // use default charset
620         jcharset = getDefaultJavaCharset(); // the java charset
621         charset = getDefaultMIMECharset(); // the MIME equivalent
622     } else // MIME charset -> java charset
623         jcharset = javaCharset(charset);
624 
625     // If no transfer-encoding is specified, figure one out.
626     if (encoding == null) {
627         if (ascii != MOSTLY_NONASCII)
628         encoding = "Q";
629         else
630         encoding = "B";
631     }
632 
633     boolean b64;
634     if (encoding.equalsIgnoreCase("B")) 
635         b64 = true;
636     else if (encoding.equalsIgnoreCase("Q"))
637         b64 = false;
638     else
639         throw new UnsupportedEncodingException(
640             "Unknown transfer encoding: " + encoding);
641 
642     StringBuffer   outb = new StringBuffer  (); // the output buffer
643     doEncode(string, b64, jcharset, 
644          // As per RFC 2047, size of an encoded string should not
645          // exceed 75 bytes.
646          // 7 = size of "=?", '?', 'B'/'Q', '?', "?="
647          75 - 7 - charset.length(), // the available space
648          "=?" + charset + "?" + encoding + "?", // prefix
649          true, encodingWord, outb);
650 
651     return outb.toString();
652     }
653 
654     private static void doEncode(String   string, boolean b64, 
655         String   jcharset, int avail, String   prefix, 
656         boolean first, boolean encodingWord, StringBuffer   buf) 
657             throws UnsupportedEncodingException {
658 
659     // First find out what the length of the encoded version of
660     // 'string' would be.
661     byte[] bytes = string.getBytes(jcharset);
662     int len;
663     if (b64) // "B" encoding
664         len = BEncoderStream.encodedLength(bytes);
665     else // "Q"
666         len = QEncoderStream.encodedLength(bytes, encodingWord);
667     
668     int size;
669     if ((len > avail) && ((size = string.length()) > 1)) { 
670         // If the length is greater than 'avail', split 'string'
671         // into two and recurse.
672         doEncode(string.substring(0, size/2), b64, jcharset, 
673              avail, prefix, first, encodingWord, buf);
674         doEncode(string.substring(size/2, size), b64, jcharset,
675              avail, prefix, false, encodingWord, buf);
676     } else {
677         // length <= than 'avail'. Encode the given string
678         ByteArrayOutputStream os = new ByteArrayOutputStream();
679         OutputStream eos; // the encoder
680         if (b64) // "B" encoding
681         eos = new BEncoderStream(os);
682         else // "Q" encoding
683         eos = new QEncoderStream(os, encodingWord);
684         
685         try { // do the encoding
686         eos.write(bytes);
687         eos.close();
688         } catch (IOException ioex) { }
689 
690         byte[] encodedBytes = os.toByteArray(); // the encoded stuff
691         // Now write out the encoded (all ASCII) bytes into our
692         // StringBuffer
693         if (!first) // not the first line of this sequence
694         if (foldEncodedWords)
695             buf.append("\r\n "); // start a continuation line
696         else
697             buf.append(" "); // line will be folded later
698 
699         buf.append(prefix);
700         for (int i = 0; i < encodedBytes.length; i++)
701         buf.append((char)encodedBytes[i]);
702         buf.append("?="); // terminate the current sequence
703     }
704     }
705 
706     /**
707      * The string is parsed using the rules in RFC 2047 for parsing
708      * an "encoded-word". If the parse fails, a ParseException is 
709      * thrown. Otherwise, it is transfer-decoded, and then 
710      * charset-converted into Unicode. If the charset-conversion
711      * fails, an UnsupportedEncodingException is thrown.<p>
712      *
713      * @param   eword   the possibly encoded value
714      * @exception       ParseException if the string is not an
715      *          encoded-word as per RFC 2047.
716      * @exception       UnsupportedEncodingException if the charset
717      *          conversion failed.
718      */
719     public static String   decodeWord(String   eword)
720         throws ParseException  , UnsupportedEncodingException {
721 
722     if (!eword.startsWith("=?")) // not an encoded word
723         throw new ParseException  ();
724     
725     // get charset
726     int start = 2; int pos; 
727     if ((pos = eword.indexOf('?', start)) == -1)
728         throw new ParseException  ();
729     String   charset = javaCharset(eword.substring(start, pos));
730 
731     // get encoding
732     start = pos+1;
733     if ((pos = eword.indexOf('?', start)) == -1)
734         throw new ParseException  ();
735     String   encoding = eword.substring(start, pos);
736 
737     // get encoded-sequence
738     start = pos+1;
739     if ((pos = eword.indexOf("?=", start)) == -1)
740         throw new ParseException  ();
741     String   word = eword.substring(start, pos);
742 
743     try {
744         String   decodedWord;
745         if (word.length() > 0) {
746         // Extract the bytes from word
747         ByteArrayInputStream bis = 
748             new ByteArrayInputStream(ASCIIUtility.getBytes(word));
749 
750         // Get the appropriate decoder
751         InputStream is;
752         if (encoding.equalsIgnoreCase("B")) 
753             is = new BASE64DecoderStream(bis);
754         else if (encoding.equalsIgnoreCase("Q"))
755             is = new QDecoderStream(bis);
756         else
757             throw new UnsupportedEncodingException(
758                     "unknown encoding: " + encoding);
759 
760         // For b64 & q, size of decoded word <= size of word. So
761         // the decoded bytes must fit into the 'bytes' array. This
762         // is certainly more efficient than writing bytes into a
763         // ByteArrayOutputStream and then pulling out the byte[]
764         // from it.
765         int count = bis.available();
766         byte[] bytes = new byte[count];
767         // count is set to the actual number of decoded bytes 
768         count = is.read(bytes, 0, count);
769 
770         // Finally, convert the decoded bytes into a String using
771         // the specified charset
772         decodedWord = count <= 0 ? "" :
773                 new String  (bytes, 0, count, charset);
774         } else {
775         // no characters to decode, return empty string
776         decodedWord = "";
777         }
778         if (pos + 2 < eword.length()) {
779         // there's still more text in the string
780         String   rest = eword.substring(pos + 2);
781         if (!decodeStrict)
782             rest = decodeInnerWords(rest);
783         decodedWord += rest;
784         }
785         return decodedWord;
786     } catch (UnsupportedEncodingException uex) {
787         // explicitly catch and rethrow this exception, otherwise
788         // the below IOException catch will swallow this up!
789         throw uex;
790     } catch (IOException ioex) {
791         // Shouldn't happen.
792         throw new ParseException  ();
793     } catch (IllegalArgumentException   iex) {
794         /* An unknown charset of the form ISO-XXX-XXX, will cause
795          * the JDK to throw an IllegalArgumentException ... Since the
796          * JDK will attempt to create a classname using this string,
797          * but valid classnames must not contain the character '-',
798          * and this results in an IllegalArgumentException, rather than
799          * the expected UnsupportedEncodingException. Yikes
800          */
801         throw new UnsupportedEncodingException();
802     }
803     }
804 
805     /**
806      * Look for encoded words within a word.  The MIME spec doesn't
807      * allow this, but many broken mailers, especially Japanese mailers,
808      * produce such incorrect encodings.
809      */
810     private static String   decodeInnerWords(String   word)
811                 throws UnsupportedEncodingException {
812     int start = 0, i;
813     StringBuffer   buf = new StringBuffer  ();
814     while ((i = word.indexOf("=?", start)) >= 0) {
815         buf.append(word.substring(start, i));
816         int end = word.indexOf("?=", i);
817         if (end < 0)
818         break;
819         String   s = word.substring(i, end + 2);
820         try {
821         s = decodeWord(s);
822         } catch (ParseException   pex) {
823         // ignore it, just use the original string
824         }
825         buf.append(s);
826         start = end + 2;
827     }
828     if (start == 0)
829         return word;
830     if (start < word.length())
831         buf.append(word.substring(start));
832     return buf.toString();
833     }
834 
835     /**
836      * A utility method to quote a word, if the word contains any
837      * characters from the specified 'specials' list.<p>
838      *
839      * The <code>HeaderTokenizer</code> class defines two special
840      * sets of delimiters - MIME and RFC 822. <p>
841      *
842      * This method is typically used during the generation of 
843      * RFC 822 and MIME header fields.
844      *
845      * @param   word    word to be quoted
846      * @param   specials the set of special characters
847      * @return      the possibly quoted word
848      * @see javax.mail.internet.HeaderTokenizer#MIME
849      * @see javax.mail.internet.HeaderTokenizer#RFC822
850      */
851     public static String   quote(String   word, String   specials) {
852     int len = word.length();
853 
854     /*
855      * Look for any "bad" characters, Escape and
856      *  quote the entire string if necessary.
857      */
858     boolean needQuoting = false;
859     for (int i = 0; i < len; i++) {
860         char c = word.charAt(i);
861         if (c == '"' || c == '\\' || c == '\r' || c == '\n') {
862         // need to escape them and then quote the whole string
863         StringBuffer   sb = new StringBuffer  (len + 3);
864         sb.append('"');
865         sb.append(word.substring(0, i));
866         int lastc = 0;
867         for (int j = i; j < len; j++) {
868             char cc = word.charAt(j);
869             if ((cc == '"') || (cc == '\\') || 
870             (cc == '\r') || (cc == '\n'))
871             if (cc == '\n' && lastc == '\r')
872                 ;   // do nothing, CR was already escaped
873             else
874                 sb.append('\\');    // Escape the character
875             sb.append(cc);
876             lastc = cc;
877         }
878         sb.append('"');
879         return sb.toString();
880         } else if (c < 040 || c >= 0177 || specials.indexOf(c) >= 0)
881         // These characters cause the string to be quoted
882         needQuoting = true;
883     }
884 
885     if (needQuoting) {
886         StringBuffer   sb = new StringBuffer  (len + 2);
887         sb.append('"').append(word).append('"');
888         return sb.toString();
889     } else 
890         return word;
891     }
892 
893     /**
894      * Fold a string at linear whitespace so that each line is no longer
895      * than 76 characters, if possible.  If there are more than 76
896      * non-whitespace characters consecutively, the string is folded at
897      * the first whitespace after that sequence.  The parameter
898      * <code>used</code> indicates how many characters have been used in
899      * the current line; it is usually the length of the header name. <p>
900      *
901      * Note that line breaks in the string aren't escaped; they probably
902      * should be.
903      *
904      * @param   used    characters used in line so far
905      * @param   s   the string to fold
906      * @return      the folded string
907      * @since       JavaMail 1.4
908      */
909     public static String   fold(int used, String   s) {
910     if (!foldText)
911         return s;
912 
913     int end;
914     char c;
915     // Strip trailing spaces and newlines
916     for (end = s.length() - 1; end >= 0; end--) {
917         c = s.charAt(end);
918         if (c != ' ' && c != '\t' && c != '\r' && c != '\n')
919         break;
920     }
921     if (end != s.length() - 1)
922         s = s.substring(0, end + 1);
923 
924     // if the string fits now, just return it
925     if (used + s.length() <= 76)
926         return s;
927 
928     // have to actually fold the string
929     StringBuffer   sb = new StringBuffer  (s.length() + 4);
930     char lastc = 0;
931     while (used + s.length() > 76) {
932         int lastspace = -1;
933         for (int i = 0; i < s.length(); i++) {
934         if (lastspace != -1 && used + i > 76)
935             break;
936         c = s.charAt(i);
937         if (c == ' ' || c == '\t')
938             if (!(lastc == ' ' || lastc == '\t'))
939             lastspace = i;
940         lastc = c;
941         }
942         if (lastspace == -1) {
943         // no space, use the whole thing
944         sb.append(s);
945         s = "";
946         used = 0;
947         break;
948         }
949         sb.append(s.substring(0, lastspace));
950         sb.append("\r\n");
951         lastc = s.charAt(lastspace);
952         sb.append(lastc);
953         s = s.substring(lastspace + 1);
954         used = 1;
955     }
956     sb.append(s);
957     return sb.toString();
958     }
959 
960     /**
961      * Unfold a folded header.  Any line breaks that aren't escaped and
962      * are followed by whitespace are removed.
963      *
964      * @param   s   the string to unfold
965      * @return      the unfolded string
966      * @since       JavaMail 1.4
967      */
968     public static String   unfold(String   s) {
969     if (!foldText)
970         return s;
971 
972     StringBuffer   sb = null;
973     int i;
974     while ((i = indexOfAny(s, "\r\n")) >= 0) {
975         int start = i;
976         int l = s.length();
977         i++;        // skip CR or NL
978         if (i < l && s.charAt(i - 1) == '\r' && s.charAt(i) == '\n')
979         i++;    // skip LF
980         if (start == 0 || s.charAt(start - 1) != '\\') {
981         char c;
982         // if next line starts with whitespace, skip all of it
983         // XXX - always has to be true?
984         if (i < l && ((c = s.charAt(i)) == ' ' || c == '\t')) {
985             i++;    // skip whitespace
986             while (i < l && ((c = s.charAt(i)) == ' ' || c == '\t'))
987             i++;
988             if (sb == null)
989             sb = new StringBuffer  (s.length());
990             if (start != 0) {
991             sb.append(s.substring(0, start));
992             sb.append(' ');
993             }
994             s = s.substring(i);
995             continue;
996         }
997         // it's not a continuation line, just leave it in
998         if (sb == null)
999             sb = new StringBuffer  (s.length());
1000        sb.append(s.substring(0, i));
1001        s = s.substring(i);
1002        } else {
1003        // there's a backslash at "start - 1"
1004        // strip it out, but leave in the line break
1005        if (sb == null)
1006            sb = new StringBuffer  (s.length());
1007        sb.append(s.substring(0, start - 1));
1008        sb.append(s.substring(start, i));
1009        s = s.substring(i);
1010        }
1011    }
1012    if (sb != null) {
1013        sb.append(s);
1014        return sb.toString();
1015    } else
1016        return s;
1017    }
1018
1019    /**
1020     * Return the first index of any of the characters in "any" in "s",
1021     * or -1 if none are found.
1022     *
1023     * This should be a method on String.
1024     */
1025    private static int indexOfAny(String   s, String   any) {
1026    return indexOfAny(s, any, 0);
1027    }
1028
1029    private static int indexOfAny(String   s, String   any, int start) {
1030    try {
1031        int len = s.length();
1032        for (int i = start; i < len; i++) {
1033        if (any.indexOf(s.charAt(i)) >= 0)
1034            return i;
1035        }
1036        return -1;
1037    } catch (StringIndexOutOfBoundsException   e) {
1038        return -1;
1039    }
1040    }
1041
1042    /**
1043     * Convert a MIME charset name into a valid Java charset name. <p>
1044     *
1045     * @param charset   the MIME charset name
1046     * @return  the Java charset equivalent. If a suitable mapping is
1047     *      not available, the passed in charset is itself returned.
1048     */
1049    public static String   javaCharset(String   charset) {
1050    if (mime2java == null || charset == null)
1051        // no mapping table, or charset parameter is null
1052        return charset;
1053
1054    String   alias = (String  )mime2java.get(charset.toLowerCase());
1055    return alias == null ? charset : alias;
1056    }
1057
1058    /**
1059     * Convert a java charset into its MIME charset name. <p>
1060     *
1061     * Note that a future version of JDK (post 1.2) might provide
1062     * this functionality, in which case, we may deprecate this
1063     * method then.
1064     *
1065     * @param   charset    the JDK charset
1066     * @return          the MIME/IANA equivalent. If a mapping
1067     *          is not possible, the passed in charset itself
1068     *          is returned.
1069     * @since       JavaMail 1.1
1070     */
1071    public static String   mimeCharset(String   charset) {
1072    if (java2mime == null || charset == null) 
1073        // no mapping table or charset param is null
1074        return charset;
1075
1076    String   alias = (String  )java2mime.get(charset.toLowerCase());
1077    return alias == null ? charset : alias;
1078    }
1079
1080    private static String   defaultJavaCharset;
1081    private static String   defaultMIMECharset;
1082
1083    /**
1084     * Get the default charset corresponding to the system's current 
1085     * default locale.  If the System property <code>mail.mime.charset</code>
1086     * is set, a system charset corresponding to this MIME charset will be
1087     * returned. <p>
1088     * 
1089     * @return  the default charset of the system's default locale, 
1090     *      as a Java charset. (NOT a MIME charset)
1091     * @since   JavaMail 1.1
1092     */
1093    public static String   getDefaultJavaCharset() {
1094    if (defaultJavaCharset == null) {
1095        /*
1096         * If mail.mime.charset is set, it controls the default
1097         * Java charset as well.
1098         */
1099        String   mimecs = null;
1100        try {
1101        mimecs = System.getProperty("mail.mime.charset");
1102        } catch (SecurityException   ex) { }  // ignore it
1103        if (mimecs != null && mimecs.length() > 0) {
1104        defaultJavaCharset = javaCharset(mimecs);
1105        return defaultJavaCharset;
1106        }
1107
1108        try {
1109        defaultJavaCharset = System.getProperty("file.encoding", 
1110                            "8859_1");
1111        } catch (SecurityException   sex) {
1112        
1113        class NullInputStream extends InputStream {
1114            public int read() {
1115            return 0;
1116            }
1117        }
1118        InputStreamReader reader = 
1119            new InputStreamReader(new NullInputStream());
1120        defaultJavaCharset = reader.getEncoding();
1121        if (defaultJavaCharset == null)
1122            defaultJavaCharset = "8859_1";
1123        }
1124    }
1125
1126    return defaultJavaCharset;
1127    }
1128
1129    /*
1130     * Get the default MIME charset for this locale.
1131     */
1132    static String   getDefaultMIMECharset() {
1133    if (defaultMIMECharset == null) {
1134        try {
1135        defaultMIMECharset = System.getProperty("mail.mime.charset");
1136        } catch (SecurityException   ex) { }  // ignore it
1137    }
1138    if (defaultMIMECharset == null)
1139        defaultMIMECharset = mimeCharset(getDefaultJavaCharset());
1140    return defaultMIMECharset;
1141    }
1142
1143    // Tables to map MIME charset names to Java names and vice versa.
1144    // XXX - Should eventually use J2SE 1.4 java.nio.charset.Charset
1145    private static Hashtable mime2java;
1146    private static Hashtable java2mime;
1147
1148    static {
1149    java2mime = new Hashtable(40);
1150    mime2java = new Hashtable(10);
1151
1152    try {
1153        // Use this class's classloader to load the mapping file
1154        // XXX - we should use SecuritySupport, but it's in another package
1155        InputStream is = 
1156            javax.mail.internet.MimeUtility  .class.getResourceAsStream(
1157            "/META-INF/javamail.charset.map");
1158
1159        if (is != null) {
1160        try {
1161            is = new LineInputStream(is);
1162
1163            // Load the JDK-to-MIME charset mapping table
1164            loadMappings((LineInputStream)is, java2mime);
1165
1166            // Load the MIME-to-JDK charset mapping table
1167            loadMappings((LineInputStream)is, mime2java);
1168        } finally {
1169            try {
1170            is.close();
1171            } catch (Exception   cex) {
1172            // ignore
1173            }
1174        }
1175        }
1176    } catch (Exception   ex) { }
1177
1178    // If we didn't load the tables, e.g., because we didn't have
1179    // permission, load them manually.  The entries here should be
1180    // the same as the default javamail.charset.map.
1181    if (java2mime.isEmpty()) {
1182        java2mime.put("8859_1", "ISO-8859-1");
1183        java2mime.put("iso8859_1", "ISO-8859-1");
1184        java2mime.put("iso8859-1", "ISO-8859-1");
1185
1186        java2mime.put("8859_2", "ISO-8859-2");
1187        java2mime.put("iso8859_2", "ISO-8859-2");
1188        java2mime.put("iso8859-2", "ISO-8859-2");
1189
1190        java2mime.put("8859_3", "ISO-8859-3");
1191        java2mime.put("iso8859_3", "ISO-8859-3");
1192        java2mime.put("iso8859-3", "ISO-8859-3");
1193
1194        java2mime.put("8859_4", "ISO-8859-4");
1195        java2mime.put("iso8859_4", "ISO-8859-4");
1196        java2mime.put("iso8859-4", "ISO-8859-4");
1197
1198        java2mime.put("8859_5", "ISO-8859-5");
1199        java2mime.put("iso8859_5", "ISO-8859-5");
1200        java2mime.put("iso8859-5", "ISO-8859-5");
1201
1202        java2mime.put("8859_6", "ISO-8859-6");
1203        java2mime.put("iso8859_6", "ISO-8859-6");
1204        java2mime.put("iso8859-6", "ISO-8859-6");
1205
1206        java2mime.put("8859_7", "ISO-8859-7");
1207        java2mime.put("iso8859_7", "ISO-8859-7");
1208        java2mime.put("iso8859-7", "ISO-8859-7");
1209
1210        java2mime.put("8859_8", "ISO-8859-8");
1211        java2mime.put("iso8859_8", "ISO-8859-8");
1212        java2mime.put("iso8859-8", "ISO-8859-8");
1213
1214        java2mime.put("8859_9", "ISO-8859-9");
1215        java2mime.put("iso8859_9", "ISO-8859-9");
1216        java2mime.put("iso8859-9", "ISO-8859-9");
1217
1218        java2mime.put("sjis", "Shift_JIS");
1219        java2mime.put("jis", "ISO-2022-JP");
1220        java2mime.put("iso2022jp", "ISO-2022-JP");
1221        java2mime.put("euc_jp", "euc-jp");
1222        java2mime.put("koi8_r", "koi8-r");
1223        java2mime.put("euc_cn", "euc-cn");
1224        java2mime.put("euc_tw", "euc-tw");
1225        java2mime.put("euc_kr", "euc-kr");
1226    }
1227    if (mime2java.isEmpty()) {
1228        mime2java.put("iso-2022-cn", "ISO2022CN");
1229        mime2java.put("iso-2022-kr", "ISO2022KR");
1230        mime2java.put("utf-8", "UTF8");
1231        mime2java.put("utf8", "UTF8");
1232        mime2java.put("ja_jp.iso2022-7", "ISO2022JP");
1233        mime2java.put("ja_jp.eucjp", "EUCJIS");
1234        mime2java.put("euc-kr", "KSC5601");
1235        mime2java.put("euckr", "KSC5601");
1236        mime2java.put("us-ascii", "ISO-8859-1");
1237        mime2java.put("x-us-ascii", "ISO-8859-1");
1238    }
1239    }
1240
1241    private static void loadMappings(LineInputStream is, Hashtable table) {
1242    String   currLine;
1243
1244    while (true) {
1245        try {
1246        currLine = is.readLine();
1247        } catch (IOException ioex) {
1248        break; // error in reading, stop
1249        }
1250
1251        if (currLine == null) // end of file, stop
1252        break;
1253        if (currLine.startsWith("--") && currLine.endsWith("--"))
1254        // end of this table
1255        break;  
1256
1257        // ignore empty lines and comments
1258        if (currLine.trim().length() == 0 || currLine.startsWith("#"))
1259        continue;
1260        
1261        // A valid entry is of the form <key><separator><value>
1262        // where, <separator> := SPACE | HT. Parse this
1263        StringTokenizer tk = new StringTokenizer(currLine, " \t");
1264        try {
1265        String   key = tk.nextToken();
1266        String   value = tk.nextToken();
1267        table.put(key.toLowerCase(), value);
1268        } catch (NoSuchElementException nex) { }
1269    }
1270    }
1271
1272    static final int ALL_ASCII      = 1;
1273    static final int MOSTLY_ASCII   = 2;
1274    static final int MOSTLY_NONASCII    = 3;
1275
1276    /** 
1277     * Check if the given string contains non US-ASCII characters.
1278     * @param   s   string
1279     * @return      ALL_ASCII if all characters in the string 
1280     *          belong to the US-ASCII charset. MOSTLY_ASCII
1281     *          if more than half of the available characters
1282     *          are US-ASCII characters. Else MOSTLY_NONASCII.
1283     */
1284    static int checkAscii(String   s) {
1285    int ascii = 0, non_ascii = 0;
1286    int l = s.length();
1287
1288    for (int i = 0; i < l; i++) {
1289        if (nonascii((int)s.charAt(i))) // non-ascii
1290        non_ascii++;
1291        else
1292        ascii++;
1293    }
1294
1295    if (non_ascii == 0)
1296        return ALL_ASCII;
1297    if (ascii > non_ascii)
1298        return MOSTLY_ASCII;
1299
1300    return MOSTLY_NONASCII;
1301    }
1302
1303    /** 
1304     * Check if the given byte array contains non US-ASCII characters.
1305     * @param   b   byte array
1306     * @return      ALL_ASCII if all characters in the string 
1307     *          belong to the US-ASCII charset. MOSTLY_ASCII
1308     *          if more than half of the available characters
1309     *          are US-ASCII characters. Else MOSTLY_NONASCII.
1310     *
1311     * XXX - this method is no longer used
1312     */
1313    static int checkAscii(byte[] b) {
1314    int ascii = 0, non_ascii = 0;
1315
1316    for (int i=0; i < b.length; i++) {
1317        // The '&' operator automatically causes b[i] to be promoted
1318        // to an int, and we mask out the higher bytes in the int 
1319        // so that the resulting value is not a negative integer.
1320        if (nonascii(b[i] & 0xff)) // non-ascii
1321        non_ascii++;
1322        else
1323        ascii++;
1324    }
1325    
1326    if (non_ascii == 0)
1327        return ALL_ASCII;
1328    if (ascii > non_ascii)
1329        return MOSTLY_ASCII;
1330    
1331    return MOSTLY_NONASCII;
1332    }
1333
1334    /** 
1335     * Check if the given input stream contains non US-ASCII characters.
1336     * Upto <code>max</code> bytes are checked. If <code>max</code> is
1337     * set to <code>ALL</code>, then all the bytes available in this
1338     * input stream are checked. If <code>breakOnNonAscii</code> is true
1339     * the check terminates when the first non-US-ASCII character is
1340     * found and MOSTLY_NONASCII is returned. Else, the check continues
1341     * till <code>max</code> bytes or till the end of stream.
1342     *
1343     * @param   is  the input stream
1344     * @param   max maximum bytes to check for. The special value
1345     *          ALL indicates that all the bytes in this input
1346     *          stream must be checked.
1347     * @param   breakOnNonAscii if <code>true</code>, then terminate the
1348     *          the check when the first non-US-ASCII character
1349     *          is found.
1350     * @return      ALL_ASCII if all characters in the string 
1351     *          belong to the US-ASCII charset. MOSTLY_ASCII
1352     *          if more than half of the available characters
1353     *          are US-ASCII characters. Else MOSTLY_NONASCII.
1354     */
1355    static int checkAscii(InputStream is, int max, boolean breakOnNonAscii) {
1356    int ascii = 0, non_ascii = 0;
1357    int len;
1358    int block = 4096;
1359    int linelen = 0;
1360    boolean longLine = false, badEOL = false;
1361    boolean checkEOL = encodeEolStrict && breakOnNonAscii;
1362    byte buf[] = null;
1363    if (max != 0) {
1364        block = (max == ALL) ? 4096 : Math.min(max, 4096);
1365        buf = new byte[block]; 
1366    }
1367    while (max != 0) {
1368        try {
1369        if ((len = is.read(buf, 0, block)) == -1)
1370            break;
1371        int lastb = 0;
1372        for (int i = 0; i < len; i++) {
1373                // The '&' operator automatically causes b[i] to 
1374            // be promoted to an int, and we mask out the higher
1375            // bytes in the int so that the resulting value is 
1376            // not a negative integer.
1377            int b = buf[i] & 0xff;
1378            if (checkEOL &&
1379                ((lastb == '\r' && b != '\n') ||
1380                (lastb != '\r' && b == '\n')))
1381            badEOL = true;
1382            if (b == '\r' || b == '\n')
1383            linelen = 0;
1384            else {
1385            linelen++;
1386            if (linelen > 998)  // 1000 - CRLF
1387                longLine = true;
1388            }
1389            if (nonascii(b)) {  // non-ascii
1390                if (breakOnNonAscii) // we are done
1391                return MOSTLY_NONASCII;
1392                else
1393                non_ascii++;
1394            } else
1395                ascii++;
1396            lastb = b;
1397        }
1398        } catch (IOException ioex) {
1399        break;
1400        }
1401        if (max != ALL)
1402        max -= len;
1403    }
1404
1405    if (max == 0 && breakOnNonAscii)
1406        // We have been told to break on the first non-ascii character.
1407        // We haven't got any non-ascii character yet, but then we
1408        // have not checked all of the available bytes either. So we
1409        // cannot say for sure that this input stream is ALL_ASCII,
1410        // and hence we must play safe and return MOSTLY_NONASCII
1411
1412        return MOSTLY_NONASCII;
1413
1414    if (non_ascii == 0) { // no non-us-ascii characters so far
1415        // If we're looking at non-text data, and we saw CR without LF
1416        // or vice versa, consider this mostly non-ASCII so that it
1417        // will be base64 encoded (since the quoted-printable encoder
1418        // doesn't encode this case properly).
1419        if (badEOL)
1420        return MOSTLY_NONASCII;
1421        // if we've seen a long line, we degrade to mostly ascii
1422        else if (longLine)
1423        return MOSTLY_ASCII;
1424        else
1425        return ALL_ASCII;
1426    }
1427    if (ascii > non_ascii) // mostly ascii
1428        return MOSTLY_ASCII;
1429    return MOSTLY_NONASCII;
1430    }
1431
1432    static final boolean nonascii(int b) {
1433    return b >= 0177 || (b < 040 && b != '\r' && b != '\n' && b != '\t');
1434    }
1435}
1436
1437/**
1438 * An OutputStream that determines whether the data written to
1439 * it is all ASCII, mostly ASCII, or mostly non-ASCII.
1440 */
1441class AsciiOutputStream extends OutputStream {
1442    private boolean breakOnNonAscii;
1443    private int ascii = 0, non_ascii = 0;
1444    private int linelen = 0;
1445    private boolean longLine = false;
1446    private boolean badEOL = false;
1447    private boolean checkEOL = false;
1448    private int lastb = 0;
1449    private int ret = 0;
1450
1451    public AsciiOutputStream(boolean breakOnNonAscii, boolean encodeEolStrict) {
1452    this.breakOnNonAscii = breakOnNonAscii;
1453    checkEOL = encodeEolStrict && breakOnNonAscii;
1454    }
1455
1456    public void write(int b) throws IOException {
1457    check(b);
1458    }
1459
1460    public void write(byte b[]) throws IOException {
1461    write(b, 0, b.length);
1462    }
1463
1464    public void write(byte b[], int off, int len) throws IOException {
1465    len += off;
1466    for (int i = off; i < len ; i++)
1467        check(b[i]);
1468    }
1469
1470    private final void check(int b) throws IOException {
1471    b &= 0xff;
1472    if (checkEOL &&
1473        ((lastb == '\r' && b != '\n') || (lastb != '\r' && b == '\n')))
1474        badEOL = true;
1475    if (b == '\r' || b == '\n')
1476        linelen = 0;
1477    else {
1478        linelen++;
1479        if (linelen > 998)  // 1000 - CRLF
1480        longLine = true;
1481    }
1482    if (MimeUtility.nonascii(b)) { // non-ascii
1483        non_ascii++;
1484        if (breakOnNonAscii) {  // we are done
1485        ret = MimeUtility.MOSTLY_NONASCII;
1486        throw new EOFException();
1487        }
1488    } else
1489        ascii++;
1490    lastb = b;
1491    }
1492
1493    /**
1494     * Return ASCII-ness of data stream.
1495     */
1496    public int getAscii() {
1497    if (ret != 0)
1498        return ret;
1499    // If we're looking at non-text data, and we saw CR without LF
1500    // or vice versa, consider this mostly non-ASCII so that it
1501    // will be base64 encoded (since the quoted-printable encoder
1502    // doesn't encode this case properly).
1503    if (badEOL)
1504        return MimeUtility.MOSTLY_NONASCII;
1505    else if (non_ascii == 0) { // no non-us-ascii characters so far
1506        // if we've seen a long line, we degrade to mostly ascii
1507        if (longLine)
1508        return MimeUtility.MOSTLY_ASCII;
1509        else
1510        return MimeUtility.ALL_ASCII;
1511    }
1512    if (ascii > non_ascii) // mostly ascii
1513        return MimeUtility.MOSTLY_ASCII;
1514    return MimeUtility.MOSTLY_NONASCII;
1515    }
1516}
1517
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags