Normalizer


1   /*
2    * @(#)Normalizer.java  1.1 05/05/13
3    *
4    * Portions Copyright 2006 Sun Microsystems, Inc. All rights reserved.
5    * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
6    */
7   
8   /*
9    *******************************************************************************
10   * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved                     *
11   *                                                                             *
12   * The original version of this source code and documentation is copyrighted   *
13   * and owned by IBM, These materials are provided under terms of a License     *
14   * Agreement between IBM and Sun. This technology is protected by multiple     *
15   * US and International patents. This notice and attribution to IBM may not    *
16   * to removed.                                                                 *
17   *******************************************************************************
18   */
19  
20  package java.text;
21  
22  import sun.text.normalizer.NormalizerBase;
23  import sun.text.normalizer.NormalizerImpl;
24  
25  /**
26   * This class provides the method <code>normalize</code> which transforms Unicode
27   * text into an equivalent composed or decomposed form, allowing for easier
28   * sorting and searching of text.
29   * The <code>normalize</code> method supports the standard normalization forms
30   * described in
31   * <a HREF="http://www.unicode.org/unicode/reports/tr15/tr15-23.html">
32   * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>.
33   * <p>
34   * Characters with accents or other adornments can be encoded in
35   * several different ways in Unicode.  For example, take the character A-acute.
36   * In Unicode, this can be encoded as a single character (the "composed" form):
37   *
38   * <p><pre>
39   *      U+00C1    LATIN CAPITAL LETTER A WITH ACUTE</pre>
40   * </p>
41   *
42   * or as two separate characters (the "decomposed" form):
43   *
44   * <p><pre>
45   *      U+0041    LATIN CAPITAL LETTER A
46   *      U+0301    COMBINING ACUTE ACCENT</pre>
47   * </p>
48   *
49   * To a user of your program, however, both of these sequences should be
50   * treated as the same "user-level" character "A with acute accent".  When you
51   * are searching or comparing text, you must ensure that these two sequences are
52   * treated as equivalent.  In addition, you must handle characters with more than
53   * one accent. Sometimes the order of a character's combining accents is
54   * significant, while in other cases accent sequences in different orders are
55   * really equivalent.
56   * <p>
57   * Similarly, the string "ffi" can be encoded as three separate letters:
58   *
59   * <p><pre>
60   *      U+0066    LATIN SMALL LETTER F
61   *      U+0066    LATIN SMALL LETTER F
62   *      U+0069    LATIN SMALL LETTER I</pre>
63   * </p>
64   *
65   * or as the single character
66   *
67   * <p><pre>
68   *      U+FB03    LATIN SMALL LIGATURE FFI</pre>
69   * </p>
70   *
71   * The ffi ligature is not a distinct semantic character, and strictly speaking
72   * it shouldn't be in Unicode at all, but it was included for compatibility
73   * with existing character sets that already provided it.  The Unicode standard
74   * identifies such characters by giving them "compatibility" decompositions
75   * into the corresponding semantic characters.  When sorting and searching, you
76   * will often want to use these mappings.
77   * <p>
78   * The <code>normalize</code> method helps solve these problems by transforming
79   * text into the canonical composed and decomposed forms as shown in the first
80   * example above. In addition, you can have it perform compatibility
81   * decompositions so that you can treat compatibility characters the same as
82   * their equivalents.
83   * Finally, the <code>normalize</code> method rearranges accents into the
84   * proper canonical order, so that you do not have to worry about accent
85   * rearrangement on your own.
86   * <p>
87   * The W3C generally recommends exchanging text in NFC.
88   * Note also that most legacy character encodings use only precomposed forms and
89   * often do not encode any combining marks by themselves. For conversion to such
90   * character encodings the Unicode text needs to be normalized to NFC.
91   * For more usage examples, see the Unicode Standard Annex.
92   *
93   * @since 1.6
94   */
95  public final class Normalizer {
96  
97     private Normalizer() {};
98  
99      /**
100      * This enum provides constants of the four Unicode normalization forms
101      * that are described in
102      * <a HREF="http://www.unicode.org/unicode/reports/tr15/tr15-23.html">
103      * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>
104      * and two methods to access them.
105      *
106      * @since 1.6
107      */
108     public static enum Form {
109 
110     /**
111          * Canonical decomposition.
112      */
113     NFD,
114 
115     /**
116          * Canonical decomposition, followed by canonical composition.
117      */
118     NFC,    
119 
120     /** 
121          * Compatibility decomposition.
122      */
123         NFKD,
124 
125     /**
126          * Compatibility decomposition, followed by canonical composition.
127      */
128         NFKC
129     }
130 
131     /**
132      * Normalize a sequence of char values.
133      * The sequence will be normalized according to the specified normalization
134      * from.
135      * @param src        The sequence of char values to normalize.
136      * @param form       The normalization form; one of
137      *                   {@link java.text.Normalizer.Form#NFC},
138      *                   {@link java.text.Normalizer.Form#NFD},
139      *                   {@link java.text.Normalizer.Form#NFKC},
140      *                   {@link java.text.Normalizer.Form#NFKD}
141      * @return The normalized String
142      * @throws NullPointerException If <code>src</code> or <code>form</code>
143      * is null.
144      */
145     public static String   normalize(CharSequence   src, Form form) {
146         return NormalizerBase.normalize(src.toString(), form);
147     }
148 
149     /**
150      * Determines if the given sequence of char values is normalized.
151      * @param src        The sequence of char values to be checked.
152      * @param form       The normalization form; one of
153      *                   {@link java.text.Normalizer.Form#NFC},
154      *                   {@link java.text.Normalizer.Form#NFD},
155      *                   {@link java.text.Normalizer.Form#NFKC},
156      *                   {@link java.text.Normalizer.Form#NFKD}
157      * @return true if the sequence of char values is normalized;
158      * false otherwise.
159      * @throws NullPointerException If <code>src</code> or <code>form</code>
160      * is null.
161      */
162     public static boolean isNormalized(CharSequence   src, Form form) {
163         return NormalizerBase.isNormalized(src.toString(), form);
164     }
165 }
166
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags