KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > java > text > Normalizer


1 /*
2  * @(#)Normalizer.java 1.1 05/05/13
3  *
4  * Portions Copyright 2006 Sun Microsystems, Inc. All rights reserved.
5  * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
6  */

7
8 /*
9  *******************************************************************************
10  * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
11  * *
12  * The original version of this source code and documentation is copyrighted *
13  * and owned by IBM, These materials are provided under terms of a License *
14  * Agreement between IBM and Sun. This technology is protected by multiple *
15  * US and International patents. This notice and attribution to IBM may not *
16  * to removed. *
17  *******************************************************************************
18  */

19
20 package java.text;
21
22 import sun.text.normalizer.NormalizerBase;
23 import sun.text.normalizer.NormalizerImpl;
24
25 /**
26  * This class provides the method <code>normalize</code> which transforms Unicode
27  * text into an equivalent composed or decomposed form, allowing for easier
28  * sorting and searching of text.
29  * The <code>normalize</code> method supports the standard normalization forms
30  * described in
31  * <a HREF="http://www.unicode.org/unicode/reports/tr15/tr15-23.html">
32  * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>.
33  * <p>
34  * Characters with accents or other adornments can be encoded in
35  * several different ways in Unicode. For example, take the character A-acute.
36  * In Unicode, this can be encoded as a single character (the "composed" form):
37  *
38  * <p><pre>
39  * U+00C1 LATIN CAPITAL LETTER A WITH ACUTE</pre>
40  * </p>
41  *
42  * or as two separate characters (the "decomposed" form):
43  *
44  * <p><pre>
45  * U+0041 LATIN CAPITAL LETTER A
46  * U+0301 COMBINING ACUTE ACCENT</pre>
47  * </p>
48  *
49  * To a user of your program, however, both of these sequences should be
50  * treated as the same "user-level" character "A with acute accent". When you
51  * are searching or comparing text, you must ensure that these two sequences are
52  * treated as equivalent. In addition, you must handle characters with more than
53  * one accent. Sometimes the order of a character's combining accents is
54  * significant, while in other cases accent sequences in different orders are
55  * really equivalent.
56  * <p>
57  * Similarly, the string "ffi" can be encoded as three separate letters:
58  *
59  * <p><pre>
60  * U+0066 LATIN SMALL LETTER F
61  * U+0066 LATIN SMALL LETTER F
62  * U+0069 LATIN SMALL LETTER I</pre>
63  * </p>
64  *
65  * or as the single character
66  *
67  * <p><pre>
68  * U+FB03 LATIN SMALL LIGATURE FFI</pre>
69  * </p>
70  *
71  * The ffi ligature is not a distinct semantic character, and strictly speaking
72  * it shouldn't be in Unicode at all, but it was included for compatibility
73  * with existing character sets that already provided it. The Unicode standard
74  * identifies such characters by giving them "compatibility" decompositions
75  * into the corresponding semantic characters. When sorting and searching, you
76  * will often want to use these mappings.
77  * <p>
78  * The <code>normalize</code> method helps solve these problems by transforming
79  * text into the canonical composed and decomposed forms as shown in the first
80  * example above. In addition, you can have it perform compatibility
81  * decompositions so that you can treat compatibility characters the same as
82  * their equivalents.
83  * Finally, the <code>normalize</code> method rearranges accents into the
84  * proper canonical order, so that you do not have to worry about accent
85  * rearrangement on your own.
86  * <p>
87  * The W3C generally recommends to exchange texts in NFC.
88  * Note also that most legacy character encodings use only precomposed forms and
89  * often do not encode any combining marks by themselves. For conversion to such
90  * character encodings the Unicode text needs to be normalized to NFC.
91  * For more usage examples, see the Unicode Standard Annex.
92  *
93  * @since 1.6
94  */

95 public final class Normalizer {
96
97    private Normalizer() {};
98
99     /**
100      * This enum provides constants of the four Unicode normalization forms
101      * that are described in
102      * <a HREF="http://www.unicode.org/unicode/reports/tr15/tr15-23.html">
103      * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>
104      * and two methods to access them.
105      *
106      * @since 1.6
107      */

108     public static enum Form {
109
110     /**
111          * Canonical decomposition.
112      */

113     NFD,
114
115     /**
116          * Canonical decomposition, followed by canonical composition.
117      */

118     NFC,
119
120     /**
121          * Compatibility decomposition.
122      */

123         NFKD,
124
125     /**
126          * Compatibility decomposition, followed by canonical composition.
127      */

128         NFKC
129     }
130
131     /**
132      * Normalize a sequence of char values.
133      * The sequence will be normalized according to the specified normalization
134      * from.
135      * @param src The sequence of char values to normalize.
136      * @param form The normalization form; one of
137      * {@link java.text.Normalizer.Form#NFC},
138      * {@link java.text.Normalizer.Form#NFD},
139      * {@link java.text.Normalizer.Form#NFKC},
140      * {@link java.text.Normalizer.Form#NFKD}
141      * @return The normalized String
142      * @throws NullPointerException If <code>src</code> or <code>form</code>
143      * is null.
144      */

145     public static String JavaDoc normalize(CharSequence JavaDoc src, Form form) {
146         return NormalizerBase.normalize(src.toString(), form);
147     }
148
149     /**
150      * Determines if the given sequence of char values is normalized.
151      * @param src The sequence of char values to be checked.
152      * @param form The normalization form; one of
153      * {@link java.text.Normalizer.Form#NFC},
154      * {@link java.text.Normalizer.Form#NFD},
155      * {@link java.text.Normalizer.Form#NFKC},
156      * {@link java.text.Normalizer.Form#NFKD}
157      * @return true if the sequence of char values is normalized;
158      * false otherwise.
159      * @throws NullPointerException If <code>src</code> or <code>form</code>
160      * is null.
161      */

162     public static boolean isNormalized(CharSequence JavaDoc src, Form form) {
163         return NormalizerBase.isNormalized(src.toString(), form);
164     }
165 }
166
Popular Tags