KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > eclipse > osgi > util > TextProcessor


1 /*******************************************************************************
2  * Copyright (c) 2006, 2007 IBM Corporation and others.
3  * All rights reserved. This program and the accompanying materials
4  * are made available under the terms of the Eclipse Public License v1.0
5  * which accompanies this distribution, and is available at
6  * http://www.eclipse.org/legal/epl-v10.html
7  *
8  * Contributors:
9  * IBM Corporation - initial API and implementation
10  *******************************************************************************/

11 package org.eclipse.osgi.util;
12
13 import java.util.Locale JavaDoc;
14
/**
 * This class is used to process strings that have special semantic meaning
 * (such as file paths) in RTL-oriented locales so that they render in a way
 * that does not corrupt the semantic meaning of the string but also maintains
 * compliance with the Unicode BiDi algorithm of rendering Bidirectional text.
 * <p>
 * Processing of the string is done by breaking it down into segments that are
 * specified by a set of user provided delimiters. Directional punctuation
 * characters are injected into the string in order to ensure the string retains
 * its semantic meaning and conforms with the Unicode BiDi algorithm within each
 * segment.
 * </p>
 * <p>
 * Whether any processing happens at all is decided once, when the class is
 * initialized, from the default locale's language and the <code>os.name</code>
 * system property. When the locale is not one of the recognized BiDi languages
 * or the platform is not recognized, {@link #process(String, String)} and
 * {@link #deprocess(String)} return their argument unchanged.
 * </p>
 *
 * @since 3.2
 */
public class TextProcessor {

    // commonly used delimiters

    /**
     * Dot (.) delimiter. Used most often in package names and file extensions.
     */
    private static final String DOT = "."; //$NON-NLS-1$

    /**
     * Colon (:) delimiter. Used most often in file paths and URLs.
     */
    private static final String COLON = ":"; //$NON-NLS-1$

    /**
     * Forward slash (/) delimiter. Used most often in file paths and URLs.
     */
    private static final String FILE_SEP_FSLASH = "/"; //$NON-NLS-1$

    /**
     * Backslash delimiter. Used most often in file paths.
     */
    private static final String FILE_SEP_BSLASH = "\\"; //$NON-NLS-1$

    /**
     * The default set of delimiters to use to segment a string.
     */
    private static final String delimiterString = DOT + COLON + FILE_SEP_FSLASH
            + FILE_SEP_BSLASH;

    // left-to-right mark (U+200E)
    private static final char LRM = '\u200e';

    // left-to-right embedding (U+202A)
    private static final char LRE = '\u202a';

    // pop directional format (U+202C)
    private static final char PDF = '\u202c';

    // whether or not the default locale is a BiDi locale
    private static final boolean isBidi;

    // whether or not the current platform supports directional characters
    private static final boolean isSupportedPlatform;

    // sentinel for delimIndex meaning "no delimiter is waiting for an LRM"
    private static final int INDEX_NOT_SET = 999999999;

    static {
        // Hebrew is checked under both "iw" and "he" because the language
        // code reported by Locale differs between Java versions.
        String lang = Locale.getDefault().getLanguage();
        isBidi = "iw".equals(lang) || "he".equals(lang) || "ar".equals(lang) //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
                || "fa".equals(lang) || "ur".equals(lang); //$NON-NLS-1$ //$NON-NLS-2$

        // Default to the empty string so that a missing os.name property
        // cannot fail class initialization, and lower-case with a fixed
        // locale so the comparison does not depend on the default locale.
        String osName = System.getProperty("os.name", "").toLowerCase(Locale.ENGLISH); //$NON-NLS-1$ //$NON-NLS-2$

        // Only consider platforms that can support control characters
        isSupportedPlatform = osName.startsWith("windows") || osName.startsWith("linux"); //$NON-NLS-1$ //$NON-NLS-2$
    }

    /**
     * Process the given text and return a string with the appropriate
     * substitution based on the locale. This is equivalent to calling
     * <code>process(String, String)</code> with the default set of
     * delimiters.
     *
     * @param text
     *            the text to be processed
     * @return the manipulated string
     * @see #process(String, String)
     * @see #getDefaultDelimiters()
     */
    public static String process(String text) {
        return process(text, getDefaultDelimiters());
    }

    /**
     * Process a string that has a particular semantic meaning to render on BiDi
     * locales in a way that maintains the semantic meaning of the text, but
     * differs from the Unicode BiDi algorithm. The text is segmented according
     * to the provided delimiters. Each segment has the Unicode BiDi algorithm
     * applied to it, but as a whole, the string is oriented left to right.
     * <p>
     * For example a file path such as <tt>d:\myFolder\FOLDER\MYFILE.java</tt>
     * (where capital letters indicate RTL text) should render as
     * <tt>d:\myFolder\REDLOF\ELIFYM.java</tt> when using the Unicode BiDi
     * algorithm and segmenting the string according to the specified delimiter
     * set.
     * </p>
     * <p>
     * The following algorithm is used:
     * <ol>
     * <li>Scan the string to locate the delimiters.</li>
     * <li>While scanning, note the direction of the last strong character
     * scanned. Strong characters are characters which have a BiDi
     * classification of L, R or AL as defined in the Unicode standard.</li>
     * <li>If the last strong character before a separator is of class R or AL,
     * add a LRM before the separator. Since LRM itself is a strong L character,
     * following separators do not need an LRM until a strong R or AL character
     * is found.</li>
     * <li>If the component where the pattern is displayed has a RTL basic
     * direction, add a LRE at the beginning of the pattern and a PDF at its
     * end. The string is considered to have RTL direction if it contains RTL
     * characters and the runtime locale is BiDi. There is no need to add
     * LRE/PDF if the string begins with an LTR letter, contains no RTL letter,
     * and ends with either a LTR letter or a digit.</li>
     * </ol>
     * </p>
     * <p>
     * NOTE: this method will change the shape of the original string passed in
     * by inserting punctuation characters into the text in order to make it
     * render to correctly reflect the semantic meaning of the text. Methods
     * like <code>String.equals(String)</code> and
     * <code>String.length()</code> called on the resulting string will not
     * return the same values as would be returned for the original string.
     * </p>
     * <p>
     * The string is returned unchanged when it is <code>null</code>, when it
     * is at most one character long, when the locale or platform does not
     * require processing, or when it already starts with LRE and ends with
     * PDF.
     * </p>
     *
     * @param str
     *            the text to process, if <code>null</code> return the string
     *            as it was passed in
     * @param delimiter
     *            delimiters by which the string will be segmented, if
     *            <code>null</code> the default delimiters are used
     * @return the processed string
     */
    public static String process(String str, String delimiter) {
        if (str == null || str.length() <= 1 || !isSupportedPlatform || !isBidi) {
            return str;
        }

        // do not process a string that has already been processed.
        if (str.charAt(0) == LRE && str.charAt(str.length() - 1) == PDF) {
            return str;
        }

        // String contains RTL characters
        boolean isStringBidi = false;
        // Last strong character is RTL
        boolean isLastRTL = false;
        // Position in the target buffer of the last delimiter that followed
        // RTL text and may still need an LRM inserted in front of it
        int delimIndex = INDEX_NOT_SET;

        delimiter = delimiter == null ? getDefaultDelimiters() : delimiter;

        StringBuffer target = new StringBuffer();
        target.append(LRE);
        char ch;

        for (int i = 0, n = str.length(); i < n; i++) {
            ch = str.charAt(i);
            if (delimiter.indexOf(ch) != -1) {
                // character is a delimiter, note its index in the buffer
                if (isLastRTL) {
                    delimIndex = target.length();
                }
            } else if (Character.isDigit(ch)) {
                if (delimIndex != INDEX_NOT_SET) {
                    // consecutive neutral and weak directional characters:
                    // explicitly force direction to be LTR with an LRM
                    target.insert(delimIndex, LRM);
                    delimIndex = INDEX_NOT_SET;
                    isLastRTL = false;
                }
            } else if (Character.isLetter(ch)) {
                if (isRTL(ch)) {
                    isStringBidi = true;
                    if (delimIndex != INDEX_NOT_SET) {
                        // neutral character followed by strong right
                        // directional character: explicitly force direction
                        // to be LTR with an LRM
                        target.insert(delimIndex, LRM);
                        delimIndex = INDEX_NOT_SET;
                    }
                    isLastRTL = true;
                } else {
                    // strong LTR character, no LRM will be required
                    delimIndex = INDEX_NOT_SET;
                    isLastRTL = false;
                }
            }
            target.append(ch);
        }

        /*
         * TextProcessor is not aware of the orientation of the component owning
         * the processed string. This point is only reached when the runtime
         * locale is BiDi, so enclose the string in LRE/PDF in any of these
         * cases:
         * (1) The string contains RTL characters - implying that the string
         * appearance depends on the basic orientation.
         * (2) The string does not start with a letter.
         * (3) The string ends with a neutral character, i.e. one that is
         * neither a letter nor a digit.
         */
        if (isStringBidi || !Character.isLetter(str.charAt(0))
                || isNeutral(str.charAt(str.length() - 1))) {
            target.append(PDF);
            return target.toString();
        }

        // Otherwise, return the original string
        return str;
    }

    /**
     * Removes directional marker characters in the given string that were
     * inserted by utilizing the <code>process(String)</code> or
     * <code>process(String, String)</code> methods.
     * <p>
     * Every LRE, PDF and LRM character in the string is dropped, regardless of
     * how it got there. The string is returned unchanged when it is
     * <code>null</code>, at most one character long, or when the locale or
     * platform is one for which <code>process</code> never inserts markers.
     * </p>
     *
     * @param str
     *            string with directional markers to remove
     * @return string with no directional markers
     * @see #process(String)
     * @see #process(String, String)
     * @since 3.3
     */
    public static String deprocess(String str) {
        // don't do all the work if not a valid case
        if (str == null || str.length() <= 1 || !isSupportedPlatform || !isBidi) {
            return str;
        }

        StringBuffer buf = new StringBuffer();
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            switch (c) {
                case LRE:
                case PDF:
                case LRM:
                    // directional marker, skip it
                    break;
                default:
                    buf.append(c);
            }
        }

        return buf.toString();
    }

    /**
     * Return the string containing all the default delimiter characters to be
     * used to segment a given string.
     *
     * @return delimiter string
     */
    public static String getDefaultDelimiters() {
        return delimiterString;
    }

    /*
     * Return whether or not the character is right to left oriented.
     */
    private static boolean isRTL(char c) {
        /*
         * Cannot use Character.getDirectionality() since the OSGi library can
         * be compiled with execution environments that pre-date that API.
         *
         * The first range of characters is Unicode Hebrew and Arabic
         * characters. The second range of characters is Unicode Hebrew and
         * Arabic presentation forms.
         *
         * NOTE: Farsi and Urdu fall within the Arabic scripts.
         */
        return (((c >= 0x05d0) && (c <= 0x07b1)) || ((c >= 0xfb1d) && (c <= 0xfefc)));
    }

    /*
     * Return whether or not the given character is treated as neutral here,
     * i.e. it is neither a digit nor a letter.
     */
    private static boolean isNeutral(char c) {
        return !(Character.isDigit(c) || Character.isLetter(c));
    }

    /*
     * Constructor for the class.
     */
    private TextProcessor() {
        // prevent instantiation
    }
}
302
Popular Tags