TextProcessor


1   /*******************************************************************************
2    * Copyright (c) 2006, 2007 IBM Corporation and others.
3    * All rights reserved. This program and the accompanying materials 
4    * are made available under the terms of the Eclipse Public License v1.0
5    * which accompanies this distribution, and is available at
6    * http://www.eclipse.org/legal/epl-v10.html
7    * 
8    * Contributors:
9    *     IBM Corporation - initial API and implementation
10   *******************************************************************************/
11  package org.eclipse.osgi.util;
12  
13  import java.util.Locale  ;
14  
15  /**
16   * This class is used to process strings that have special semantic meaning
17   * (such as file paths) in RTL-oriented locales so that they render in a way
18   * that does not corrupt the semantic meaning of the string but also maintains
19   * compliance with the Unicode BiDi algorithm of rendering Bidirectional text.
20   * <p>
21   * Processing of the string is done by breaking it down into segments that are
22   * specified by a set of user provided delimiters. Directional punctuation
23   * characters are injected into the string in order to ensure the string retains
24   * its semantic meaning and conforms with the Unicode BiDi algorithm within each
25   * segment.
26   * </p>
27   * 
28   * @since 3.2
29   */
public class TextProcessor {

    // commonly used delimiters
    /**
     * Dot (.) delimiter. Used most often in package names and file extensions.
     */
    private static final String DOT = "."; //$NON-NLS-1$

    /**
     * Colon (:) delimiter. Used most often in file paths and URLs.
     */
    private static final String COLON = ":"; //$NON-NLS-1$

    /**
     * Forward slash (/) delimiter. Used most often in file paths and URLs.
     */
    private static final String FILE_SEP_FSLASH = "/"; //$NON-NLS-1$

    /**
     * Backslash (\) delimiter. Used most often in file paths.
     */
    private static final String FILE_SEP_BSLASH = "\\"; //$NON-NLS-1$

    /**
     * The default set of delimiters to use to segment a string.
     */
    private static final String delimiterString = DOT + COLON + FILE_SEP_FSLASH
            + FILE_SEP_BSLASH;

    // left to right marker (U+200E)
    private static final char LRM = '\u200e';

    // left to right embedding (U+202A)
    private static final char LRE = '\u202a';

    // pop directional format (U+202C)
    private static final char PDF = '\u202c';

    // whether or not the default locale's language is one of the BiDi
    // languages recognized by this class (fixed at class-load time)
    private static final boolean isBidi;

    // whether or not the current platform supports directional characters
    // (fixed at class-load time)
    private static final boolean isSupportedPlatform;

    // sentinel meaning "no delimiter is currently waiting for an LRM"
    private static final int INDEX_NOT_SET = 999999999;

    static {
        String lang = Locale.getDefault().getLanguage();
        isBidi = "iw".equals(lang) || "he".equals(lang) || "ar".equals(lang) || "fa".equals(lang) || "ur".equals(lang); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$

        // Only consider platforms that can support control characters.
        // The property is guarded against null so that a cleared "os.name"
        // cannot make class initialization fail, and it is lower-cased with a
        // fixed locale so the comparison does not depend on the default locale.
        String osName = System.getProperty("os.name"); //$NON-NLS-1$
        boolean supported = false;
        if (osName != null) {
            String os = osName.toLowerCase(Locale.ENGLISH);
            supported = os.startsWith("windows") || os.startsWith("linux"); //$NON-NLS-1$ //$NON-NLS-2$
        }
        isSupportedPlatform = supported;
    }

    /**
     * Process the given text and return a string with the appropriate
     * substitution based on the locale. This is equivalent to calling
     * <code>process(String, String)</code> with the default set of
     * delimiters.
     * 
     * @param text
     *            the text to be processed
     * @return the manipulated string
     * @see #process(String, String)
     * @see #getDefaultDelimiters()
     */
    public static String process(String text) {
        return process(text, getDefaultDelimiters());
    }

    /**
     * Process a string that has a particular semantic meaning to render on BiDi
     * locales in a way that maintains the semantic meaning of the text, but
     * differs from the Unicode BiDi algorithm. The text is segmented according
     * to the provided delimiters. Each segment has the Unicode BiDi algorithm
     * applied to it, but as a whole, the string is oriented left to right.
     * <p>
     * For example a file path such as <tt>d:\myFolder\FOLDER\MYFILE.java</tt>
     * (where capital letters indicate RTL text) should render as
     * <tt>d:\myFolder\REDLOF\ELIFYM.java</tt> when using the Unicode BiDi
     * algorithm and segmenting the string according to the specified delimiter
     * set.
     * </p>
     * <p>
     * The following algorithm is used:
     * <ol>
     * <li>Scan the string to locate the delimiters.</li>
     * <li>While scanning, note the direction of the last strong character
     * scanned. Strong characters are characters which have a BiDi
     * classification of L, R or AL as defined in the Unicode standard.</li>
     * <li>If the last strong character before a separator is of class R or AL,
     * add a LRM before the separator. Since LRM itself is a strong L character,
     * following separators do not need an LRM until a strong R or AL character
     * is found.</li>
     * <li>If the component where the pattern is displayed has a RTL basic
     * direction, add a LRE at the beginning of the pattern and a PDF at its
     * end. The string is considered to have RTL direction if it contains RTL
     * characters and the runtime locale is BiDi. There is no need to add
     * LRE/PDF if the string begins with an LTR letter, contains no RTL letter,
     * and ends with either a LTR letter or a digit.</li>
     * </ol>
     * </p>
     * <p>
     * The string is returned unchanged when it is <code>null</code>, has at
     * most one character, already starts with LRE and ends with PDF, or when
     * the default locale or the platform (as detected when this class was
     * loaded) does not call for processing.
     * </p>
     * <p>
     * NOTE: this method will change the shape of the original string passed in
     * by inserting punctuation characters into the text in order to make it
     * render to correctly reflect the semantic meaning of the text. Methods
     * like <code>String.equals(String)</code> and
     * <code>String.length()</code> called on the resulting string will not
     * return the same values as would be returned for the original string.
     * </p>
     * 
     * @param str
     *            the text to process, if <code>null</code> return the string
     *            as it was passed in
     * @param delimiter
     *            delimiters by which the string will be segmented, if
     *            <code>null</code> the default delimiters are used
     * @return the processed string
     */
    public static String process(String str, String delimiter) {
        if (str == null || str.length() <= 1 || !isSupportedPlatform || !isBidi)
            return str;

        // do not process a string that has already been processed.
        if (str.charAt(0) == LRE && str.charAt(str.length() - 1) == PDF) {
            return str;
        }

        // String contains RTL characters
        boolean isStringBidi = false;
        // Last strong character is RTL
        boolean isLastRTL = false;
        // Position in the target buffer of the last delimiter that followed
        // RTL text and may still need an LRM in front of it
        int delimIndex = INDEX_NOT_SET;

        delimiter = delimiter == null ? getDefaultDelimiters() : delimiter;

        // room for the text plus the enclosing LRE/PDF; grows if LRMs are added
        StringBuffer target = new StringBuffer(str.length() + 2);
        target.append(LRE);
        char ch;

        for (int i = 0, n = str.length(); i < n; i++) {
            ch = str.charAt(i);
            if (delimiter.indexOf(ch) != -1) {
                // character is a delimiter, note its index in the buffer
                // (only relevant when the preceding strong character was RTL)
                if (isLastRTL) {
                    delimIndex = target.length();
                }
            } else if (Character.isDigit(ch)) {
                if (delimIndex != INDEX_NOT_SET) {
                    // consecutive neutral and weak directional characters
                    // explicitly force direction to be LRM
                    target.insert(delimIndex, LRM);
                    delimIndex = INDEX_NOT_SET;
                    isLastRTL = false;
                }
            } else if (Character.isLetter(ch)) {
                if (isRTL(ch)) {
                    isStringBidi = true;
                    if (delimIndex != INDEX_NOT_SET) {
                        // neutral character followed by strong right directional character
                        // explicitly force direction to be LRM
                        target.insert(delimIndex, LRM);
                        delimIndex = INDEX_NOT_SET;
                    }
                    isLastRTL = true;
                } else {
                    // strong LTR character, no LRM will be required
                    delimIndex = INDEX_NOT_SET;
                    isLastRTL = false;
                }
            }
            target.append(ch);
        }
        /*
         * TextProcessor is not aware of the orientation of the component owning
         * the processed string. The runtime locale is known to be BiDi at this
         * point (otherwise the method returned early), so enclose the string in
         * LRE/PDF in any of these cases:
         * (1) The string contains RTL letters - implying that the string
         * appearance depends on the basic orientation.
         * (2) The string does not start with a letter.
         * (3) The string ends with a character that is neither a letter nor a
         * digit.
         */
        if (isStringBidi || !Character.isLetter(str.charAt(0))
                || isNeutral(str.charAt(str.length() - 1))) {
            target.append(PDF);
            return target.toString();
        }
        // Otherwise, return the original string
        return str;
    }

    /**
     * Removes directional marker characters in the given string that were inserted by 
     * utilizing the <code>process(String)</code> or <code>process(String, String)</code>
     * methods.
     * <p>
     * Every LRE, PDF and LRM character in the string is removed. The removal
     * does not depend on the locale or platform of the running JVM, so a string
     * that was processed elsewhere can still be restored here. A
     * <code>null</code> string, or a string of at most one character, is
     * returned as it was passed in.
     * </p>
     * 
     * @param str string with directional markers to remove
     * @return string with no directional markers 
     * @see #process(String)
     * @see #process(String, String)
     * @since 3.3
     */
    public static String deprocess(String str) {
        // nothing to strip from a null, empty or single character string
        if (str == null || str.length() <= 1)
            return str;

        StringBuffer buf = new StringBuffer(str.length());
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            switch (c) {
                case LRE:
                case PDF:
                case LRM:
                    // directional marker: drop it
                    break;
                default:
                    buf.append(c);
                    break;
            }
        }

        return buf.toString();
    }

    /**
     * Return the string containing all the default delimiter characters to be
     * used to segment a given string.
     * 
     * @return delimiter string
     */
    public static String getDefaultDelimiters() {
        return delimiterString;
    }

    /*
     * Return whether or not the character is right to left oriented.
     */
    private static boolean isRTL(char c) {
        /*
         * Cannot use Character.getDirectionality() since the OSGi library can
         * be compiled with execution environments that pre-date that API.
         * 
         * The first range of characters is Unicode Hebrew and Arabic
         * characters. The second range of characters is Unicode Hebrew and
         * Arabic presentation forms.
         * 
         * NOTE: Farsi and Urdu fall within the Arabic scripts.
         */
        return (((c >= 0x05d0) && (c <= 0x07b1)) || ((c >= 0xfb1d) && (c <= 0xfefc)));
    }

    /*
     * Return whether or not the given character is neutral for the purposes
     * of this class, i.e. it is neither a digit nor a letter.
     */
    private static boolean isNeutral(char c) {
        return !(Character.isDigit(c) || Character.isLetter(c));
    }

    /*
     * Constructor for the class.
     */
    private TextProcessor() {
        // prevent instantiation
    }
}
302
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Free Books Free Magazines
Popular Tags