CBParse


1   package com.ca.commons.cbutil;
2   
/**
 * Stand-alone parsing and text-conversion helpers that were moved out of
 * CBUtility: hex encoding/decoding, UTF-8 and Unicode byte-order-mark
 * detection, HTML escaping, and in-place replacement inside StringBuffers.
 * <p>
 * All methods are static and keep no state between calls.
 */
public class CBParse
{
    /** Lower-case hex digits, indexed by nibble value (0-15). */
    private static final char[] HEX_DIGITS =
            {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};

    /** Result of {@link #scanUTF8(byte[])}: the bytes are not a well-formed UTF-8 sequence. */
    private static final int UTF8_INVALID = -1;

    /** Result of {@link #scanUTF8(byte[])}: well formed, and every byte is 7-bit ASCII. */
    private static final int UTF8_ASCII_ONLY = 0;

    /** Result of {@link #scanUTF8(byte[])}: well formed, with at least one multi-byte character. */
    private static final int UTF8_MULTI_BYTE = 1;

    /**
     * Converts a byte array to its lower-case hex representation,
     * two hex digits per byte, with no separators.
     *
     * @param bytes the bytes to convert (must not be null)
     * @return a string of length <code>bytes.length * 2</code>
     */
    public static String bytes2Hex(byte[] bytes)
    {
        char[] out = new char[bytes.length * 2];
        for (int i = 0; i < bytes.length; i++)
        {
            out[2 * i] = HEX_DIGITS[(bytes[i] >> 4) & 0x0f];
            out[2 * i + 1] = HEX_DIGITS[bytes[i] & 0x0f];
        }
        return new String(out);
    }

    /**
     * Converts a string to hex, four lower-case hex digits per
     * (16 bit) character, high byte first.
     *
     * @param orig the string to convert (must not be null)
     * @return a string of length <code>orig.length() * 4</code>
     */
    public static String string2Hex(String orig)
    {
        // each char produces four hex digits, so size the buffer accordingly
        StringBuffer ret = new StringBuffer(orig.length() * 4);
        for (int i = 0; i < orig.length(); i++)
        {
            ret.append(char2Hex(orig.charAt(i)));
        }
        return ret.toString();
    }

    /**
     * Returns the two digit, lower-case hex representation of a byte.
     *
     * @param b the byte to convert
     * @return two hex digits, high nibble first
     */
    static public String byte2Hex(byte b)
    {
        // masking after the (sign extending) shift keeps the index in 0..15 for negative bytes
        char[] array = {HEX_DIGITS[(b >> 4) & 0x0f], HEX_DIGITS[b & 0x0f]};
        return new String(array);
    }

    /**
     * Returns the four digit, lower-case hex representation of a char,
     * high byte first.
     *
     * @param c the character to convert
     * @return four hex digits
     */
    static public String char2Hex(char c)
    {
        byte hi = (byte) (c >>> 8);
        byte lo = (byte) (c & 0xff);
        return byte2Hex(hi) + byte2Hex(lo);
    }

    /**
     * Combines two hex digit characters into a single byte.
     * The digits are decoded with {@link #hexChar2Byte(char)}, which does
     * not validate its input.
     *
     * @param hex1 the hex digit for the high nibble
     * @param hex2 the hex digit for the low nibble
     * @return the decoded byte
     */
    static public byte hex2Byte(char hex1, char hex2)
    {
        byte a = hexChar2Byte(hex1);
        byte b = hexChar2Byte(hex2);
        return (byte) ((a << 4) + b);
    }

    /**
     * Convert a single hex digit character to its numeric value.
     * <p>
     * No validation is done: the result is only meaningful for
     * '0'-'9', 'A'-'F' and 'a'-'f'.  Any other character is pushed through
     * the same arithmetic and yields an arbitrary value.
     *
     * @param hex the hex digit to decode
     * @return the value 0-15 for a valid hex digit
     */
    static public byte hexChar2Byte(char hex)
    {
        if (hex <= '9')
            return ((byte) (hex - '0'));        // ('0' -> '9')
        else if (hex <= 'F')
            return ((byte) (hex - 'A' + 10));   // ('A' -> 'F')
        else
            return ((byte) (hex - 'a' + 10));   // ('a' -> 'f')
    }

    /**
     * From Van Bui - returns a hex string formatted with a space after
     * each hex word of length wordlength (including after the last
     * complete word).
     *
     * @param in         input array of bytes to convert
     * @param wordlength the length, in hex digits, of the words to produce.
     * @return the formatted hex string
     * @throws ArithmeticException if wordlength is zero and in is not empty
     */
    public static String bytes2HexSplit(byte[] in, int wordlength)
    {
        return splitHex(bytes2Hex(in), wordlength, 0, false);
    }

    /**
     * From Van Bui - returns a hex string formatted with a space after
     * each hex word of length wordlength, and a new line after every
     * linelength hex digits.
     *
     * @param in         input array of bytes to convert
     * @param wordlength the length, in hex digits, of the words to produce.
     * @param linelength the number of hex digits to emit before inserting
     *                   a line feed.
     * @return the formatted hex string
     * @throws ArithmeticException if wordlength or linelength is zero and in is not empty
     */
    public static String bytes2HexSplit(byte[] in, int wordlength, int linelength)
    {
        return splitHex(bytes2Hex(in), wordlength, linelength, true);
    }

    /**
     * Shared worker for the bytes2HexSplit variants: copies hex, adding a
     * space after every wordlength digits and (optionally) a line feed
     * after every linelength digits.
     */
    private static String splitHex(String hex, int wordlength, int linelength, boolean breakLines)
    {
        StringBuffer buff = new StringBuffer(hex.length() * 2);

        for (int i = 0; i < hex.length(); i++)
        {
            buff.append(hex.charAt(i));
            if ((i + 1) % wordlength == 0)
                buff.append(" ");
            if (breakLines && (i + 1) % linelength == 0)
                buff.append("\n");
        }

        return buff.toString();
    }

    /**
     * Determines whether a given byte sequence is a structurally valid
     * utf-8 encoding.  While this does not mean that the bytes *are* a
     * utf-8 encoded string, the chance of a random byte sequence
     * happening to be utf8 is roughly (1/2 ** (byte array length)).<p>
     * Note that '7 bit ascii' is *always* a valid utf-8 string, and so is
     * the empty sequence.<p>
     * Only the lead byte / continuation byte structure of rfc 2279 is
     * checked (sequences of up to six bytes); overlong encodings and
     * the encoded values themselves are not examined.
     *
     * @param sequence the bytes to test (must not be null)
     * @return true if every character in the sequence is well formed
     */
    public static boolean isUTF8(byte[] sequence)
    {
        return scanUTF8(sequence) != UTF8_INVALID;
    }

    /**
     * Determines whether a given byte sequence is a structurally valid
     * utf-8 encoding, encoding (at least in part) something *other* than
     * normal Ascii (i.e. it contains at least one multi-byte character;
     * pure 7-bit ascii is indistinguishable from the original text and
     * returns false).<p>
     * The whole sequence is examined, using the same rules as
     * {@link #isUTF8(byte[])}; a multi-byte character that is cut off by
     * the end of the array makes the sequence invalid.
     *
     * @param sequence the bytes to test (must not be null)
     * @return true if the sequence is well formed and contains at least
     *         one multi-byte character
     */
    public static boolean isNonAsciiUTF8(byte[] sequence)
    {
        return scanUTF8(sequence) == UTF8_MULTI_BYTE;
    }

    /**
     * Walks the sequence one utf-8 character at a time and classifies it
     * as UTF8_INVALID, UTF8_ASCII_ONLY or UTF8_MULTI_BYTE.
     */
    private static int scanUTF8(byte[] sequence)
    {
        int result = UTF8_ASCII_ONLY;
        int i = 0;

        while (i < sequence.length)
        {
            int charLength = utf8SequenceLength(sequence[i]);
            if (charLength == 0)
                return UTF8_INVALID;            // stray continuation byte, or 0xFE / 0xFF

            if (charLength > 1)
            {
                if (i + charLength > sequence.length)
                    return UTF8_INVALID;        // character overruns the end of the array

                for (int j = 1; j < charLength; j++)
                {
                    // every following byte must have the form 10xxxxxx
                    if ((sequence[i + j] & 0xC0) != 0x80)
                        return UTF8_INVALID;
                }
                result = UTF8_MULTI_BYTE;
            }
            i += charLength;                    // move to the next character start position
        }

        return result;
    }

    /**
     * Returns the total number of bytes in the utf-8 character introduced
     * by the given lead byte (1 to 6, following rfc 2279), or 0 if the byte
     * cannot start a character.
     */
    private static int utf8SequenceLength(byte lead)
    {
        int value = lead & 0xff;
        if (value < 0x80) return 1;     // 0xxxxxxx - plain ascii
        if (value < 0xC0) return 0;     // 10xxxxxx - continuation byte, not a start
        if (value < 0xE0) return 2;     // 110xxxxx
        if (value < 0xF0) return 3;     // 1110xxxx
        if (value < 0xF8) return 4;     // 11110xxx
        if (value < 0xFC) return 5;     // 111110xx
        if (value < 0xFE) return 6;     // 1111110x
        return 0;                       // 0xFE and 0xFF never appear in utf-8
    }

    /**
     * This uses the implicit 'unicode marker' at the start of a
     * Unicode file to determine whether a file is a unicode file.
     * At the beginning of every unicode file is a two byte code
     * indicating the endian-ness of the file (either FFFE or FEFF).
     * If either of these sequences is found, this function returns
     * true, otherwise it returns false.  <i>Technically</i> this isn't
     * a sure test, since a) something else could have this signature,
     * and b) unicode files are not absolutely required to have this
     * signature (but most do).
     *
     * @param sequence the bytes to test (must not be null)
     * @return true if the first two bytes are FF FE or FE FF
     */
    public static boolean isUnicode(byte[] sequence)
    {
        if (sequence.length < 2)
            return false;

        byte first = sequence[0];
        byte second = sequence[1];
        return (first == (byte) 0xFF && second == (byte) 0xFE)
                || (first == (byte) 0xFE && second == (byte) 0xFF);
    }

    /**
     * Turns a string into HTML displayable text by escaping
     * the special characters '&lt;', '&gt;', '&amp;', '"' and '#'.
     * <p>
     * Text that already starts (ignoring case) with <code>&lt;html&gt;</code>
     * or <code>&lt;!doctype html&gt;</code> is taken to be HTML already: that
     * prefix, and a trailing <code>&lt;/html&gt;</code> if present, are
     * stripped and the remainder is returned <i>without</i> any escaping.
     *
     * @param rawText the text to convert (must not be null)
     * @return the escaped text, or the unwrapped HTML
     */
    public static String toHTML(String rawText)
    {
        final String htmlTag = "<html>";
        final String docType = "<!doctype html>";
        final String htmlEnd = "</html>";

        // regionMatches compares the full prefix, so the 15 character doctype is detected
        boolean startsWithHtml = rawText.regionMatches(true, 0, htmlTag, 0, htmlTag.length());
        boolean startsWithDocType = rawText.regionMatches(true, 0, docType, 0, docType.length());

        if (startsWithHtml || startsWithDocType)
        {
            int prefixLength = startsWithHtml ? htmlTag.length() : docType.length();
            rawText = rawText.substring(prefixLength);

            // a negative offset (text shorter than the end tag) simply fails to match
            int endTagStart = rawText.length() - htmlEnd.length();
            if (rawText.regionMatches(true, endTagStart, htmlEnd, 0, htmlEnd.length()))
            {
                rawText = rawText.substring(0, endTagStart);
            }

            return rawText;
        }

        StringBuffer escaped = new StringBuffer(rawText.length() + 16);

        for (int pos = 0; pos < rawText.length(); pos++)
        {
            char c = rawText.charAt(pos);

            switch (c)
            {
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                case '&':
                    escaped.append("&amp;");
                    break;
                case '\"':
                    escaped.append("&quot;");
                    break;
                case '#':
                    escaped.append("&#35;");
                    break;
                default:
                    escaped.append(c);
                    break;
            }
        }
        return escaped.toString();
    }

    /**
     * Deletes a character in <i>text</i> at position <i>pos</i> and replaces
     * it with the string <i>replacement</i>.
     *
     * @param text        the text to be modified (changed in place)
     * @param pos         the position of the character to be deleted
     * @param replacement the string the character is to be replaced with.
     * @return the position immediately after the inserted replacement
     */
    public static int replaceChar(StringBuffer text, int pos, String replacement)
    {
        text.deleteCharAt(pos);
        text.insert(pos, replacement);
        return (pos + replacement.length());
    }

    /**
     * Replaces every occurrence of the character <i>c</i> in <i>text</i>
     * with the string <i>replacement</i>.  The buffer passed in is
     * modified as well.
     *
     * @param text        the text to be modified (changed in place)
     * @param c           the character to be replaced
     * @param replacement the string the character is to be replaced with.
     * @return the modified text, as a String
     */
    public static String replaceAllChar(StringBuffer text, char c, String replacement)
    {
        return replaceAllBufferChar(text, c, replacement).toString();
    }

    /**
     * Replaces every occurrence of the character <i>c</i> in <i>text</i>
     * with the string <i>replacement</i>.  Inserted replacement text is
     * never re-scanned, so the replacement may itself contain <i>c</i>.
     *
     * @param text        the text to be modified (changed in place)
     * @param c           the character to be replaced
     * @param replacement the string the character is to be replaced with.
     * @return the same buffer that was passed in
     */
    public static StringBuffer replaceAllBufferChar(StringBuffer text, char c, String replacement)
    {
        // scan the buffer directly rather than copying it to a String for every search
        int pos = 0;
        while (pos < text.length())
        {
            if (text.charAt(pos) == c)
                pos = replaceChar(text, pos, replacement);  // resume after the inserted text
            else
                pos++;
        }
        return text;
    }

    /**
     * Deletes a substring in <i>text</i> at position <i>pos</i>, of length <i>len</i> and replaces
     * it with the string <i>replacement</i>.
     *
     * @param text        the text to be modified (changed in place)
     * @param pos         the position of the first character to be deleted
     * @param len         the number of characters to be deleted
     * @param replacement the string the substring is to be replaced with.
     * @return the position immediately after the inserted replacement
     */
    public static int replaceString(StringBuffer text, int pos, int len, String replacement)
    {
        text.replace(pos, pos + len, replacement);
        return (pos + replacement.length());
    }

    /**
     * Replaces every occurrence of the substring <i>orig</i> in <i>text</i>
     * with the string <i>replacement</i>.  The buffer passed in is
     * modified as well.
     *
     * @param text        the text to be modified (changed in place)
     * @param orig        the original text substring to be changed
     * @param replacement the string the original substring is to be replaced with.
     * @return the modified text, as a String
     */
    public static String replaceAllString(StringBuffer text, String orig, String replacement)
    {
        return replaceAllBufferString(text, orig, replacement).toString();
    }

    /**
     * Replaces every occurrence of the substring <i>orig</i> in <i>text</i>
     * with the string <i>replacement</i>.  Inserted replacement text is
     * never re-scanned.  An empty <i>orig</i> matches nothing, and the
     * text is returned unchanged.
     *
     * @param text        the text to be modified (changed in place)
     * @param orig        the original text substring to be changed
     * @param replacement the string the original substring is to be replaced with.
     * @return the same buffer that was passed in
     */
    public static StringBuffer replaceAllBufferString(StringBuffer text, String orig, String replacement)
    {
        // an empty search string is "found" at every position, which would never terminate
        if (orig.length() == 0)
            return text;

        int pos = text.indexOf(orig, 0);
        while (pos != -1)
        {
            pos = replaceString(text, pos, orig.length(), replacement);
            pos = text.indexOf(orig, pos);
        }
        return text;
    }

    /**
     * Utility for micro-parser.  Gets the next character pos in a string,
     * at or after an initial offset, that either matches, or does not match,
     * <i>any</i> of a set of comparison characters.
     *
     * @param pos      the position to start searching from (inclusive)
     * @param searchMe the string to search
     * @param compare  a string containing characters to compare against
     * @param match    whether the match is for characters in the compare string (true)
     *                 or <i>not</i> in the compare string (false)
     * @return the position found, or -1 if no position is found.
     */
    public static int nextCharIn(int pos, String searchMe, String compare, boolean match)
    {
        int length = searchMe.length();
        for (int i = pos; i < length; i++)
        {
            boolean inCompareSet = compare.indexOf(searchMe.charAt(i)) != -1;
            if (inCompareSet == match)
                return i;
        }
        return -1;
    }
}
435
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags