KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > util > StringUtil


/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.util;

import java.util.HashMap;
import java.nio.charset.Charset;
8
/**
 * A collection of String processing utility methods: space padding,
 * extraction of the <code>charset</code> parameter from a content type
 * header, and resolution of character encoding names.
 */
public class StringUtil {

  /** Name (including the <code>=</code>) of the content type parameter holding the encoding. */
  private static final String CHARSET_PARAM = "charset=";

  /**
   * Maps canonical charset names to the charset that should actually be used
   * to decode documents carrying that label.
   * <p>
   * This is not an alias table. It maps character encodings which are often
   * used in mislabelled documents to their correct encodings. For instance,
   * there are a lot of documents labelled 'ISO-8859-1' which contain
   * characters not covered by ISO-8859-1 but covered by windows-1252.
   * Because windows-1252 is a superset of ISO-8859-1 (sharing code points
   * for the common part), it is better to treat ISO-8859-1 as synonymous
   * with windows-1252 than to reject, as invalid, documents labelled as
   * ISO-8859-1 that have characters outside ISO-8859-1.
   * <p>
   * Only written by the static initializer below; read-only afterwards.
   */
  private static final HashMap encodingAliases = new HashMap();

  static {
    encodingAliases.put("ISO-8859-1", "windows-1252");
    encodingAliases.put("EUC-KR", "x-windows-949");
    encodingAliases.put("x-EUC-CN", "GB18030");
    encodingAliases.put("GBK", "GB18030");
    // Candidate mappings that were left disabled in the original source:
    // encodingAliases.put("Big5", "Big5HKSCS");
    // encodingAliases.put("TIS620", "Cp874");
    // encodingAliases.put("ISO-8859-11", "Cp874");
  }

  /**
   * Returns a copy of <code>s</code> padded with trailing spaces so
   * that its length is <code>length</code>. Strings already
   * <code>length</code> characters long or longer are not altered.
   *
   * @param s the string to pad; must not be <code>null</code>
   * @param length the minimum length of the result
   * @return <code>s</code> followed by as many spaces as needed
   */
  public static String rightPad(String s, int length) {
    StringBuffer sb = new StringBuffer(Math.max(length, s.length()));
    sb.append(s);
    appendPadding(sb, s.length(), length);
    return sb.toString();
  }

  /**
   * Returns a copy of <code>s</code> padded with leading spaces so
   * that its length is <code>length</code>. Strings already
   * <code>length</code> characters long or longer are not altered.
   *
   * @param s the string to pad; must not be <code>null</code>
   * @param length the minimum length of the result
   * @return as many spaces as needed followed by <code>s</code>
   */
  public static String leftPad(String s, int length) {
    StringBuffer sb = new StringBuffer(Math.max(length, s.length()));
    appendPadding(sb, s.length(), length);
    sb.append(s);
    return sb.toString();
  }

  /**
   * Appends <code>target - current</code> spaces to <code>sb</code>, or
   * nothing when <code>current</code> is already at least <code>target</code>.
   */
  private static void appendPadding(StringBuffer sb, int current, int target) {
    // Count upwards instead of computing (target - current): the subtraction
    // overflows to a huge positive value when target is very negative.
    for (int i = current; i < target; i++) {
      sb.append(' ');
    }
  }

  /**
   * Parses the character encoding from the specified content type header.
   * If the content type is <code>null</code>, or there is no explicit
   * character encoding, <code>null</code> is returned.
   * <br />
   * The <code>charset</code> parameter name is matched case-insensitively.
   * The value runs up to the next <code>;</code> (or the end of the header),
   * is trimmed, and has one pair of surrounding double quotes removed.
   * <br />
   * This method was copied from org.apache.catalina.util.RequestUtil, which
   * is licensed under the Apache License, Version 2.0 (the "License").
   *
   * @param contentType a content type header
   * @return the declared encoding name, or <code>null</code> if none is declared
   */
  public static String parseCharacterEncoding(String contentType) {
    if (contentType == null) {
      return null;
    }
    int start = indexOfIgnoreCase(contentType, CHARSET_PARAM);
    if (start < 0) {
      return null;
    }
    String encoding = contentType.substring(start + CHARSET_PARAM.length());
    int end = encoding.indexOf(';');
    if (end >= 0) {
      encoding = encoding.substring(0, end);
    }
    encoding = encoding.trim();
    if (encoding.length() > 2 && encoding.startsWith("\"")
        && encoding.endsWith("\"")) {
      encoding = encoding.substring(1, encoding.length() - 1);
    }
    return encoding.trim();
  }

  /**
   * Returns the index of the first occurrence of <code>token</code> in
   * <code>text</code>, ignoring case, or <code>-1</code> if there is none.
   */
  private static int indexOfIgnoreCase(String text, String token) {
    // regionMatches compares character by character, so unlike lower-casing
    // the whole header it cannot shift indices and is locale-independent.
    int last = text.length() - token.length();
    for (int i = 0; i <= last; i++) {
      if (text.regionMatches(true, i, token, 0, token.length())) {
        return i;
      }
    }
    return -1;
  }

  /**
   * Resolves an encoding name to the name of the charset that should be used
   * for it: the name is first canonicalized through {@link Charset}, then
   * replaced by its entry in the mislabelled-encodings table if it has one.
   *
   * @param encoding an encoding name or alias; may be <code>null</code>
   * @return the resolved charset name, or <code>null</code> if
   *         <code>encoding</code> is <code>null</code>, is not a legal
   *         charset name, or names a charset this JVM does not support
   */
  public static String resolveEncodingAlias(String encoding) {
    if (encoding == null) {
      return null;
    }
    String canonicalName;
    try {
      if (!Charset.isSupported(encoding)) {
        return null;
      }
      canonicalName = Charset.forName(encoding).name();
    } catch (IllegalArgumentException e) {
      // IllegalCharsetNameException: the name contains characters that are
      // not allowed in a charset name, so it cannot be resolved either.
      return null;
    }
    // The table never stores null values, so a null lookup means "no entry".
    String replacement = (String) encodingAliases.get(canonicalName);
    return replacement != null ? replacement : canonicalName;
  }

  /**
   * Command-line entry point: prints what the single encoding name given as
   * argument resolves to, or a usage message when the argument count is wrong.
   *
   * @param args command-line arguments; exactly one encoding name is expected
   */
  public static void main(String[] args) {
    if (args.length != 1) {
      System.out.println("Usage: StringUtil <encoding name>");
    } else {
      System.out.println(args[0] + " is resolved to " +
                         resolveEncodingAlias(args[0]));
    }
  }
}
107
Popular Tags