KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > io > UTF8


1 /* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.io;
5
6 import java.io.IOException JavaDoc;
7 import java.io.DataInput JavaDoc;
8 import java.io.DataOutput JavaDoc;
9
10 import java.util.logging.Logger JavaDoc;
11 import net.nutch.util.LogFormatter;
12
13 /** A WritableComparable for strings that uses the UTF8 encoding.
14  *
15  * <p>Also includes utilities for efficiently reading and writing UTF-8.
16  *
17  * @author Doug Cutting
18  */

19 public class UTF8 implements WritableComparable {
20   private static final Logger JavaDoc LOG= LogFormatter.getLogger("net.nutch.io.UTF8");
21   private static final DataOutputBuffer OBUF = new DataOutputBuffer();
22   private static final DataInputBuffer IBUF = new DataInputBuffer();
23
24   private byte[] bytes;
25   private int length;
26
27   public UTF8() {
28     set("");
29   }
30
31   /** Construct from a given string. */
32   public UTF8(String JavaDoc string) {
33     set(string);
34   }
35
36   /** Construct from a given string. */
37   public UTF8(UTF8 utf8) {
38     set(utf8);
39   }
40
41   /** The raw bytes. */
42   public byte[] getBytes() {
43     return bytes;
44   }
45
46   /** The number of bytes in the encoded string. */
47   public int getLength() {
48     return length;
49   }
50
51   /** Set to contain the contents of a string. */
52   public void set(String JavaDoc string) {
53     if (string.length() > 0xffff/3) { // maybe too long
54
LOG.warning("truncating long string: " + string.length()
55                   + " chars, starting with " + string.substring(0, 20));
56       string = string.substring(0, 0xffff/3);
57     }
58
59     length = utf8Length(string); // compute length
60
if (length > 0xffff) // double-check length
61
throw new RuntimeException JavaDoc("string too long!");
62
63     if (bytes == null || length > bytes.length) // grow buffer
64
bytes = new byte[length];
65
66     try { // avoid sync'd allocations
67
synchronized (OBUF) {
68         OBUF.reset();
69         writeChars(OBUF, string, 0, string.length());
70         System.arraycopy(OBUF.getData(), 0, bytes, 0, length);
71       }
72     } catch (IOException JavaDoc e) {
73       throw new RuntimeException JavaDoc(e);
74     }
75   }
76
77   /** Set to contain the contents of a string. */
78   public void set(UTF8 other) {
79     length = other.length;
80     if (bytes == null || length > bytes.length) // grow buffer
81
bytes = new byte[length];
82     System.arraycopy(other.bytes, 0, bytes, 0, length);
83   }
84
85   public void readFields(DataInput JavaDoc in) throws IOException JavaDoc {
86     length = in.readUnsignedShort();
87     if (bytes == null || bytes.length < length)
88       bytes = new byte[length];
89     in.readFully(bytes, 0, length);
90   }
91
92   /** Skips over one UTF8 in the input. */
93   public static void skip(DataInput JavaDoc in) throws IOException JavaDoc {
94     int length = in.readUnsignedShort();
95     in.skipBytes(length);
96   }
97
98   public void write(DataOutput JavaDoc out) throws IOException JavaDoc {
99     out.writeShort(length);
100     out.write(bytes, 0, length);
101   }
102
103   /** Compare two UTF8s. */
104   public int compareTo(Object JavaDoc o) {
105     UTF8 that = (UTF8)o;
106     return WritableComparator.compareBytes(bytes, 0, length,
107                                            that.bytes, 0, that.length);
108   }
109
110   /** Convert to a String. */
111   public String JavaDoc toString() {
112     StringBuffer JavaDoc buffer = new StringBuffer JavaDoc(length);
113     try {
114       synchronized (IBUF) {
115         IBUF.reset(bytes, length);
116         readChars(IBUF, buffer, length);
117       }
118     } catch (IOException JavaDoc e) {
119       throw new RuntimeException JavaDoc(e);
120     }
121     return buffer.toString();
122   }
123
124   /** Returns true iff <code>o</code> is a UTF8 with the same contents. */
125   public boolean equals(Object JavaDoc o) {
126     if (!(o instanceof UTF8))
127       return false;
128     UTF8 that = (UTF8)o;
129     if (this.length != that.length)
130       return false;
131     else
132       return WritableComparator.compareBytes(bytes, 0, length,
133                                              that.bytes, 0, that.length) == 0;
134   }
135
136   /** A WritableComparator optimized for UTF8 keys. */
137   public static class Comparator extends WritableComparator {
138     public Comparator() {
139       super(UTF8.class);
140     }
141
142     public int compare(byte[] b1, int s1, int l1,
143                        byte[] b2, int s2, int l2) {
144       int n1 = readUnsignedShort(b1, s1);
145       int n2 = readUnsignedShort(b2, s2);
146       return compareBytes(b1, s1+2, n1, b2, s2+2, n2);
147     }
148   }
149
150   /// STATIC UTILITIES FROM HERE DOWN
151

152   /// These are probably not used much anymore, and might be removed...
153

154   /** Convert a string to a UTF-8 encoded byte array.
155    * @see String#getBytes(String)
156    */

157   public static byte[] getBytes(String JavaDoc string) {
158     byte[] result = new byte[utf8Length(string)];
159     try { // avoid sync'd allocations
160
synchronized (OBUF) {
161         OBUF.reset();
162         writeChars(OBUF, string, 0, string.length());
163         System.arraycopy(OBUF.getData(), 0, result, 0, OBUF.getLength());
164       }
165     } catch (IOException JavaDoc e) {
166       throw new RuntimeException JavaDoc(e);
167     }
168     return result;
169   }
170
171   /** Read a UTF-8 encoded string.
172    *
173    * @see DataInput#readUTF()
174    */

175   public static String JavaDoc readString(DataInput JavaDoc in) throws IOException JavaDoc {
176     int bytes = in.readUnsignedShort();
177     StringBuffer JavaDoc buffer = new StringBuffer JavaDoc(bytes);
178     readChars(in, buffer, bytes);
179     return buffer.toString();
180   }
181
182   private static void readChars(DataInput JavaDoc in, StringBuffer JavaDoc buffer, int nBytes)
183     throws IOException JavaDoc {
184     synchronized (OBUF) {
185       OBUF.reset();
186       OBUF.write(in, nBytes);
187       byte[] bytes = OBUF.getData();
188       int i = 0;
189       while (i < nBytes) {
190         byte b = bytes[i++];
191         if ((b & 0x80) == 0) {
192           buffer.append((char)(b & 0x7F));
193         } else if ((b & 0xE0) != 0xE0) {
194           buffer.append((char)(((b & 0x1F) << 6)
195                                | (bytes[i++] & 0x3F)));
196         } else {
197           buffer.append((char)(((b & 0x0F) << 12)
198                                | ((bytes[i++] & 0x3F) << 6)
199                                | (bytes[i++] & 0x3F)));
200         }
201       }
202     }
203   }
204
205   /** Write a UTF-8 encoded string.
206    *
207    * @see DataOutput#writeUTF(String)
208    */

209   public static int writeString(DataOutput JavaDoc out, String JavaDoc s) throws IOException JavaDoc {
210     if (s.length() > 0xffff/3) { // maybe too long
211
LOG.warning("truncating long string: " + s.length()
212                   + " chars, starting with " + s.substring(0, 20));
213       s = s.substring(0, 0xffff/3);
214     }
215
216     int len = utf8Length(s);
217     if (len > 0xffff) // double-check length
218
throw new IOException JavaDoc("string too long!");
219       
220     out.writeShort(len);
221     writeChars(out, s, 0, s.length());
222     return len;
223   }
224
225   /** Returns the number of bytes required to write this. */
226   private static int utf8Length(String JavaDoc string) {
227     int stringLength = string.length();
228     int utf8Length = 0;
229     for (int i = 0; i < stringLength; i++) {
230       int c = string.charAt(i);
231       if ((c >= 0x0001) && (c <= 0x007F)) {
232         utf8Length++;
233       } else if (c > 0x07FF) {
234         utf8Length += 3;
235       } else {
236         utf8Length += 2;
237       }
238     }
239     return utf8Length;
240   }
241
242   private static void writeChars(DataOutput JavaDoc out,
243                                  String JavaDoc s, int start, int length)
244     throws IOException JavaDoc {
245     final int end = start + length;
246     for (int i = start; i < end; i++) {
247       int code = s.charAt(i);
248       if (code >= 0x01 && code <= 0x7F) {
249         out.writeByte((byte)code);
250       } else if (code <= 0x07FF) {
251         out.writeByte((byte)(0xC0 | ((code >> 6) & 0x1F)));
252         out.writeByte((byte)(0x80 | code & 0x3F));
253       } else {
254         out.writeByte((byte)(0xE0 | ((code >> 12) & 0X0F)));
255         out.writeByte((byte)(0x80 | ((code >> 6) & 0x3F)));
256         out.writeByte((byte)(0x80 | (code & 0x3F)));
257       }
258     }
259   }
260
261 }
262
Popular Tags