1 2 3 4 package net.nutch.io; 5 6 import java.io.IOException ; 7 import java.io.DataInput ; 8 import java.io.DataOutput ; 9 10 import java.util.logging.Logger ; 11 import net.nutch.util.LogFormatter; 12 13 19 public class UTF8 implements WritableComparable { 20 private static final Logger LOG= LogFormatter.getLogger("net.nutch.io.UTF8"); 21 private static final DataOutputBuffer OBUF = new DataOutputBuffer(); 22 private static final DataInputBuffer IBUF = new DataInputBuffer(); 23 24 private byte[] bytes; 25 private int length; 26 27 public UTF8() { 28 set(""); 29 } 30 31 32 public UTF8(String string) { 33 set(string); 34 } 35 36 37 public UTF8(UTF8 utf8) { 38 set(utf8); 39 } 40 41 42 public byte[] getBytes() { 43 return bytes; 44 } 45 46 47 public int getLength() { 48 return length; 49 } 50 51 52 public void set(String string) { 53 if (string.length() > 0xffff/3) { LOG.warning("truncating long string: " + string.length() 55 + " chars, starting with " + string.substring(0, 20)); 56 string = string.substring(0, 0xffff/3); 57 } 58 59 length = utf8Length(string); if (length > 0xffff) throw new RuntimeException ("string too long!"); 62 63 if (bytes == null || length > bytes.length) bytes = new byte[length]; 65 66 try { synchronized (OBUF) { 68 OBUF.reset(); 69 writeChars(OBUF, string, 0, string.length()); 70 System.arraycopy(OBUF.getData(), 0, bytes, 0, length); 71 } 72 } catch (IOException e) { 73 throw new RuntimeException (e); 74 } 75 } 76 77 78 public void set(UTF8 other) { 79 length = other.length; 80 if (bytes == null || length > bytes.length) bytes = new byte[length]; 82 System.arraycopy(other.bytes, 0, bytes, 0, length); 83 } 84 85 public void readFields(DataInput in) throws IOException { 86 length = in.readUnsignedShort(); 87 if (bytes == null || bytes.length < length) 88 bytes = new byte[length]; 89 in.readFully(bytes, 0, length); 90 } 91 92 93 public static void skip(DataInput in) throws IOException { 94 int length = in.readUnsignedShort(); 95 in.skipBytes(length); 96 } 97 98 public void write(DataOutput out) throws IOException { 99 out.writeShort(length); 100 out.write(bytes, 0, length); 101 } 102 103 104 public int compareTo(Object o) { 105 UTF8 that = (UTF8)o; 106 return WritableComparator.compareBytes(bytes, 0, length, 107 that.bytes, 0, that.length); 108 } 109 110 111 public String toString() { 112 StringBuffer buffer = new StringBuffer (length); 113 try { 114 synchronized (IBUF) { 115 IBUF.reset(bytes, length); 116 readChars(IBUF, buffer, length); 117 } 118 } catch (IOException e) { 119 throw new RuntimeException (e); 120 } 121 return buffer.toString(); 122 } 123 124 125 public boolean equals(Object o) { 126 if (!(o instanceof UTF8)) 127 return false; 128 UTF8 that = (UTF8)o; 129 if (this.length != that.length) 130 return false; 131 else 132 return WritableComparator.compareBytes(bytes, 0, length, 133 that.bytes, 0, that.length) == 0; 134 } 135 136 137 public static class Comparator extends WritableComparator { 138 public Comparator() { 139 super(UTF8.class); 140 } 141 142 public int compare(byte[] b1, int s1, int l1, 143 byte[] b2, int s2, int l2) { 144 int n1 = readUnsignedShort(b1, s1); 145 int n2 = readUnsignedShort(b2, s2); 146 return compareBytes(b1, s1+2, n1, b2, s2+2, n2); 147 } 148 } 149 150 152 154 157 public static byte[] getBytes(String string) { 158 byte[] result = new byte[utf8Length(string)]; 159 try { synchronized (OBUF) { 161 OBUF.reset(); 162 writeChars(OBUF, string, 0, string.length()); 163 System.arraycopy(OBUF.getData(), 0, result, 0, OBUF.getLength()); 164 } 165 } catch (IOException e) { 166 throw new RuntimeException (e); 167 } 168 return result; 169 } 170 171 175 public static String readString(DataInput in) throws IOException { 176 int bytes = in.readUnsignedShort(); 177 StringBuffer buffer = new StringBuffer (bytes); 178 readChars(in, buffer, bytes); 179 return buffer.toString(); 180 } 181 182 private static void readChars(DataInput in, StringBuffer buffer, int nBytes) 183 throws IOException { 184 synchronized (OBUF) { 185 OBUF.reset(); 186 OBUF.write(in, nBytes); 187 byte[] bytes = OBUF.getData(); 188 int i = 0; 189 while (i < nBytes) { 190 byte b = bytes[i++]; 191 if ((b & 0x80) == 0) { 192 buffer.append((char)(b & 0x7F)); 193 } else if ((b & 0xE0) != 0xE0) { 194 buffer.append((char)(((b & 0x1F) << 6) 195 | (bytes[i++] & 0x3F))); 196 } else { 197 buffer.append((char)(((b & 0x0F) << 12) 198 | ((bytes[i++] & 0x3F) << 6) 199 | (bytes[i++] & 0x3F))); 200 } 201 } 202 } 203 } 204 205 209 public static int writeString(DataOutput out, String s) throws IOException { 210 if (s.length() > 0xffff/3) { LOG.warning("truncating long string: " + s.length() 212 + " chars, starting with " + s.substring(0, 20)); 213 s = s.substring(0, 0xffff/3); 214 } 215 216 int len = utf8Length(s); 217 if (len > 0xffff) throw new IOException ("string too long!"); 219 220 out.writeShort(len); 221 writeChars(out, s, 0, s.length()); 222 return len; 223 } 224 225 226 private static int utf8Length(String string) { 227 int stringLength = string.length(); 228 int utf8Length = 0; 229 for (int i = 0; i < stringLength; i++) { 230 int c = string.charAt(i); 231 if ((c >= 0x0001) && (c <= 0x007F)) { 232 utf8Length++; 233 } else if (c > 0x07FF) { 234 utf8Length += 3; 235 } else { 236 utf8Length += 2; 237 } 238 } 239 return utf8Length; 240 } 241 242 private static void writeChars(DataOutput out, 243 String s, int start, int length) 244 throws IOException { 245 final int end = start + length; 246 for (int i = start; i < end; i++) { 247 int code = s.charAt(i); 248 if (code >= 0x01 && code <= 0x7F) { 249 out.writeByte((byte)code); 250 } else if (code <= 0x07FF) { 251 out.writeByte((byte)(0xC0 | ((code >> 6) & 0x1F))); 252 out.writeByte((byte)(0x80 | code & 0x3F)); 253 } else { 254 out.writeByte((byte)(0xE0 | ((code >> 12) & 0X0F))); 255 out.writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); 256 out.writeByte((byte)(0x80 | (code & 0x3F))); 257 } 258 } 259 } 260 261 } 262 | Popular Tags |