hadoop UTF8 源码

  • 2022-10-20
  • 浏览 (364)

hadoop UTF8 代码


/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.io;

import java.io.IOException;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.UTFDataFormatException;

import org.apache.hadoop.util.StringUtils;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** A WritableComparable for strings that uses the UTF8 encoding.
 * <p>Also includes utilities for efficiently reading and writing UTF-8.
 * Note that this decodes UTF-8 but actually encodes CESU-8, a variant of
 * UTF-8: see http://en.wikipedia.org/wiki/CESU-8
 * @deprecated replaced by Text
@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"})
public class UTF8 implements WritableComparable<UTF8> {
  private static final Logger LOG= LoggerFactory.getLogger(UTF8.class);
  private static final DataInputBuffer IBUF = new DataInputBuffer();

  private static final ThreadLocal<DataOutputBuffer> OBUF_FACTORY =
    new ThreadLocal<DataOutputBuffer>(){
    protected DataOutputBuffer initialValue() {
      return new DataOutputBuffer();

  private static final byte[] EMPTY_BYTES = new byte[0];

  private byte[] bytes = EMPTY_BYTES;
  private int length;

  public UTF8() {

   * Construct from a given string.
   * @param string input string.
  public UTF8(String string) {

   * Construct from a given string.
   * @param utf8 input utf8.
  public UTF8(UTF8 utf8) {

  /** @return The raw bytes. */
  public byte[] getBytes() {
    return bytes;

  /** @return The number of bytes in the encoded string. */
  public int getLength() {
    return length;

   * Set to contain the contents of a string.
   * @param string input string.
  public void set(String string) {
    if (string.length() > 0xffff/3) {             // maybe too long
      LOG.warn("truncating long string: " + string.length()
               + " chars, starting with " + string.substring(0, 20));
      string = string.substring(0, 0xffff/3);

    length = utf8Length(string);                  // compute length
    if (length > 0xffff)                          // double-check length
      throw new RuntimeException("string too long!");

    if (bytes == null || length > bytes.length)   // grow buffer
      bytes = new byte[length];

    try {                                         // avoid sync'd allocations
      DataOutputBuffer obuf = OBUF_FACTORY.get();
      writeChars(obuf, string, 0, string.length());
      System.arraycopy(obuf.getData(), 0, bytes, 0, length);
    } catch (IOException e) {
      throw new RuntimeException(e);

   * Set to contain the contents of a string.
   * @param other input other.
  public void set(UTF8 other) {
    length = other.length;
    if (bytes == null || length > bytes.length)   // grow buffer
      bytes = new byte[length];
    System.arraycopy(other.bytes, 0, bytes, 0, length);

  public void readFields(DataInput in) throws IOException {
    length = in.readUnsignedShort();
    if (bytes == null || bytes.length < length)
      bytes = new byte[length];
    in.readFully(bytes, 0, length);

   * Skips over one UTF8 in the input.
   * @param in datainput.
   * @throws IOException raised on errors performing I/O.
  public static void skip(DataInput in) throws IOException {
    int length = in.readUnsignedShort();
    WritableUtils.skipFully(in, length);

  public void write(DataOutput out) throws IOException {
    out.write(bytes, 0, length);

  /** Compare two UTF8s. */
  public int compareTo(UTF8 o) {
    return WritableComparator.compareBytes(bytes, 0, length,
                                           o.bytes, 0, o.length);

  /** Convert to a String. */
  public String toString() {
    StringBuilder buffer = new StringBuilder(length);
    try {
      synchronized (IBUF) {
        IBUF.reset(bytes, length);
        readChars(IBUF, buffer, length);
    } catch (IOException e) {
      throw new RuntimeException(e);
    return buffer.toString();
   * Convert to a string, checking for valid UTF8.
   * @return the converted string
   * @throws UTFDataFormatException if the underlying bytes contain invalid
   * UTF8 data.
  public String toStringChecked() throws IOException {
    StringBuilder buffer = new StringBuilder(length);
    synchronized (IBUF) {
      IBUF.reset(bytes, length);
      readChars(IBUF, buffer, length);
    return buffer.toString();

  /** Returns true iff <code>o</code> is a UTF8 with the same contents.  */
  public boolean equals(Object o) {
    if (!(o instanceof UTF8))
      return false;
    UTF8 that = (UTF8)o;
    if (this.length != that.length)
      return false;
      return WritableComparator.compareBytes(bytes, 0, length,
                                             that.bytes, 0, that.length) == 0;

  public int hashCode() {
    return WritableComparator.hashBytes(bytes, length);

  /** A WritableComparator optimized for UTF8 keys. */
  public static class Comparator extends WritableComparator {
    public Comparator() {

    public int compare(byte[] b1, int s1, int l1,
                       byte[] b2, int s2, int l2) {
      int n1 = readUnsignedShort(b1, s1);
      int n2 = readUnsignedShort(b2, s2);
      return compareBytes(b1, s1+2, n1, b2, s2+2, n2);

  static {                                        // register this comparator
    WritableComparator.define(UTF8.class, new Comparator());


  /// These are probably not used much anymore, and might be removed...

   * @return Convert a string to a UTF-8 encoded byte array.
   * @see String#getBytes(String)
   * @param string input string.
  public static byte[] getBytes(String string) {
    byte[] result = new byte[utf8Length(string)];
    try {                                         // avoid sync'd allocations
      DataOutputBuffer obuf = OBUF_FACTORY.get();
      writeChars(obuf, string, 0, string.length());
      System.arraycopy(obuf.getData(), 0, result, 0, obuf.getLength());
    } catch (IOException e) {
      throw new RuntimeException(e);
    return result;

   * @return Convert a UTF-8 encoded byte array back into a string.
   * @param bytes input bytes.
   * @throws IOException if the byte array is invalid UTF8
  public static String fromBytes(byte[] bytes) throws IOException {
    DataInputBuffer dbuf = new DataInputBuffer();
    dbuf.reset(bytes, 0, bytes.length);
    StringBuilder buf = new StringBuilder(bytes.length);
    readChars(dbuf, buf, bytes.length);
    return buf.toString();

   * @return Read a UTF-8 encoded string.
   * @see DataInput#readUTF()
   * @param in DataInput.
   * @throws IOException raised on errors performing I/O.
  public static String readString(DataInput in) throws IOException {
    int bytes = in.readUnsignedShort();
    StringBuilder buffer = new StringBuilder(bytes);
    readChars(in, buffer, bytes);
    return buffer.toString();

  private static void readChars(DataInput in, StringBuilder buffer, int nBytes)
    throws UTFDataFormatException, IOException {
    DataOutputBuffer obuf = OBUF_FACTORY.get();
    obuf.write(in, nBytes);
    byte[] bytes = obuf.getData();
    int i = 0;
    while (i < nBytes) {
      byte b = bytes[i++];
      if ((b & 0x80) == 0) {
        // 0b0xxxxxxx: 1-byte sequence
        buffer.append((char)(b & 0x7F));
      } else if ((b & 0xE0) == 0xC0) {
        if (i >= nBytes) {
          throw new UTFDataFormatException("Truncated UTF8 at " +
              StringUtils.byteToHexString(bytes, i - 1, 1));
        // 0b110xxxxx: 2-byte sequence
        buffer.append((char)(((b & 0x1F) << 6)
            | (bytes[i++] & 0x3F)));
      } else if ((b & 0xF0) == 0xE0) {
        // 0b1110xxxx: 3-byte sequence
        if (i + 1 >= nBytes) {
          throw new UTFDataFormatException("Truncated UTF8 at " +
              StringUtils.byteToHexString(bytes, i - 1, 2));
        buffer.append((char)(((b & 0x0F) << 12)
            | ((bytes[i++] & 0x3F) << 6)
            |  (bytes[i++] & 0x3F)));
      } else if ((b & 0xF8) == 0xF0) {
        if (i + 2 >= nBytes) {
          throw new UTFDataFormatException("Truncated UTF8 at " +
              StringUtils.byteToHexString(bytes, i - 1, 3));
        // 0b11110xxx: 4-byte sequence
        int codepoint =
            ((b & 0x07) << 18)
          | ((bytes[i++] & 0x3F) <<  12)
          | ((bytes[i++] & 0x3F) <<  6)
          | ((bytes[i++] & 0x3F));
      } else {
        // The UTF8 standard describes 5-byte and 6-byte sequences, but
        // these are no longer allowed as of 2003 (see RFC 3629)

        // Only show the next 6 bytes max in the error code - in case the
        // buffer is large, this will prevent an exceedingly large message.
        int endForError = Math.min(i + 5, nBytes);
        throw new UTFDataFormatException("Invalid UTF8 at " +
            StringUtils.byteToHexString(bytes, i - 1, endForError));

  private static char highSurrogate(int codePoint) {
    return (char) ((codePoint >>> 10)
        + (Character.MIN_HIGH_SURROGATE - (Character.MIN_SUPPLEMENTARY_CODE_POINT >>> 10)));

  private static char lowSurrogate(int codePoint) {
    return (char) ((codePoint & 0x3ff) + Character.MIN_LOW_SURROGATE);

   * @return Write a UTF-8 encoded string.
   * @see DataOutput#writeUTF(String)
   * @param out input out.
   * @param s input s.
   * @throws IOException raised on errors performing I/O.
  public static int writeString(DataOutput out, String s) throws IOException {
    if (s.length() > 0xffff/3) {         // maybe too long
      LOG.warn("truncating long string: " + s.length()
               + " chars, starting with " + s.substring(0, 20));
      s = s.substring(0, 0xffff/3);

    int len = utf8Length(s);
    if (len > 0xffff)                             // double-check length
      throw new IOException("string too long!");
    writeChars(out, s, 0, s.length());
    return len;

  /** Returns the number of bytes required to write this. */
  private static int utf8Length(String string) {
    int stringLength = string.length();
    int utf8Length = 0;
    for (int i = 0; i < stringLength; i++) {
      int c = string.charAt(i);
      if (c <= 0x007F) {
      } else if (c > 0x07FF) {
        utf8Length += 3;
      } else {
        utf8Length += 2;
    return utf8Length;

  private static void writeChars(DataOutput out,
                                 String s, int start, int length)
    throws IOException {
    final int end = start + length;
    for (int i = start; i < end; i++) {
      int code = s.charAt(i);
      if (code <= 0x7F) {
      } else if (code <= 0x07FF) {
        out.writeByte((byte)(0xC0 | ((code >> 6) & 0x1F)));
        out.writeByte((byte)(0x80 |   code       & 0x3F));
      } else {
        out.writeByte((byte)(0xE0 | ((code >> 12) & 0X0F)));
        out.writeByte((byte)(0x80 | ((code >>  6) & 0x3F)));
        out.writeByte((byte)(0x80 |  (code        & 0x3F)));



hadoop 源码目录


hadoop AbstractMapWritable 源码

hadoop ArrayFile 源码

hadoop ArrayPrimitiveWritable 源码

hadoop ArrayWritable 源码

hadoop BinaryComparable 源码

hadoop BloomMapFile 源码

hadoop BooleanWritable 源码

hadoop BoundedByteArrayOutputStream 源码

hadoop ByteBufferPool 源码

hadoop ByteWritable 源码

0  赞