spark UTF8String 源码

  • 2022-10-20
  • 浏览 (283)

spark UTF8String 代码

文件路径:/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.unsafe.types;

import javax.annotation.Nonnull;
import java.io.*;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Map;
import java.util.regex.Pattern;

import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.KryoSerializable;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;

import org.apache.spark.unsafe.Platform;
import org.apache.spark.unsafe.UTF8StringBuilder;
import org.apache.spark.unsafe.array.ByteArrayMethods;
import org.apache.spark.unsafe.hash.Murmur3_x86_32;

import static org.apache.spark.unsafe.Platform.*;


/**
 * A UTF-8 String for internal Spark use.
 * <p>
 * A String encoded in UTF-8 as an Array[Byte], which can be used for comparison,
 * search, see http://en.wikipedia.org/wiki/UTF-8 for details.
 * <p>
 * Note: This is not designed for general use cases, should not be used outside SQL.
 */
public final class UTF8String implements Comparable<UTF8String>, Externalizable, KryoSerializable,
  Cloneable {

  // These are only updated by readExternal() or read()
  @Nonnull
  private Object base;
  private long offset;
  private int numBytes;

  public Object getBaseObject() { return base; }
  public long getBaseOffset() { return offset; }

  /**
   * A char in UTF-8 encoding can take 1-4 bytes depending on the first byte which
   * indicates the size of the char. See Unicode standard in page 126, Table 3-6:
   * http://www.unicode.org/versions/Unicode10.0.0/UnicodeStandard-10.0.pdf
   *
   * Binary    Hex          Comments
   * 0xxxxxxx  0x00..0x7F   Only byte of a 1-byte character encoding
   * 10xxxxxx  0x80..0xBF   Continuation bytes (1-3 continuation bytes)
   * 110xxxxx  0xC0..0xDF   First byte of a 2-byte character encoding
   * 1110xxxx  0xE0..0xEF   First byte of a 3-byte character encoding
   * 11110xxx  0xF0..0xF7   First byte of a 4-byte character encoding
   *
   * As a consequence of the well-formedness conditions specified in
   * Table 3-7 (page 126), the following byte values are disallowed in UTF-8:
   *   C0–C1, F5–FF.
   */
  private static byte[] bytesOfCodePointInUTF8 = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00..0x0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10..0x1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20..0x2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30..0x3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40..0x4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50..0x5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60..0x6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70..0x7F
    // Continuation bytes cannot appear as the first byte
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80..0x8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90..0x9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0..0xAF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0..0xBF
    0, 0, // 0xC0..0xC1 - disallowed in UTF-8
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC2..0xCF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0..0xDF
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0..0xEF
    4, 4, 4, 4, 4, // 0xF0..0xF4
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 0xF5..0xFF - disallowed in UTF-8
  };

  private static final UTF8String COMMA_UTF8 = UTF8String.fromString(",");
  public static final UTF8String EMPTY_UTF8 = UTF8String.fromString("");

  /**
   * Creates an UTF8String from byte array, which should be encoded in UTF-8.
   *
   * Note: `bytes` will be hold by returned UTF8String.
   */
  public static UTF8String fromBytes(byte[] bytes) {
    if (bytes != null) {
      return new UTF8String(bytes, BYTE_ARRAY_OFFSET, bytes.length);
    } else {
      return null;
    }
  }

  /**
   * Creates an UTF8String from byte array, which should be encoded in UTF-8.
   *
   * Note: `bytes` will be hold by returned UTF8String.
   */
  public static UTF8String fromBytes(byte[] bytes, int offset, int numBytes) {
    if (bytes != null) {
      return new UTF8String(bytes, BYTE_ARRAY_OFFSET + offset, numBytes);
    } else {
      return null;
    }
  }

  /**
   * Creates an UTF8String from given address (base and offset) and length.
   */
  public static UTF8String fromAddress(Object base, long offset, int numBytes) {
    return new UTF8String(base, offset, numBytes);
  }

  /**
   * Creates an UTF8String from String.
   */
  public static UTF8String fromString(String str) {
    return str == null ? null : fromBytes(str.getBytes(StandardCharsets.UTF_8));
  }

  /**
   * Creates an UTF8String that contains `length` spaces.
   */
  public static UTF8String blankString(int length) {
    byte[] spaces = new byte[length];
    Arrays.fill(spaces, (byte) ' ');
    return fromBytes(spaces);
  }

  private UTF8String(Object base, long offset, int numBytes) {
    this.base = base;
    this.offset = offset;
    this.numBytes = numBytes;
  }

  // for serialization
  public UTF8String() {
    this(null, 0, 0);
  }

  /**
   * Writes the content of this string into a memory address, identified by an object and an offset.
   * The target memory address must already been allocated, and have enough space to hold all the
   * bytes in this string.
   */
  public void writeToMemory(Object target, long targetOffset) {
    Platform.copyMemory(base, offset, target, targetOffset, numBytes);
  }

  public void writeTo(ByteBuffer buffer) {
    assert(buffer.hasArray());
    byte[] target = buffer.array();
    int offset = buffer.arrayOffset();
    int pos = buffer.position();
    writeToMemory(target, Platform.BYTE_ARRAY_OFFSET + offset + pos);
    buffer.position(pos + numBytes);
  }

  /**
   * Returns a {@link ByteBuffer} wrapping the base object if it is a byte array
   * or a copy of the data if the base object is not a byte array.
   *
   * Unlike getBytes this will not create a copy the array if this is a slice.
   */
  @Nonnull
  public ByteBuffer getByteBuffer() {
    if (base instanceof byte[] && offset >= BYTE_ARRAY_OFFSET) {
      final byte[] bytes = (byte[]) base;

      // the offset includes an object header... this is only needed for unsafe copies
      final long arrayOffset = offset - BYTE_ARRAY_OFFSET;

      // verify that the offset and length points somewhere inside the byte array
      // and that the offset can safely be truncated to a 32-bit integer
      if ((long) bytes.length < arrayOffset + numBytes) {
        throw new ArrayIndexOutOfBoundsException();
      }

      return ByteBuffer.wrap(bytes, (int) arrayOffset, numBytes);
    } else {
      return ByteBuffer.wrap(getBytes());
    }
  }

  public void writeTo(OutputStream out) throws IOException {
    final ByteBuffer bb = this.getByteBuffer();
    assert(bb.hasArray());

    // similar to Utils.writeByteBuffer but without the spark-core dependency
    out.write(bb.array(), bb.arrayOffset() + bb.position(), bb.remaining());
  }

  /**
   * Returns the number of bytes for a code point with the first byte as `b`
   * @param b The first byte of a code point
   */
  private static int numBytesForFirstByte(final byte b) {
    final int offset = b & 0xFF;
    byte numBytes = bytesOfCodePointInUTF8[offset];
    return (numBytes == 0) ? 1: numBytes; // Skip the first byte disallowed in UTF-8
  }

  /**
   * Returns the number of bytes
   */
  public int numBytes() {
    return numBytes;
  }

  /**
   * Returns the number of code points in it.
   */
  public int numChars() {
    int len = 0;
    for (int i = 0; i < numBytes; i += numBytesForFirstByte(getByte(i))) {
      len += 1;
    }
    return len;
  }

  /**
   * Returns a 64-bit integer that can be used as the prefix used in sorting.
   */
  public long getPrefix() {
    return ByteArray.getPrefix(base, offset, numBytes);
  }

  /**
   * Returns the underline bytes, will be a copy of it if it's part of another array.
   */
  public byte[] getBytes() {
    // avoid copy if `base` is `byte[]`
    if (offset == BYTE_ARRAY_OFFSET && base instanceof byte[]
      && ((byte[]) base).length == numBytes) {
      return (byte[]) base;
    } else {
      byte[] bytes = new byte[numBytes];
      copyMemory(base, offset, bytes, BYTE_ARRAY_OFFSET, numBytes);
      return bytes;
    }
  }

  /**
   * Returns a substring of this.
   * @param start the position of first code point
   * @param until the position after last code point, exclusive.
   */
  public UTF8String substring(final int start, final int until) {
    if (until <= start || start >= numBytes) {
      return EMPTY_UTF8;
    }

    int i = 0;
    int c = 0;
    while (i < numBytes && c < start) {
      i += numBytesForFirstByte(getByte(i));
      c += 1;
    }

    int j = i;
    while (i < numBytes && c < until) {
      i += numBytesForFirstByte(getByte(i));
      c += 1;
    }

    if (i > j) {
      byte[] bytes = new byte[i - j];
      copyMemory(base, offset + j, bytes, BYTE_ARRAY_OFFSET, i - j);
      return fromBytes(bytes);
    } else {
      return EMPTY_UTF8;
    }
  }

  public UTF8String substringSQL(int pos, int length) {
    // Information regarding the pos calculation:
    // Hive and SQL use one-based indexing for SUBSTR arguments but also accept zero and
    // negative indices for start positions. If a start index i is greater than 0, it
    // refers to element i-1 in the sequence. If a start index i is less than 0, it refers
    // to the -ith element before the end of the sequence. If a start index i is 0, it
    // refers to the first element.
    int len = numChars();
    // `len + pos` does not overflow as `len >= 0`.
    int start = (pos > 0) ? pos -1 : ((pos < 0) ? len + pos : 0);

    int end;
    if ((long) start + length > Integer.MAX_VALUE) {
      end = Integer.MAX_VALUE;
    } else if ((long) start + length < Integer.MIN_VALUE) {
      end = Integer.MIN_VALUE;
    } else {
      end = start + length;
    }
    return substring(start, end);
  }

  /**
   * Returns whether this contains `substring` or not.
   */
  public boolean contains(final UTF8String substring) {
    if (substring.numBytes == 0) {
      return true;
    }

    byte first = substring.getByte(0);
    for (int i = 0; i <= numBytes - substring.numBytes; i++) {
      if (getByte(i) == first && matchAt(substring, i)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Returns the byte at position `i`.
   */
  private byte getByte(int i) {
    return Platform.getByte(base, offset + i);
  }

  public boolean matchAt(final UTF8String s, int pos) {
    if (s.numBytes + pos > numBytes || pos < 0) {
      return false;
    }
    return ByteArrayMethods.arrayEquals(base, offset + pos, s.base, s.offset, s.numBytes);
  }

  public boolean startsWith(final UTF8String prefix) {
    return matchAt(prefix, 0);
  }

  public boolean endsWith(final UTF8String suffix) {
    return matchAt(suffix, numBytes - suffix.numBytes);
  }

  /**
   * Returns the upper case of this string
   */
  public UTF8String toUpperCase() {
    if (numBytes == 0) {
      return EMPTY_UTF8;
    }

    byte[] bytes = new byte[numBytes];
    bytes[0] = (byte) Character.toTitleCase(getByte(0));
    for (int i = 0; i < numBytes; i++) {
      byte b = getByte(i);
      if (numBytesForFirstByte(b) != 1) {
        // fallback
        return toUpperCaseSlow();
      }
      int upper = Character.toUpperCase(b);
      if (upper > 127) {
        // fallback
        return toUpperCaseSlow();
      }
      bytes[i] = (byte) upper;
    }
    return fromBytes(bytes);
  }

  private UTF8String toUpperCaseSlow() {
    return fromString(toString().toUpperCase());
  }

  /**
   * Returns the lower case of this string
   */
  public UTF8String toLowerCase() {
    if (numBytes == 0) {
      return EMPTY_UTF8;
    }

    byte[] bytes = new byte[numBytes];
    bytes[0] = (byte) Character.toTitleCase(getByte(0));
    for (int i = 0; i < numBytes; i++) {
      byte b = getByte(i);
      if (numBytesForFirstByte(b) != 1) {
        // fallback
        return toLowerCaseSlow();
      }
      int lower = Character.toLowerCase(b);
      if (lower > 127) {
        // fallback
        return toLowerCaseSlow();
      }
      bytes[i] = (byte) lower;
    }
    return fromBytes(bytes);
  }

  private UTF8String toLowerCaseSlow() {
    return fromString(toString().toLowerCase());
  }

  /**
   * Returns the title case of this string, that could be used as title.
   */
  public UTF8String toTitleCase() {
    if (numBytes == 0) {
      return EMPTY_UTF8;
    }

    byte[] bytes = new byte[numBytes];
    for (int i = 0; i < numBytes; i++) {
      byte b = getByte(i);
      if (i == 0 || getByte(i - 1) == ' ') {
        if (numBytesForFirstByte(b) != 1) {
          // fallback
          return toTitleCaseSlow();
        }
        int upper = Character.toTitleCase(b);
        if (upper > 127) {
          // fallback
          return toTitleCaseSlow();
        }
        bytes[i] = (byte) upper;
      } else {
        bytes[i] = b;
      }
    }
    return fromBytes(bytes);
  }

  private UTF8String toTitleCaseSlow() {
    StringBuilder sb = new StringBuilder();
    String s = toString();
    sb.append(s);
    sb.setCharAt(0, Character.toTitleCase(sb.charAt(0)));
    for (int i = 1; i < s.length(); i++) {
      if (sb.charAt(i - 1) == ' ') {
        sb.setCharAt(i, Character.toTitleCase(sb.charAt(i)));
      }
    }
    return fromString(sb.toString());
  }

  /*
   * Returns the index of the string `match` in this String. This string has to be a comma separated
   * list. If `match` contains a comma 0 will be returned. If the `match` isn't part of this String,
   * 0 will be returned, else the index of match (1-based index)
   */
  public int findInSet(UTF8String match) {
    if (match.contains(COMMA_UTF8)) {
      return 0;
    }

    int n = 1, lastComma = -1;
    for (int i = 0; i < numBytes; i++) {
      if (getByte(i) == (byte) ',') {
        if (i - (lastComma + 1) == match.numBytes &&
          ByteArrayMethods.arrayEquals(base, offset + (lastComma + 1), match.base, match.offset,
            match.numBytes)) {
          return n;
        }
        lastComma = i;
        n++;
      }
    }
    if (numBytes - (lastComma + 1) == match.numBytes &&
      ByteArrayMethods.arrayEquals(base, offset + (lastComma + 1), match.base, match.offset,
        match.numBytes)) {
      return n;
    }
    return 0;
  }

  /**
   * Copy the bytes from the current UTF8String, and make a new UTF8String.
   * @param start the start position of the current UTF8String in bytes.
   * @param end the end position of the current UTF8String in bytes.
   * @return a new UTF8String in the position of [start, end] of current UTF8String bytes.
   */
  private UTF8String copyUTF8String(int start, int end) {
    int len = end - start + 1;
    byte[] newBytes = new byte[len];
    copyMemory(base, offset + start, newBytes, BYTE_ARRAY_OFFSET, len);
    return UTF8String.fromBytes(newBytes);
  }

  /**
   * Trims space characters (ASCII 32) from both ends of this string.
   *
   * @return this string with no spaces at the start or end
   */
  public UTF8String trim() {
    int s = 0;
    // skip all of the space (0x20) in the left side
    while (s < this.numBytes && getByte(s) == ' ') s++;
    if (s == this.numBytes) {
      // Everything trimmed
      return EMPTY_UTF8;
    }
    // skip all of the space (0x20) in the right side
    int e = this.numBytes - 1;
    while (e > s && getByte(e) == ' ') e--;
    if (s == 0 && e == numBytes - 1) {
      // Nothing trimmed
      return this;
    }
    return copyUTF8String(s, e);
  }

  /**
   * Trims whitespace ASCII characters from both ends of this string.
   *
   * Note that, this method is different from {@link UTF8String#trim()} which removes
   * only spaces(= ASCII 32) from both ends.
   *
   * @return A UTF8String whose value is this UTF8String, with any leading and trailing white
   * space removed, or this UTF8String if it has no leading or trailing whitespace.
   *
   */
  public UTF8String trimAll() {
    int s = 0;
    // skip all of the whitespaces in the left side
    while (s < this.numBytes && Character.isWhitespace(getByte(s))) s++;
    if (s == this.numBytes) {
      // Everything trimmed
      return EMPTY_UTF8;
    }
    // skip all of the whitespaces in the right side
    int e = this.numBytes - 1;
    while (e > s && Character.isWhitespace(getByte(e))) e--;
    if (s == 0 && e == numBytes - 1) {
      // Nothing trimmed
      return this;
    }
    return copyUTF8String(s, e);
  }

  /**
   * Trims instances of the given trim string from both ends of this string.
   *
   * @param trimString the trim character string
   * @return this string with no occurrences of the trim string at the start or end, or `null`
   *  if `trimString` is `null`
   */
  public UTF8String trim(UTF8String trimString) {
    if (trimString != null) {
      return trimLeft(trimString).trimRight(trimString);
    } else {
      return null;
    }
  }

  /**
   * Trims space characters (ASCII 32) from the start of this string.
   *
   * @return this string with no spaces at the start
   */
  public UTF8String trimLeft() {
    int s = 0;
    // skip all of the space (0x20) in the left side
    while (s < this.numBytes && getByte(s) == 0x20) s++;
    if (s == 0) {
      // Nothing trimmed
      return this;
    }
    if (s == this.numBytes) {
      // Everything trimmed
      return EMPTY_UTF8;
    }
    return copyUTF8String(s, this.numBytes - 1);
  }

  /**
   * Trims instances of the given trim string from the start of this string.
   *
   * @param trimString the trim character string
   * @return this string with no occurrences of the trim string at the start, or `null`
   *  if `trimString` is `null`
   */
  public UTF8String trimLeft(UTF8String trimString) {
    if (trimString == null) return null;
    // the searching byte position in the source string
    int searchIdx = 0;
    // the first beginning byte position of a non-matching character
    int trimIdx = 0;

    while (searchIdx < numBytes) {
      UTF8String searchChar = copyUTF8String(
          searchIdx, searchIdx + numBytesForFirstByte(this.getByte(searchIdx)) - 1);
      int searchCharBytes = searchChar.numBytes;
      // try to find the matching for the searchChar in the trimString set
      if (trimString.find(searchChar, 0) >= 0) {
        trimIdx += searchCharBytes;
      } else {
        // no matching, exit the search
        break;
      }
      searchIdx += searchCharBytes;
    }
    if (searchIdx == 0) {
      // Nothing trimmed
      return this;
    }
    if (trimIdx >= numBytes) {
      // Everything trimmed
      return EMPTY_UTF8;
    }
    return copyUTF8String(trimIdx, numBytes - 1);
  }

  /**
   * Trims space characters (ASCII 32) from the end of this string.
   *
   * @return this string with no spaces at the end
   */
  public UTF8String trimRight() {
    int e = numBytes - 1;
    // skip all of the space (0x20) in the right side
    while (e >= 0 && getByte(e) == 0x20) e--;
    if (e == numBytes - 1) {
      // Nothing trimmed
      return this;
    }
    if (e < 0) {
      // Everything trimmed
      return EMPTY_UTF8;
    }
    return copyUTF8String(0, e);
  }

  /**
   * Trims at most `numSpaces` space characters (ASCII 32) from the end of this string.
   */
  public UTF8String trimTrailingSpaces(int numSpaces) {
    assert numSpaces > 0;
    int endIdx = numBytes - 1;
    int trimTo = numBytes - numSpaces;
    while (endIdx >= trimTo && getByte(endIdx) == 0x20) endIdx--;
    return copyUTF8String(0, endIdx);
  }

  /**
   * Trims instances of the given trim string from the end of this string.
   *
   * @param trimString the trim character string
   * @return this string with no occurrences of the trim string at the end, or `null`
   *  if `trimString` is `null`
   */
  public UTF8String trimRight(UTF8String trimString) {
    if (trimString == null) return null;
    int charIdx = 0;
    // number of characters from the source string
    int numChars = 0;
    // array of character length for the source string
    int[] stringCharLen = new int[numBytes];
    // array of the first byte position for each character in the source string
    int[] stringCharPos = new int[numBytes];
    // build the position and length array
    while (charIdx < numBytes) {
      stringCharPos[numChars] = charIdx;
      stringCharLen[numChars] = numBytesForFirstByte(getByte(charIdx));
      charIdx += stringCharLen[numChars];
      numChars ++;
    }

    // index trimEnd points to the first no matching byte position from the right side of
    // the source string.
    int trimEnd = numBytes - 1;
    while (numChars > 0) {
      UTF8String searchChar = copyUTF8String(
          stringCharPos[numChars - 1],
          stringCharPos[numChars - 1] + stringCharLen[numChars - 1] - 1);
      if (trimString.find(searchChar, 0) >= 0) {
        trimEnd -= stringCharLen[numChars - 1];
      } else {
        break;
      }
      numChars --;
    }

    if (trimEnd == numBytes - 1) {
      // Nothing trimmed
      return this;
    }
    if (trimEnd < 0) {
      // Everything trimmed
      return EMPTY_UTF8;
    }
    return copyUTF8String(0, trimEnd);
  }

  public UTF8String reverse() {
    byte[] result = new byte[this.numBytes];

    int i = 0; // position in byte
    while (i < numBytes) {
      int len = numBytesForFirstByte(getByte(i));
      copyMemory(this.base, this.offset + i, result,
        BYTE_ARRAY_OFFSET + result.length - i - len, len);

      i += len;
    }

    return UTF8String.fromBytes(result);
  }

  public UTF8String repeat(int times) {
    if (times <= 0) {
      return EMPTY_UTF8;
    }

    byte[] newBytes = new byte[Math.multiplyExact(numBytes, times)];
    copyMemory(this.base, this.offset, newBytes, BYTE_ARRAY_OFFSET, numBytes);

    int copied = 1;
    while (copied < times) {
      int toCopy = Math.min(copied, times - copied);
      System.arraycopy(newBytes, 0, newBytes, copied * numBytes, numBytes * toCopy);
      copied += toCopy;
    }

    return UTF8String.fromBytes(newBytes);
  }

  /**
   * Returns the position of the first occurrence of substr in
   * current string from the specified position (0-based index).
   *
   * @param v the string to be searched
   * @param start the start position of the current string for searching
   * @return the position of the first occurrence of substr, if not found, -1 returned.
   */
  public int indexOf(UTF8String v, int start) {
    if (v.numBytes() == 0) {
      return 0;
    }

    // locate to the start position.
    int i = 0; // position in byte
    int c = 0; // position in character
    while (i < numBytes && c < start) {
      i += numBytesForFirstByte(getByte(i));
      c += 1;
    }

    do {
      if (i + v.numBytes > numBytes) {
        return -1;
      }
      if (ByteArrayMethods.arrayEquals(base, offset + i, v.base, v.offset, v.numBytes)) {
        return c;
      }
      i += numBytesForFirstByte(getByte(i));
      c += 1;
    } while (i < numBytes);

    return -1;
  }

  /**
   * Find the `str` from left to right.
   */
  private int find(UTF8String str, int start) {
    assert (str.numBytes > 0);
    while (start <= numBytes - str.numBytes) {
      if (ByteArrayMethods.arrayEquals(base, offset + start, str.base, str.offset, str.numBytes)) {
        return start;
      }
      start += 1;
    }
    return -1;
  }

  /**
   * Find the `str` from right to left.
   */
  private int rfind(UTF8String str, int start) {
    assert (str.numBytes > 0);
    while (start >= 0) {
      if (ByteArrayMethods.arrayEquals(base, offset + start, str.base, str.offset, str.numBytes)) {
        return start;
      }
      start -= 1;
    }
    return -1;
  }

  /**
   * Returns the substring from string str before count occurrences of the delimiter delim.
   * If count is positive, everything the left of the final delimiter (counting from left) is
   * returned. If count is negative, every to the right of the final delimiter (counting from the
   * right) is returned. subStringIndex performs a case-sensitive match when searching for delim.
   */
  public UTF8String subStringIndex(UTF8String delim, int count) {
    if (delim.numBytes == 0 || count == 0) {
      return EMPTY_UTF8;
    }
    if (count > 0) {
      int idx = -1;
      while (count > 0) {
        idx = find(delim, idx + 1);
        if (idx >= 0) {
          count --;
        } else {
          // can not find enough delim
          return this;
        }
      }
      if (idx == 0) {
        return EMPTY_UTF8;
      }
      byte[] bytes = new byte[idx];
      copyMemory(base, offset, bytes, BYTE_ARRAY_OFFSET, idx);
      return fromBytes(bytes);

    } else {
      int idx = numBytes - delim.numBytes + 1;
      count = -count;
      while (count > 0) {
        idx = rfind(delim, idx - 1);
        if (idx >= 0) {
          count --;
        } else {
          // can not find enough delim
          return this;
        }
      }
      if (idx + delim.numBytes == numBytes) {
        return EMPTY_UTF8;
      }
      int size = numBytes - delim.numBytes - idx;
      byte[] bytes = new byte[size];
      copyMemory(base, offset + idx + delim.numBytes, bytes, BYTE_ARRAY_OFFSET, size);
      return fromBytes(bytes);
    }
  }

  /**
   * Returns str, right-padded with pad to a length of len
   * For example:
   *   ('hi', 5, '??') =&gt; 'hi???'
   *   ('hi', 1, '??') =&gt; 'h'
   */
  public UTF8String rpad(int len, UTF8String pad) {
    int spaces = len - this.numChars(); // number of char need to pad
    if (spaces <= 0 || pad.numBytes() == 0) {
      // no padding at all, return the substring of the current string
      return substring(0, len);
    } else {
      int padChars = pad.numChars();
      int count = spaces / padChars; // how many padding string needed
      // the partial string of the padding
      UTF8String remain = pad.substring(0, spaces - padChars * count);

      int resultSize =
        Math.toIntExact((long) numBytes + (long) pad.numBytes * count + remain.numBytes);
      byte[] data = new byte[resultSize];
      copyMemory(this.base, this.offset, data, BYTE_ARRAY_OFFSET, this.numBytes);
      int offset = this.numBytes;
      int idx = 0;
      while (idx < count) {
        copyMemory(pad.base, pad.offset, data, BYTE_ARRAY_OFFSET + offset, pad.numBytes);
        ++ idx;
        offset += pad.numBytes;
      }
      copyMemory(remain.base, remain.offset, data, BYTE_ARRAY_OFFSET + offset, remain.numBytes);

      return UTF8String.fromBytes(data);
    }
  }

  /**
   * Returns str, left-padded with pad to a length of len.
   * For example:
   *   ('hi', 5, '??') =&gt; '???hi'
   *   ('hi', 1, '??') =&gt; 'h'
   */
  public UTF8String lpad(int len, UTF8String pad) {
    int spaces = len - this.numChars(); // number of char need to pad
    if (spaces <= 0 || pad.numBytes() == 0) {
      // no padding at all, return the substring of the current string
      return substring(0, len);
    } else {
      int padChars = pad.numChars();
      int count = spaces / padChars; // how many padding string needed
      // the partial string of the padding
      UTF8String remain = pad.substring(0, spaces - padChars * count);

      int resultSize =
        Math.toIntExact((long) numBytes + (long) pad.numBytes * count + remain.numBytes);
      byte[] data = new byte[resultSize];

      int offset = 0;
      int idx = 0;
      while (idx < count) {
        copyMemory(pad.base, pad.offset, data, BYTE_ARRAY_OFFSET + offset, pad.numBytes);
        ++ idx;
        offset += pad.numBytes;
      }
      copyMemory(remain.base, remain.offset, data, BYTE_ARRAY_OFFSET + offset, remain.numBytes);
      offset += remain.numBytes;
      copyMemory(this.base, this.offset, data, BYTE_ARRAY_OFFSET + offset, numBytes());

      return UTF8String.fromBytes(data);
    }
  }

  /**
   * Concatenates input strings together into a single string. Returns null if any input is null.
   */
  public static UTF8String concat(UTF8String... inputs) {
    // Compute the total length of the result.
    long totalLength = 0;
    for (UTF8String input : inputs) {
      if (input == null) {
        return null;
      }
      totalLength += input.numBytes;
    }

    // Allocate a new byte array, and copy the inputs one by one into it.
    final byte[] result = new byte[Math.toIntExact(totalLength)];
    int offset = 0;
    for (UTF8String input : inputs) {
      int len = input.numBytes;
      copyMemory(
        input.base, input.offset,
        result, BYTE_ARRAY_OFFSET + offset,
        len);
      offset += len;
    }
    return fromBytes(result);
  }

  /**
   * Concatenates input strings together into a single string using the separator.
   * A null input is skipped. For example, concat(",", "a", null, "c") would yield "a,c".
   */
  public static UTF8String concatWs(UTF8String separator, UTF8String... inputs) {
    if (separator == null) {
      return null;
    }

    long numInputBytes = 0L;  // total number of bytes from the inputs
    int numInputs = 0;      // number of non-null inputs
    for (UTF8String input : inputs) {
      if (input != null) {
        numInputBytes += input.numBytes;
        numInputs++;
      }
    }

    if (numInputs == 0) {
      // Return an empty string if there is no input, or all the inputs are null.
      return EMPTY_UTF8;
    }

    // Allocate a new byte array, and copy the inputs one by one into it.
    // The size of the new array is the size of all inputs, plus the separators.
    int resultSize = Math.toIntExact(numInputBytes + (numInputs - 1) * (long)separator.numBytes);
    final byte[] result = new byte[resultSize];
    int offset = 0;

    for (int i = 0, j = 0; i < inputs.length; i++) {
      if (inputs[i] != null) {
        int len = inputs[i].numBytes;
        copyMemory(
          inputs[i].base, inputs[i].offset,
          result, BYTE_ARRAY_OFFSET + offset,
          len);
        offset += len;

        j++;
        // Add separator if this is not the last input.
        if (j < numInputs) {
          copyMemory(
            separator.base, separator.offset,
            result, BYTE_ARRAY_OFFSET + offset,
            separator.numBytes);
          offset += separator.numBytes;
        }
      }
    }
    return fromBytes(result);
  }

  public UTF8String[] split(UTF8String pattern, int limit) {
    // For the empty `pattern` a `split` function ignores trailing empty strings unless original
    // string is empty.
    if (numBytes() != 0 && pattern.numBytes() == 0) {
      int newLimit = limit > numChars() || limit <= 0 ? numChars() : limit;
      byte[] input = getBytes();
      int byteIndex = 0;
      int charIndex = 0;
      UTF8String[] result = new UTF8String[newLimit];
      while (charIndex < newLimit) {
        int currCharNumBytes = numBytesForFirstByte(input[byteIndex]);
        result[charIndex++] = UTF8String.fromBytes(input, byteIndex, currCharNumBytes);
        byteIndex += currCharNumBytes;
      }
      return result;
    }
    return split(pattern.toString(), limit);
  }

  public UTF8String[] splitSQL(UTF8String delimiter, int limit) {
    // if delimiter is empty string, skip the regex based splitting directly as regex
    // treats empty string as matching anything, thus use the input directly.
    if (delimiter.numBytes() == 0) {
      return new UTF8String[]{this};
    } else {
      // we do not treat delimiter as a regex but consider the whole string of delimiter
      // as the separator to split string. Java String's split, however, only accept
      // regex as the pattern to split, thus we can quote the delimiter to escape special
      // characters in the string.
      return split(Pattern.quote(delimiter.toString()), limit);
    }
  }

  private UTF8String[] split(String delimiter, int limit) {
    // Java String's split method supports "ignore empty string" behavior when the limit is 0
    // whereas other languages do not. To avoid this java specific behavior, we fall back to
    // -1 when the limit is 0.
    if (limit == 0) {
      limit = -1;
    }
    String[] splits = toString().split(delimiter, limit);
    UTF8String[] res = new UTF8String[splits.length];
    for (int i = 0; i < res.length; i++) {
      res[i] = fromString(splits[i]);
    }
    return res;
  }

  public UTF8String replace(UTF8String search, UTF8String replace) {
    // This implementation is loosely based on commons-lang3's StringUtils.replace().
    if (numBytes == 0 || search.numBytes == 0) {
      return this;
    }
    // Find the first occurrence of the search string.
    int start = 0;
    int end = this.find(search, start);
    if (end == -1) {
      // Search string was not found, so string is unchanged.
      return this;
    }
    // At least one match was found. Estimate space needed for result.
    // The 16x multiplier here is chosen to match commons-lang3's implementation.
    int increase = Math.max(0, replace.numBytes - search.numBytes) * 16;
    final UTF8StringBuilder buf = new UTF8StringBuilder(numBytes + increase);
    while (end != -1) {
      buf.appendBytes(this.base, this.offset + start, end - start);
      buf.append(replace);
      start = end + search.numBytes;
      end = this.find(search, start);
    }
    buf.appendBytes(this.base, this.offset + start, numBytes - start);
    return buf.build();
  }

  public UTF8String translate(Map<String, String> dict) {
    String srcStr = this.toString();

    StringBuilder sb = new StringBuilder();
    int charCount = 0;
    for (int k = 0; k < srcStr.length(); k += charCount) {
      int codePoint = srcStr.codePointAt(k);
      charCount = Character.charCount(codePoint);
      String subStr = srcStr.substring(k, k + charCount);
      String translated = dict.get(subStr);
      if (null == translated) {
        sb.append(subStr);
      } else if (!"\0".equals(translated)) {
        sb.append(translated);
      }
    }
    return fromString(sb.toString());
  }

  /**
   * Wrapper over `long` to allow result of parsing long from string to be accessed via reference.
   * This is done solely for better performance and is not expected to be used by end users.
   */
  public static class LongWrapper implements Serializable {
    public transient long value = 0;
  }

  /**
   * Wrapper over `int` to allow result of parsing integer from string to be accessed via reference.
   * This is done solely for better performance and is not expected to be used by end users.
   *
   * {@link LongWrapper} could have been used here but using `int` directly save the extra cost of
   * conversion from `long` to `int`
   */
  public static class IntWrapper implements Serializable {
    public transient int value = 0;
  }

  /**
   * Parses this UTF8String(trimmed if needed) to long.
   *
   * Note that, in this method we accumulate the result in negative format, and convert it to
   * positive format at the end, if this string is not started with '-'. This is because min value
   * is bigger than max value in digits, e.g. Long.MAX_VALUE is '9223372036854775807' and
   * Long.MIN_VALUE is '-9223372036854775808'.
   *
   * This code is mostly copied from LazyLong.parseLong in Hive.
   *
   * @param toLongResult If a valid `long` was parsed from this UTF8String, then its value would
   *                     be set in `toLongResult`
   * @return true if the parsing was successful else false
   */
  public boolean toLong(LongWrapper toLongResult) {
    return toLong(toLongResult, true);
  }

  private boolean toLong(LongWrapper toLongResult, boolean allowDecimal) {
    int offset = 0;
    while (offset < this.numBytes && Character.isWhitespace(getByte(offset))) offset++;
    if (offset == this.numBytes) return false;

    int end = this.numBytes - 1;
    while (end > offset && Character.isWhitespace(getByte(end))) end--;

    byte b = getByte(offset);
    final boolean negative = b == '-';
    if (negative || b == '+') {
      if (end - offset == 0) {
        return false;
      }
      offset++;
    }

    final byte separator = '.';
    final int radix = 10;
    final long stopValue = Long.MIN_VALUE / radix;
    long result = 0;

    while (offset <= end) {
      b = getByte(offset);
      offset++;
      if (b == separator && allowDecimal) {
        // We allow decimals and will return a truncated integral in that case.
        // Therefore we won't throw an exception here (checking the fractional
        // part happens below.)
        break;
      }

      int digit;
      if (b >= '0' && b <= '9') {
        digit = b - '0';
      } else {
        return false;
      }

      // We are going to process the new digit and accumulate the result. However, before doing
      // this, if the result is already smaller than the stopValue(Long.MIN_VALUE / radix), then
      // result * 10 will definitely be smaller than minValue, and we can stop.
      if (result < stopValue) {
        return false;
      }

      result = result * radix - digit;
      // Since the previous result is less than or equal to stopValue(Long.MIN_VALUE / radix), we
      // can just use `result > 0` to check overflow. If result overflows, we should stop.
      if (result > 0) {
        return false;
      }
    }

    // This is the case when we've encountered a decimal separator. The fractional
    // part will not change the number, but we will verify that the fractional part
    // is well formed.
    while (offset <= end) {
      byte currentByte = getByte(offset);
      if (currentByte < '0' || currentByte > '9') {
        return false;
      }
      offset++;
    }

    if (!negative) {
      result = -result;
      if (result < 0) {
        return false;
      }
    }

    toLongResult.value = result;
    return true;
  }

  /**
   * Parses this UTF8String(trimmed if needed) to int.
   *
   * Note that, in this method we accumulate the result in negative format, and convert it to
   * positive format at the end, if this string is not started with '-'. This is because min value
   * is bigger than max value in digits, e.g. Integer.MAX_VALUE is '2147483647' and
   * Integer.MIN_VALUE is '-2147483648'.
   *
   * This code is mostly copied from LazyInt.parseInt in Hive.
   *
   * Note that, this method is almost same as `toLong`, but we leave it duplicated for performance
   * reasons, like Hive does.
   *
   * @param intWrapper If a valid `int` was parsed from this UTF8String, then its value would
   *                    be set in `intWrapper`
   * @return true if the parsing was successful else false
   */
  public boolean toInt(IntWrapper intWrapper) {
    return toInt(intWrapper, true);
  }

  private boolean toInt(IntWrapper intWrapper, boolean allowDecimal) {
    int offset = 0;
    while (offset < this.numBytes && Character.isWhitespace(getByte(offset))) offset++;
    if (offset == this.numBytes) return false;

    int end = this.numBytes - 1;
    while (end > offset && Character.isWhitespace(getByte(end))) end--;

    byte b = getByte(offset);
    final boolean negative = b == '-';
    if (negative || b == '+') {
      if (end - offset == 0) {
        return false;
      }
      offset++;
    }

    final byte separator = '.';
    final int radix = 10;
    final int stopValue = Integer.MIN_VALUE / radix;
    int result = 0;

    while (offset <= end) {
      b = getByte(offset);
      offset++;
      if (b == separator && allowDecimal) {
        // We allow decimals and will return a truncated integral in that case.
        // Therefore we won't throw an exception here (checking the fractional
        // part happens below.)
        break;
      }

      int digit;
      if (b >= '0' && b <= '9') {
        digit = b - '0';
      } else {
        return false;
      }

      // We are going to process the new digit and accumulate the result. However, before doing
      // this, if the result is already smaller than the stopValue(Integer.MIN_VALUE / radix), then
      // result * 10 will definitely be smaller than minValue, and we can stop
      if (result < stopValue) {
        return false;
      }

      result = result * radix - digit;
      // Since the previous result is less than or equal to stopValue(Integer.MIN_VALUE / radix),
      // we can just use `result > 0` to check overflow. If result overflows, we should stop
      if (result > 0) {
        return false;
      }
    }

    // This is the case when we've encountered a decimal separator. The fractional
    // part will not change the number, but we will verify that the fractional part
    // is well formed.
    while (offset <= end) {
      byte currentByte = getByte(offset);
      if (currentByte < '0' || currentByte > '9') {
        return false;
      }
      offset++;
    }

    if (!negative) {
      result = -result;
      if (result < 0) {
        return false;
      }
    }
    intWrapper.value = result;
    return true;
  }

  public boolean toShort(IntWrapper intWrapper) {
    if (toInt(intWrapper)) {
      int intValue = intWrapper.value;
      short result = (short) intValue;
      return result == intValue;
    }
    return false;
  }

  public boolean toByte(IntWrapper intWrapper) {
    if (toInt(intWrapper)) {
      int intValue = intWrapper.value;
      byte result = (byte) intValue;
      return result == intValue;
    }
    return false;
  }

  /**
   * Parses UTF8String(trimmed if needed) to long. This method is used when ANSI is enabled.
   *
   * @return If string contains valid numeric value then it returns the long value otherwise a
   * NumberFormatException  is thrown.
   */
  public long toLongExact() {
    LongWrapper result = new LongWrapper();
    if (toLong(result, false)) {
      return result.value;
    }
    throw new NumberFormatException("invalid input syntax for type numeric: '" + this + "'");
  }

  /**
   * Parses UTF8String(trimmed if needed) to int. This method is used when ANSI is enabled.
   *
   * @return If string contains valid numeric value then it returns the int value otherwise a
   * NumberFormatException  is thrown.
   */
  public int toIntExact() {
    IntWrapper result = new IntWrapper();
    if (toInt(result, false)) {
      return result.value;
    }
    throw new NumberFormatException("invalid input syntax for type numeric: '" + this + "'");
  }

  public short toShortExact() {
    int value = this.toIntExact();
    short result = (short) value;
    if (result == value) {
      return result;
    }
    throw new NumberFormatException("invalid input syntax for type numeric: '" + this + "'");
  }

  public byte toByteExact() {
    int value = this.toIntExact();
    byte result = (byte) value;
    if (result == value) {
      return result;
    }
    throw new NumberFormatException("invalid input syntax for type numeric: '" + this + "'");
  }

  @Override
  public String toString() {
    return new String(getBytes(), StandardCharsets.UTF_8);
  }

  @Override
  public UTF8String clone() {
    return fromBytes(getBytes());
  }

  public UTF8String copy() {
    byte[] bytes = new byte[numBytes];
    copyMemory(base, offset, bytes, BYTE_ARRAY_OFFSET, numBytes);
    return fromBytes(bytes);
  }

  @Override
  public int compareTo(@Nonnull final UTF8String other) {
    return ByteArray.compareBinary(
        base, offset, numBytes, other.base, other.offset, other.numBytes);
  }

  public int compare(final UTF8String other) {
    return compareTo(other);
  }

  @Override
  public boolean equals(final Object other) {
    if (other instanceof UTF8String) {
      UTF8String o = (UTF8String) other;
      if (numBytes != o.numBytes) {
        return false;
      }
      return ByteArrayMethods.arrayEquals(base, offset, o.base, o.offset, numBytes);
    } else {
      return false;
    }
  }

  /**
   * Levenshtein distance is a metric for measuring the distance of two strings. The distance is
   * defined by the minimum number of single-character edits (i.e. insertions, deletions or
   * substitutions) that are required to change one of the strings into the other.
   */
  public int levenshteinDistance(UTF8String other) {
    // Implementation adopted from org.apache.common.lang3.StringUtils.getLevenshteinDistance

    int n = numChars();
    int m = other.numChars();

    if (n == 0) {
      return m;
    } else if (m == 0) {
      return n;
    }

    UTF8String s, t;

    if (n <= m) {
      s = this;
      t = other;
    } else {
      s = other;
      t = this;
      int swap;
      swap = n;
      n = m;
      m = swap;
    }

    int[] p = new int[n + 1];
    int[] d = new int[n + 1];
    int[] swap;

    int i, i_bytes, j, j_bytes, num_bytes_j, cost;

    for (i = 0; i <= n; i++) {
      p[i] = i;
    }

    for (j = 0, j_bytes = 0; j < m; j_bytes += num_bytes_j, j++) {
      num_bytes_j = numBytesForFirstByte(t.getByte(j_bytes));
      d[0] = j + 1;

      for (i = 0, i_bytes = 0; i < n; i_bytes += numBytesForFirstByte(s.getByte(i_bytes)), i++) {
        if (s.getByte(i_bytes) != t.getByte(j_bytes) ||
              num_bytes_j != numBytesForFirstByte(s.getByte(i_bytes))) {
          cost = 1;
        } else {
          cost = (ByteArrayMethods.arrayEquals(t.base, t.offset + j_bytes, s.base,
              s.offset + i_bytes, num_bytes_j)) ? 0 : 1;
        }
        d[i + 1] = Math.min(Math.min(d[i] + 1, p[i + 1] + 1), p[i] + cost);
      }

      swap = p;
      p = d;
      d = swap;
    }

    return p[n];
  }

  @Override
  public int hashCode() {
    return Murmur3_x86_32.hashUnsafeBytes(base, offset, numBytes, 42);
  }

  /**
   * Soundex mapping table
   */
  private static final byte[] US_ENGLISH_MAPPING = {'0', '1', '2', '3', '0', '1', '2', '7',
    '0', '2', '2', '4', '5', '5', '0', '1', '2', '6', '2', '3', '0', '1', '7', '2', '0', '2'};

  /**
   * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names,
   * but can also be used as a general purpose scheme to find word with similar phonemes.
   * https://en.wikipedia.org/wiki/Soundex
   */
  public UTF8String soundex() {
    if (numBytes == 0) {
      return EMPTY_UTF8;
    }

    byte b = getByte(0);
    if ('a' <= b && b <= 'z') {
      b -= 32;
    } else if (b < 'A' || 'Z' < b) {
      // first character must be a letter
      return this;
    }
    byte[] sx = {'0', '0', '0', '0'};
    sx[0] = b;
    int sxi = 1;
    int idx = b - 'A';
    byte lastCode = US_ENGLISH_MAPPING[idx];

    for (int i = 1; i < numBytes; i++) {
      b = getByte(i);
      if ('a' <= b && b <= 'z') {
        b -= 32;
      } else if (b < 'A' || 'Z' < b) {
        // not a letter, skip it
        lastCode = '0';
        continue;
      }
      idx = b - 'A';
      byte code = US_ENGLISH_MAPPING[idx];
      if (code == '7') {
        // ignore it
      } else {
        if (code != '0' && code != lastCode) {
          sx[sxi++] = code;
          if (sxi > 3) break;
        }
        lastCode = code;
      }
    }
    return UTF8String.fromBytes(sx);
  }

  @Override
  public void writeExternal(ObjectOutput out) throws IOException {
    byte[] bytes = getBytes();
    out.writeInt(bytes.length);
    out.write(bytes);
  }

  @Override
  public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
    offset = BYTE_ARRAY_OFFSET;
    numBytes = in.readInt();
    base = new byte[numBytes];
    in.readFully((byte[]) base);
  }

  @Override
  public void write(Kryo kryo, Output out) {
    byte[] bytes = getBytes();
    out.writeInt(bytes.length);
    out.write(bytes);
  }

  @Override
  public void read(Kryo kryo, Input in) {
    this.offset = BYTE_ARRAY_OFFSET;
    this.numBytes = in.readInt();
    this.base = new byte[numBytes];
    in.read((byte[]) base);
  }

}

相关信息

spark 源码目录

相关文章

spark ByteArray 源码

spark CalendarInterval 源码

0  赞