spark RocksDB 源码

  • 2022-10-20
  • 浏览 (455)

spark RocksDB 代码


/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.util.kvstore;

import java.io.File;
import java.io.IOException;
import java.lang.ref.Reference;
import java.lang.ref.WeakReference;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;

import static java.nio.charset.StandardCharsets.UTF_8;

import org.rocksdb.*;

import org.apache.spark.annotation.Private;

/**
 * Implementation of KVStore that uses RocksDB as the underlying data store.
 */
public class RocksDB implements KVStore {

  // Load the RocksDB JNI native library before any org.rocksdb object is created.
  // NOTE(review): the body of this initializer was missing from the listing;
  // RocksDB.loadLibrary() is the RocksJava call for this purpose -- confirm against upstream.
  static {
    org.rocksdb.RocksDB.loadLibrary();
  }

  static final long STORE_VERSION = 1L;

  static final byte[] STORE_VERSION_KEY = "__version__".getBytes(UTF_8);

  /** DB key where app metadata is stored. */
  private static final byte[] METADATA_KEY = "__meta__".getBytes(UTF_8);

  /** DB key where type aliases are stored. */
  private static final byte[] TYPE_ALIASES_KEY = "__types__".getBytes(UTF_8);

   * Use full filter.
  private static final BloomFilter fullFilter =
    new BloomFilter(10.0D /* BloomFilter.DEFAULT_BITS_PER_KEY */, false);

  /** Disable compression in index data. */
  private static final BlockBasedTableConfig tableFormatConfig = new BlockBasedTableConfig()

   * - Use ZSTD at the bottom most level to reduce the disk space
   * - Use LZ4 at the other levels because it's better than Snappy in general.
  private static final Options dbOptions = new Options()

   * - Use explicitly 'sync = false' like LevelDB KVStore implementation.
  private static final WriteOptions writeOptions = new WriteOptions().setSync(false);

  private final AtomicReference<org.rocksdb.RocksDB> _db;

  final KVStoreSerializer serializer;

   * Keep a mapping of class names to a shorter, unique ID managed by the store. This serves two
   * purposes: make the keys stored on disk shorter, and spread out the keys, since class names
   * will often have a long, redundant prefix (think "org.apache.spark.").
  private final ConcurrentMap<String, byte[]> typeAliases;
  private final ConcurrentMap<Class<?>, RocksDBTypeInfo> types;

   * Trying to close a JNI RocksDB handle with a closed DB causes JVM crashes. This is used to
   * ensure that all iterators are correctly closed before RocksDB is closed. Use weak references
   * to ensure that the iterator can be GCed, when it is only referenced here.
  private final ConcurrentLinkedQueue<Reference<RocksDBIterator<?>>> iteratorTracker;

  /**
   * Opens the store at {@code path} with a newly created {@link KVStoreSerializer}.
   *
   * @param path location of the RocksDB database.
   */
  public RocksDB(File path) throws Exception {
    this(path, new KVStoreSerializer());
  }

  /**
   * Opens the RocksDB database at {@code path}, validates the stored version marker and loads
   * the persisted type aliases.
   *
   * @param path location of the RocksDB database.
   * @param serializer serializer used for every value written to or read from the store.
   * @throws UnsupportedStoreVersionException if the database carries a version other than
   *         {@link #STORE_VERSION}.
   */
  public RocksDB(File path, KVStoreSerializer serializer) throws Exception {
    this.serializer = serializer;
    this.types = new ConcurrentHashMap<>();
    this._db = new AtomicReference<>(org.rocksdb.RocksDB.open(dbOptions, path.toString()));

    byte[] versionData = db().get(STORE_VERSION_KEY);
    if (versionData != null) {
      long version = serializer.deserializeLong(versionData);
      if (version != STORE_VERSION) {
        // Release the native handle before failing; close() tolerates the iterator tracker
        // not being initialized yet.
        close();
        throw new UnsupportedStoreVersionException();
      }
    } else {
      // No version marker yet: stamp the database with the current version.
      db().put(STORE_VERSION_KEY, serializer.serialize(STORE_VERSION));
    }

    Map<String, byte[]> aliases;
    try {
      aliases = get(TYPE_ALIASES_KEY, TypeAliases.class).aliases;
    } catch (NoSuchElementException e) {
      // Nothing persisted so far; start with an empty alias table.
      aliases = new HashMap<>();
    }
    typeAliases = new ConcurrentHashMap<>(aliases);

    iteratorTracker = new ConcurrentLinkedQueue<>();
  }

  public <T> T getMetadata(Class<T> klass) throws Exception {
    try {
      return get(METADATA_KEY, klass);
    } catch (NoSuchElementException nsee) {
      return null;

  public void setMetadata(Object value) throws Exception {
    if (value != null) {
      put(METADATA_KEY, value);
    } else {

  <T> T get(byte[] key, Class<T> klass) throws Exception {
    byte[] data = db().get(key);
    if (data == null) {
      throw new NoSuchElementException(new String(key, UTF_8));
    return serializer.deserialize(data, klass);

  private void put(byte[] key, Object value) throws Exception {
    Preconditions.checkArgument(value != null, "Null values are not allowed.");
    db().put(key, serializer.serialize(value));

  public <T> T read(Class<T> klass, Object naturalKey) throws Exception {
    Preconditions.checkArgument(naturalKey != null, "Null keys are not allowed.");
    byte[] key = getTypeInfo(klass).naturalIndex().start(null, naturalKey);
    return get(key, klass);

  public void write(Object value) throws Exception {
    Preconditions.checkArgument(value != null, "Null values are not allowed.");
    RocksDBTypeInfo ti = getTypeInfo(value.getClass());
    byte[] data = serializer.serialize(value);
    synchronized (ti) {
      try (WriteBatch writeBatch = new WriteBatch()) {
        updateBatch(writeBatch, value, data, value.getClass(), ti.naturalIndex(), ti.indices());
        db().write(writeOptions, writeBatch);

  public void writeAll(List<?> values) throws Exception {
    Preconditions.checkArgument(values != null && !values.isEmpty(),
      "Non-empty values required.");

    // Group by class, in case there are values from different classes in the values
    // Typical usecase is for this to be a single class.
    // A NullPointerException will be thrown if values contain null object.
    for (Map.Entry<? extends Class<?>, ? extends List<?>> entry : {

      final Iterator<?> valueIter = entry.getValue().iterator();
      final Iterator<byte[]> serializedValueIter;

      // Deserialize outside synchronized block
      List<byte[]> list = new ArrayList<>(entry.getValue().size());
      for (Object value : values) {
      serializedValueIter = list.iterator();

      final Class<?> klass = entry.getKey();
      final RocksDBTypeInfo ti = getTypeInfo(klass);

      synchronized (ti) {
        final RocksDBTypeInfo.Index naturalIndex = ti.naturalIndex();
        final Collection<RocksDBTypeInfo.Index> indices = ti.indices();

        try (WriteBatch writeBatch = new WriteBatch()) {
          while (valueIter.hasNext()) {
            updateBatch(writeBatch,,, klass,
                naturalIndex, indices);
          db().write(writeOptions, writeBatch);

  private void updateBatch(
      WriteBatch batch,
      Object value,
      byte[] data,
      Class<?> klass,
      RocksDBTypeInfo.Index naturalIndex,
      Collection<RocksDBTypeInfo.Index> indices) throws Exception {
    Object existing;
    try {
      existing = get(naturalIndex.entityKey(null, value), klass);
    } catch (NoSuchElementException e) {
      existing = null;

    PrefixCache cache = new PrefixCache(value);
    byte[] naturalKey = naturalIndex.toKey(naturalIndex.getValue(value));
    for (RocksDBTypeInfo.Index idx : indices) {
      byte[] prefix = cache.getPrefix(idx);
      idx.add(batch, value, existing, data, naturalKey, prefix);

  public void delete(Class<?> type, Object naturalKey) throws Exception {
    Preconditions.checkArgument(naturalKey != null, "Null keys are not allowed.");
    try (WriteBatch writeBatch = new WriteBatch()) {
      RocksDBTypeInfo ti = getTypeInfo(type);
      byte[] key = ti.naturalIndex().start(null, naturalKey);
      synchronized (ti) {
        byte[] data = db().get(key);
        if (data != null) {
          Object existing = serializer.deserialize(data, type);
          PrefixCache cache = new PrefixCache(existing);
          byte[] keyBytes = ti.naturalIndex().toKey(ti.naturalIndex().getValue(existing));
          for (RocksDBTypeInfo.Index idx : ti.indices()) {
            idx.remove(writeBatch, existing, keyBytes, cache.getPrefix(idx));
          db().write(writeOptions, writeBatch);
    } catch (NoSuchElementException nse) {
      // Ignore.

  public <T> KVStoreView<T> view(Class<T> type) throws Exception {
    return new KVStoreView<T>() {
      public Iterator<T> iterator() {
        try {
          RocksDBIterator<T> it = new RocksDBIterator<>(type, RocksDB.this, this);
          iteratorTracker.add(new WeakReference<>(it));
          return it;
        } catch (Exception e) {
          throw Throwables.propagate(e);

  public <T> boolean removeAllByIndexValues(
      Class<T> klass,
      String index,
      Collection<?> indexValues) throws Exception {
    RocksDBTypeInfo.Index naturalIndex = getTypeInfo(klass).naturalIndex();
    boolean removed = false;
    KVStoreView<T> view = view(klass).index(index);

    for (Object indexValue : indexValues) {
      try (KVStoreIterator<T> iterator =
        view.first(indexValue).last(indexValue).closeableIterator()) {
        while (iterator.hasNext()) {
          T value =;
          Object itemKey = naturalIndex.getValue(value);
          delete(klass, itemKey);
          removed = true;

    return removed;

  public long count(Class<?> type) throws Exception {
    RocksDBTypeInfo.Index idx = getTypeInfo(type).naturalIndex();
    return idx.getCount(idx.end(null));

  public long count(Class<?> type, String index, Object indexedValue) throws Exception {
    RocksDBTypeInfo.Index idx = getTypeInfo(type).index(index);
    return idx.getCount(idx.end(null, indexedValue));

  public void close() throws IOException {
    synchronized (this._db) {
      org.rocksdb.RocksDB _db = this._db.getAndSet(null);
      if (_db == null) {

      try {
        if (iteratorTracker != null) {
          for (Reference<RocksDBIterator<?>> ref: iteratorTracker) {
            RocksDBIterator<?> it = ref.get();
            if (it != null) {
      } catch (IOException ioe) {
        throw ioe;
      } catch (Exception e) {
        throw new IOException(e.getMessage(), e);

  /**
   * Closes the given iterator if the DB is still open. Trying to close a JNI RocksDB handle
   * with a closed DB can cause JVM crashes, so this ensures that situation does not happen.
   */
  void closeIterator(RocksDBIterator<?> it) throws IOException {
    synchronized (this._db) {
      org.rocksdb.RocksDB _db = this._db.get();
      if (_db != null) {

  /**
   * Removes the iterator from the iterator tracker. `RocksDBIterator` calls this to notify
   * the store that the iterator has been closed.
   */
  void notifyIteratorClosed(RocksDBIterator<?> it) {
    iteratorTracker.removeIf(ref -> it.equals(ref.get()));

  /** Returns metadata about indices for the given type. */
  RocksDBTypeInfo getTypeInfo(Class<?> type) throws Exception {
    RocksDBTypeInfo ti = types.get(type);
    if (ti == null) {
      RocksDBTypeInfo tmp = new RocksDBTypeInfo(this, type, getTypeAlias(type));
      ti = types.putIfAbsent(type, tmp);
      if (ti == null) {
        ti = tmp;
    return ti;

  /**
   * Try to avoid use-after-close since that has the tendency of crashing the JVM. This doesn't
   * prevent methods that retrieved the instance from using it after close, but hopefully will
   * catch most cases; otherwise, we'll need some kind of locking.
   */
  org.rocksdb.RocksDB db() {
    org.rocksdb.RocksDB _db = this._db.get();
    if (_db == null) {
      throw new IllegalStateException("DB is closed.");
    return _db;

  private byte[] getTypeAlias(Class<?> klass) throws Exception {
    byte[] alias = typeAliases.get(klass.getName());
    if (alias == null) {
      synchronized (typeAliases) {
        byte[] tmp = String.valueOf(typeAliases.size()).getBytes(UTF_8);
        alias = typeAliases.putIfAbsent(klass.getName(), tmp);
        if (alias == null) {
          alias = tmp;
          put(TYPE_ALIASES_KEY, new TypeAliases(typeAliases));
    return alias;

  /** Needs to be public for Jackson. */
  public static class TypeAliases {

    public Map<String, byte[]> aliases;

    TypeAliases(Map<String, byte[]> aliases) {
      this.aliases = aliases;

    TypeAliases() {


  private static class PrefixCache {

    private final Object entity;
    private final Map<RocksDBTypeInfo.Index, byte[]> prefixes;

    PrefixCache(Object entity) {
      this.entity = entity;
      this.prefixes = new HashMap<>();

    byte[] getPrefix(RocksDBTypeInfo.Index idx) throws Exception {
      byte[] prefix = null;
      if (idx.isChild()) {
        prefix = prefixes.get(idx.parent());
        if (prefix == null) {
          prefix = idx.parent().childPrefix(idx.parent().getValue(entity));
          prefixes.put(idx.parent(), prefix);
      return prefix;




spark 源码目录


spark ArrayWrappers 源码

spark InMemoryStore 源码

spark KVIndex 源码

spark KVStore 源码

spark KVStoreIterator 源码

spark KVStoreSerializer 源码

spark KVStoreView 源码

spark KVTypeInfo 源码

spark LevelDB 源码

spark LevelDBIterator 源码

0  赞