HashTableBasedDictionaryBuilder.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.arrow.algorithm.dictionary;

import java.util.HashMap;
import org.apache.arrow.memory.util.ArrowBufPointer;
import org.apache.arrow.memory.util.hash.ArrowBufHasher;
import org.apache.arrow.memory.util.hash.SimpleHasher;
import org.apache.arrow.vector.ElementAddressableVector;

/**
 * This class builds the dictionary based on a hash table. Each add operation can be finished in
 * O(1) time, where n is the current dictionary size.
 *
 * @param <V> the dictionary vector type.
 */
public class HashTableBasedDictionaryBuilder<V extends ElementAddressableVector>
    implements DictionaryBuilder<V> {

  /** The dictionary to be built. */
  private final V dictionary;

  /** If null should be encoded. */
  private final boolean encodeNull;

  /**
   * The hash map for distinct dictionary entries. The key is the pointer to the dictionary element,
   * whereas the value is the index in the dictionary.
   */
  private HashMap<ArrowBufPointer, Integer> hashMap = new HashMap<>();

  /** The hasher used for calculating the hash code. */
  private final ArrowBufHasher hasher;

  /** Next pointer to try to add to the hash table. */
  private ArrowBufPointer nextPointer;

  /**
   * Constructs a hash table based dictionary builder.
   *
   * @param dictionary the dictionary to populate.
   */
  public HashTableBasedDictionaryBuilder(V dictionary) {
    this(dictionary, false);
  }

  /**
   * Constructs a hash table based dictionary builder.
   *
   * @param dictionary the dictionary to populate.
   * @param encodeNull if null values should be added to the dictionary.
   */
  public HashTableBasedDictionaryBuilder(V dictionary, boolean encodeNull) {
    this(dictionary, encodeNull, SimpleHasher.INSTANCE);
  }

  /**
   * Constructs a hash table based dictionary builder.
   *
   * @param dictionary the dictionary to populate.
   * @param encodeNull if null values should be added to the dictionary.
   * @param hasher the hasher used to compute the hash code.
   */
  public HashTableBasedDictionaryBuilder(V dictionary, boolean encodeNull, ArrowBufHasher hasher) {
    this.dictionary = dictionary;
    this.encodeNull = encodeNull;
    this.hasher = hasher;
    this.nextPointer = new ArrowBufPointer(hasher);
  }

  /**
   * Gets the dictionary built.
   *
   * @return the dictionary.
   */
  @Override
  public V getDictionary() {
    return dictionary;
  }

  /**
   * Try to add all values from the target vector to the dictionary.
   *
   * @param targetVector the target vector containing values to probe.
   * @return the number of values actually added to the dictionary.
   */
  @Override
  public int addValues(V targetVector) {
    int oldDictSize = dictionary.getValueCount();
    for (int i = 0; i < targetVector.getValueCount(); i++) {
      if (!encodeNull && targetVector.isNull(i)) {
        continue;
      }
      addValue(targetVector, i);
    }

    return dictionary.getValueCount() - oldDictSize;
  }

  /**
   * Try to add an element from the target vector to the dictionary.
   *
   * @param targetVector the target vector containing new element.
   * @param targetIndex the index of the new element in the target vector.
   * @return the index of the new element in the dictionary.
   */
  @Override
  public int addValue(V targetVector, int targetIndex) {
    targetVector.getDataPointer(targetIndex, nextPointer);

    Integer index = hashMap.get(nextPointer);
    if (index == null) {
      // a new dictionary element is found

      // insert it to the dictionary
      int dictSize = dictionary.getValueCount();
      dictionary.copyFromSafe(targetIndex, dictSize, targetVector);
      dictionary.setValueCount(dictSize + 1);
      dictionary.getDataPointer(dictSize, nextPointer);

      // insert it to the hash map
      hashMap.put(nextPointer, dictSize);
      nextPointer = new ArrowBufPointer(hasher);

      return dictSize;
    }
    return index;
  }
}