VectorRunDeduplicator.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.arrow.algorithm.deduplicate;

import org.apache.arrow.memory.ArrowBuf;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.util.Preconditions;
import org.apache.arrow.vector.BitVectorHelper;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.ValueVector;
import org.apache.arrow.vector.util.DataSizeRoundingUtil;

/**
 * Remove adjacent equal elements from a vector. If the vector is sorted, it removes all duplicated
 * values in the vector.
 *
 * @param <V> vector type.
 */
public class VectorRunDeduplicator<V extends ValueVector> implements AutoCloseable {

  /**
   * Bit set for distinct values. If the value at some index is not equal to the previous value, its
   * bit is set to 1, otherwise its bit is set to 0.
   */
  private ArrowBuf distinctValueBuffer;

  /** The vector to deduplicate. */
  private final V vector;

  private final BufferAllocator allocator;

  /**
   * Constructs a vector run deduplicator for a given vector.
   *
   * @param vector the vector to deduplicate. Ownership is NOT taken.
   * @param allocator the allocator used for allocating buffers for start indices.
   */
  public VectorRunDeduplicator(V vector, BufferAllocator allocator) {
    this.vector = vector;
    this.allocator = allocator;
  }

  private void createDistinctValueBuffer() {
    Preconditions.checkArgument(distinctValueBuffer == null);
    int bufSize = DataSizeRoundingUtil.divideBy8Ceil(vector.getValueCount());
    distinctValueBuffer = allocator.buffer(bufSize);
    DeduplicationUtils.populateRunStartIndicators(vector, distinctValueBuffer);
  }

  /**
   * Gets the number of values which are different from their predecessor.
   *
   * @return the run count.
   */
  public int getRunCount() {
    if (distinctValueBuffer == null) {
      createDistinctValueBuffer();
    }
    return vector.getValueCount()
        - BitVectorHelper.getNullCount(distinctValueBuffer, vector.getValueCount());
  }

  /**
   * Gets the vector with deduplicated adjacent values removed.
   *
   * @param outVector the output vector.
   */
  public void populateDeduplicatedValues(V outVector) {
    if (distinctValueBuffer == null) {
      createDistinctValueBuffer();
    }

    DeduplicationUtils.populateDeduplicatedValues(distinctValueBuffer, vector, outVector);
  }

  /**
   * Gets the length of each distinct value.
   *
   * @param lengthVector the vector for holding length values.
   */
  public void populateRunLengths(IntVector lengthVector) {
    if (distinctValueBuffer == null) {
      createDistinctValueBuffer();
    }

    DeduplicationUtils.populateRunLengths(
        distinctValueBuffer, lengthVector, vector.getValueCount());
  }

  @Override
  public void close() {
    if (distinctValueBuffer != null) {
      distinctValueBuffer.close();
      distinctValueBuffer = null;
    }
  }
}