SearchDictionaryEncoder.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.arrow.algorithm.dictionary;

import org.apache.arrow.algorithm.search.VectorSearcher;
import org.apache.arrow.algorithm.sort.VectorValueComparator;
import org.apache.arrow.vector.BaseIntVector;
import org.apache.arrow.vector.ValueVector;

/**
 * Dictionary encoder based on searching.
 *
 * @param <E> encoded vector type.
 * @param <D> decoded vector type, which is also the dictionary type.
 */
public class SearchDictionaryEncoder<E extends BaseIntVector, D extends ValueVector>
    implements DictionaryEncoder<E, D> {

  /** The dictionary for encoding/decoding. It must be sorted. */
  private final D dictionary;

  /** The criteria by which the dictionary is sorted. */
  private final VectorValueComparator<D> comparator;

  /** A flag indicating if null should be encoded. */
  private final boolean encodeNull;

  /**
   * Constructs a dictionary encoder.
   *
   * @param dictionary the dictionary. It must be in sorted order.
   * @param comparator the criteria for sorting.
   */
  public SearchDictionaryEncoder(D dictionary, VectorValueComparator<D> comparator) {
    this(dictionary, comparator, false);
  }

  /**
   * Constructs a dictionary encoder.
   *
   * @param dictionary the dictionary. It must be in sorted order.
   * @param comparator the criteria for sorting.
   * @param encodeNull a flag indicating if null should be encoded. It determines the behaviors for
   *     processing null values in the input during encoding. When a null is encountered in the
   *     input, 1) If the flag is set to true, the encoder searches for the value in the dictionary,
   *     and outputs the index in the dictionary. 2) If the flag is set to false, the encoder simply
   *     produces a null in the output.
   */
  public SearchDictionaryEncoder(
      D dictionary, VectorValueComparator<D> comparator, boolean encodeNull) {
    this.dictionary = dictionary;
    this.comparator = comparator;
    this.encodeNull = encodeNull;
  }

  /**
   * Encodes an input vector by binary search. So the algorithm takes O(n * log(m)) time, where n is
   * the length of the input vector, and m is the length of the dictionary.
   *
   * @param input the input vector.
   * @param output the output vector. Note that it must be in a fresh state. At least, all its
   *     validity bits should be clear.
   */
  @Override
  public void encode(D input, E output) {
    for (int i = 0; i < input.getValueCount(); i++) {
      if (!encodeNull && input.isNull(i)) {
        // for this case, we should simply output a null in the output.
        // by assuming the output vector is fresh, we do nothing here.
        continue;
      }

      int index = VectorSearcher.binarySearch(dictionary, comparator, input, i);
      if (index == -1) {
        throw new IllegalArgumentException("The data element is not found in the dictionary: " + i);
      }
      output.setWithPossibleTruncate(i, index);
    }
    output.setValueCount(input.getValueCount());
  }
}