LinearDictionaryEncoder.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.arrow.algorithm.dictionary;

import org.apache.arrow.vector.BaseIntVector;
import org.apache.arrow.vector.ValueVector;
import org.apache.arrow.vector.compare.Range;
import org.apache.arrow.vector.compare.RangeEqualsVisitor;

/**
 * Dictionary encoder based on linear search.
 *
 * @param <E> encoded vector type.
 * @param <D> decoded vector type, which is also the dictionary type.
 */
public class LinearDictionaryEncoder<E extends BaseIntVector, D extends ValueVector>
    implements DictionaryEncoder<E, D> {

  /** The dictionary for encoding. */
  private final D dictionary;

  /** A flag indicating if null should be encoded. */
  private final boolean encodeNull;

  private RangeEqualsVisitor equalizer;

  private Range range;

  /**
   * Constructs a dictionary encoder, with the encode null flag set to false.
   *
   * @param dictionary the dictionary. Its entries should be sorted in the non-increasing order of
   *     their frequency. Otherwise, the encoder still produces correct results, but at the expense
   *     of performance overhead.
   */
  public LinearDictionaryEncoder(D dictionary) {
    this(dictionary, false);
  }

  /**
   * Constructs a dictionary encoder.
   *
   * @param dictionary the dictionary. Its entries should be sorted in the non-increasing order of
   *     their frequency. Otherwise, the encoder still produces correct results, but at the expense
   *     of performance overhead.
   * @param encodeNull a flag indicating if null should be encoded. It determines the behaviors for
   *     processing null values in the input during encoding. When a null is encountered in the
   *     input, 1) If the flag is set to true, the encoder searches for the value in the dictionary,
   *     and outputs the index in the dictionary. 2) If the flag is set to false, the encoder simply
   *     produces a null in the output.
   */
  public LinearDictionaryEncoder(D dictionary, boolean encodeNull) {
    this.dictionary = dictionary;
    this.encodeNull = encodeNull;

    // temporarily set left and right vectors to dictionary
    equalizer = new RangeEqualsVisitor(dictionary, dictionary, null);
    range = new Range(0, 0, 1);
  }

  /**
   * Encodes an input vector by linear search. When the dictionary is sorted in the non-increasing
   * order of the entry frequency, it will have constant time complexity, with no extra memory
   * requirement.
   *
   * @param input the input vector.
   * @param output the output vector. Note that it must be in a fresh state. At least, all its
   *     validity bits should be clear.
   */
  @Override
  public void encode(D input, E output) {
    for (int i = 0; i < input.getValueCount(); i++) {
      if (!encodeNull && input.isNull(i)) {
        // for this case, we should simply output a null in the output.
        // by assuming the output vector is fresh, we do nothing here.
        continue;
      }

      int index = linearSearch(input, i);
      if (index == -1) {
        throw new IllegalArgumentException("The data element is not found in the dictionary: " + i);
      }
      output.setWithPossibleTruncate(i, index);
    }
    output.setValueCount(input.getValueCount());
  }

  private int linearSearch(D input, int index) {
    range.setLeftStart(index);
    for (int i = 0; i < dictionary.getValueCount(); i++) {
      range.setRightStart(i);
      if (input.accept(equalizer, range)) {
        return i;
      }
    }
    return -1;
  }
}