HllSketch.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.datasketches.hll;

import static org.apache.datasketches.common.Util.LS;
import static org.apache.datasketches.common.Util.checkBounds;
import static org.apache.datasketches.hll.HllUtil.EMPTY;
import static org.apache.datasketches.hll.HllUtil.KEY_BITS_26;
import static org.apache.datasketches.hll.HllUtil.LG_AUX_ARR_INTS;
import static org.apache.datasketches.hll.HllUtil.checkPreamble;
import static org.apache.datasketches.hll.PreambleUtil.HLL_BYTE_ARR_START;
import static org.apache.datasketches.hll.PreambleUtil.extractCompactFlag;
import static org.apache.datasketches.hll.PreambleUtil.extractLgK;
import static org.apache.datasketches.hll.PreambleUtil.extractTgtHllType;

import java.util.Objects;

import org.apache.datasketches.common.SketchesArgumentException;
import org.apache.datasketches.memory.Memory;
import org.apache.datasketches.memory.WritableMemory;

/**
 * The HllSketch is actually a collection of compact implementations of Phillipe Flajolet���s HyperLogLog (HLL)
 * sketch but with significantly improved error behavior and excellent speed performance.
 *
 * <p>If the use case for sketching is primarily counting uniques and merging, the HLL sketch is the 2nd highest
 * performing in terms of accuracy for storage space consumed in the DataSketches library
 * (the new CPC sketch developed by Kevin J. Lang now beats HLL in terms of accuracy / space).
 * For large counts, HLL sketches can be 2 to 8 times smaller for the same accuracy than the DataSketches Theta
 * Sketches when serialized, but the Theta sketches can do set intersections and differences while HLL and CPC cannot.
 * The CPC sketch and HLL share similar use cases, but the CPC sketch is about 30 to 40% smaller than the HLL sketch
 * when serialized and larger than the HLL when active in memory.  Choose your weapons!</p>
 *
 * <p>A new HLL sketch is created with a simple constructor:</p>
 * <pre>{@code
 * int lgK = 12; //This is log-base2 of k, so k = 4096. lgK can be from 4 to 21
 * HllSketch sketch = new HllSketch(lgK); //TgtHllType.HLL_4 is the default
 * //OR
 * HllSketch sketch = new HllSketch(lgK, TgtHllType.HLL_6);
 * //OR
 * HllSketch sketch = new HllSketch(lgK, TgtHllType.HLL_8);
 * }</pre>
 *
 * <p>All three different sketch types are targets in that the sketches start out in a warm-up mode that is small in
 * size and gradually grows as needed until the full HLL array is allocated. The HLL_4, HLL_6 and HLL_8 represent
 * different levels of compression of the final HLL array where the 4, 6 and 8 refer to the number of bits each
 * bucket of the HLL array is compressed down to.
 * The HLL_4 is the most compressed but generally slower than the other two, especially during union operations.</p>
 *
 * <p>All three types share the same API. Updating the HllSketch is very simple:</p>
 *
 * <pre>{@code
 * long n = 1000000;
 * for (int i = 0; i < n; i++) {
 *   sketch.update(i);
 * }
 * }</pre>
 *
 * <p>Each of the presented integers above are first hashed into 128-bit hash values that are used by the sketch
 * HLL algorithm, so the above loop is essentially equivalent to using a random number generator initialized with a
 * seed so that the sequence is deterministic and random.</p>
 *
 * <p>Obtaining the cardinality results from the sketch is also simple:</p>
 *
 * <pre>{@code
 * double estimate = sketch.getEstimate();
 * double estUB = sketch.getUpperBound(1.0); //the upper bound at 1 standard deviation.
 * double estLB = sketch.getLowerBound(1.0); //the lower bound at 1 standard deviation.
 * //OR
 * System.out.println(sketch.toString()); //will output a summary of the sketch.
 * }</pre>
 *
 * <p>Which produces a console output something like this:</p>
 *
 * <pre>{@code
 * ### HLL SKETCH SUMMARY:
 *   Log Config K   : 12
 *   Hll Target     : HLL_4
 *   Current Mode   : HLL
 *   LB             : 977348.7024560181
 *   Estimate       : 990116.6007366662
 *   UB             : 1003222.5095308956
 *   OutOfOrder Flag: false
 *   CurMin         : 5
 *   NumAtCurMin    : 1
 *   HipAccum       : 990116.6007366662
 * }</pre>
 *
 * @author Lee Rhodes
 * @author Kevin Lang
 */
public class HllSketch extends BaseHllSketch {

  /**
   * The default Log_base2 of K
   */
  public static final int DEFAULT_LG_K = 12;

  /**
   * The default HLL-TYPE is HLL_4
   */
  public static final TgtHllType DEFAULT_HLL_TYPE = TgtHllType.HLL_4;

  HllSketchImpl hllSketchImpl = null;

  /**
   * Constructs a new on-heap sketch with the default lgConfigK and tgtHllType.
   */
  public HllSketch() {
    this(DEFAULT_LG_K, DEFAULT_HLL_TYPE);
  }

  /**
   * Constructs a new on-heap sketch with the default tgtHllType.
   * @param lgConfigK The Log2 of K for the target HLL sketch. This value must be
   * between 4 and 21 inclusively.
   */
  public HllSketch(final int lgConfigK) {
    this(lgConfigK, DEFAULT_HLL_TYPE);
  }

  /**
   * Constructs a new on-heap sketch with the type of HLL sketch to configure.
   * @param lgConfigK The Log2 of K for the target HLL sketch. This value must be
   * between 4 and 21 inclusively.
   * @param tgtHllType the desired HLL type.
   */
  public HllSketch(final int lgConfigK, final TgtHllType tgtHllType) {
    hllSketchImpl = new CouponList(HllUtil.checkLgK(lgConfigK), tgtHllType, CurMode.LIST);
  }

  /**
   * Constructs a new sketch with the type of HLL sketch to configure and the given
   * WritableMemory as the destination for the sketch. This WritableMemory is usually configured
   * for off-heap memory. What remains on the java heap is a thin wrapper object that reads and
   * writes to the given WritableMemory.
   *
   * <p>The given <i>dstMem</i> is checked for the required capacity as determined by
   * {@link #getMaxUpdatableSerializationBytes(int, TgtHllType)}.
   * @param lgConfigK The Log2 of K for the target HLL sketch. This value must be
   * between 4 and 21 inclusively.
   * @param tgtHllType the desired HLL type.
   * @param dstMem the destination memory for the sketch.
   */
  public HllSketch(final int lgConfigK, final TgtHllType tgtHllType, final WritableMemory dstMem) {
    Objects.requireNonNull(dstMem, "Destination Memory must not be null");
    final long minBytes = getMaxUpdatableSerializationBytes(lgConfigK, tgtHllType);
    final long capBytes = dstMem.getCapacity();
    HllUtil.checkMemSize(minBytes, capBytes);
    dstMem.clear(0, minBytes);
    hllSketchImpl = DirectCouponList.newInstance(lgConfigK, tgtHllType, dstMem);
  }

  /**
   * Copy constructor used by copy().
   * @param that another HllSketch
   */
  HllSketch(final HllSketch that) {
    hllSketchImpl = that.hllSketchImpl.copy();
  }

  /**
   * Special constructor used by copyAs, heapify
   * @param that another HllSketchImpl, which must already be a copy
   */
  HllSketch(final HllSketchImpl that) {
    hllSketchImpl = that;
  }

  /**
   * Heapify the given byte array, which must be a valid HllSketch image and may have data.
   * @param byteArray the given byte array.  This byteArray is not modified and is not retained
   * by the on-heap sketch.
   * @return an HllSketch on the java heap.
   */
  public static final HllSketch heapify(final byte[] byteArray) {
    return heapify(Memory.wrap(byteArray));
  }

  /**
   * Heapify the given Memory, which must be a valid HllSketch image and may have data.
   * @param srcMem the given Memory, which is read-only.
   * @return an HllSketch on the java heap.
   */
  public static final HllSketch heapify(final Memory srcMem) {
    return heapify(srcMem, true);
  }

  //used by union and above
  static final HllSketch heapify(final Memory srcMem, final boolean checkRebuild) {
    Objects.requireNonNull(srcMem, "Source Memory must not be null");
    checkBounds(0, 8, srcMem.getCapacity()); //need min 8 bytes
    final CurMode curMode = checkPreamble(srcMem);
    final HllSketch heapSketch;
    if (curMode == CurMode.HLL) {
      final TgtHllType tgtHllType = extractTgtHllType(srcMem);
      if (tgtHllType == TgtHllType.HLL_4) {
        heapSketch = new HllSketch(Hll4Array.heapify(srcMem));
      } else if (tgtHllType == TgtHllType.HLL_6) {
        heapSketch = new HllSketch(Hll6Array.heapify(srcMem));
      } else { //Hll_8
        heapSketch = new HllSketch(Hll8Array.heapify(srcMem));
        if (checkRebuild) {
          Union.checkRebuildCurMinNumKxQ(heapSketch);
        }
      }
    } else if (curMode == CurMode.LIST) {
      heapSketch = new HllSketch(CouponList.heapifyList(srcMem));
    } else {
      heapSketch = new HllSketch(CouponHashSet.heapifySet(srcMem));
    }
    return heapSketch;
  }

  /**
   * Wraps the given WritableMemory, which must be a image of a valid updatable sketch,
   * and may have data. What remains on the java heap is a
   * thin wrapper object that reads and writes to the given WritableMemory, which, depending on
   * how the user configures the WritableMemory, may actually reside on the Java heap or off-heap.
   *
   * <p>The given <i>dstMem</i> is checked for the required capacity as determined by
   * {@link #getMaxUpdatableSerializationBytes(int, TgtHllType)}.
   * @param srcWmem an writable image of a valid source sketch with data.
   * @return an HllSketch where the sketch data is in the given dstMem.
   */
  public static final HllSketch writableWrap(final WritableMemory srcWmem) {
    return writableWrap(srcWmem, true);
  }

  //used by union and above
  static final HllSketch writableWrap( final WritableMemory srcWmem, final boolean checkRebuild) {
    Objects.requireNonNull(srcWmem, "Source Memory must not be null");
    checkBounds(0, 8, srcWmem.getCapacity()); //need min 8 bytes
    if (extractCompactFlag(srcWmem)) {
      throw new SketchesArgumentException(
          "Cannot perform a writableWrap of a writable sketch image that is in compact form. "
          + "Compact sketches are by definition immutable.");
    }
    final int lgConfigK = extractLgK(srcWmem);
    final TgtHllType tgtHllType = extractTgtHllType(srcWmem);
    final long minBytes = getMaxUpdatableSerializationBytes(lgConfigK, tgtHllType);
    final long capBytes = srcWmem.getCapacity();
    HllUtil.checkMemSize(minBytes, capBytes);
    final CurMode curMode = checkPreamble(srcWmem);
    final HllSketch directSketch;
    if (curMode == CurMode.HLL) {
      if (tgtHllType == TgtHllType.HLL_4) {
        directSketch = new HllSketch(new DirectHll4Array(lgConfigK, srcWmem));
      } else if (tgtHllType == TgtHllType.HLL_6) {
        directSketch = new HllSketch(new DirectHll6Array(lgConfigK, srcWmem));
      } else { //Hll_8
        directSketch = new HllSketch(new DirectHll8Array(lgConfigK, srcWmem));
        if (checkRebuild) { //union only uses HLL_8, we allow non-finalized from a union call.
          Union.checkRebuildCurMinNumKxQ(directSketch);
        }
      }
    } else if (curMode == CurMode.LIST) {
      directSketch =
          new HllSketch(new DirectCouponList(lgConfigK, tgtHllType, curMode, srcWmem));
    } else { //SET
      directSketch =
          new HllSketch(new DirectCouponHashSet(lgConfigK, tgtHllType, srcWmem));
    }
    return directSketch;
  }

  /**
   * Wraps the given read-only Memory that must be a image of a valid sketch,
   * which may be in compact or updatable form, and should have data. Any attempt to update the
   * given source Memory will throw an exception.
   * @param srcMem a read-only image of a valid source sketch.
   * @return an HllSketch, where the read-only data of the sketch is in the given srcMem.
   *
   */
  public static final HllSketch wrap(final Memory srcMem) {
    Objects.requireNonNull(srcMem, "Source Memory must not be null");
    checkBounds(0, 8, srcMem.getCapacity()); //need min 8 bytes
    final int lgConfigK = extractLgK(srcMem);
    final TgtHllType tgtHllType = extractTgtHllType(srcMem);

    final CurMode curMode = checkPreamble(srcMem);
    final HllSketch directSketch;
    if (curMode == CurMode.HLL) {
      if (tgtHllType == TgtHllType.HLL_4) {
        directSketch = new HllSketch(new DirectHll4Array(lgConfigK, srcMem));
      } else if (tgtHllType == TgtHllType.HLL_6) {
        directSketch = new HllSketch(new DirectHll6Array(lgConfigK, srcMem));
      } else { //Hll_8
        directSketch = new HllSketch(new DirectHll8Array(lgConfigK, srcMem));
        //rebuild if srcMem came from a union and was not finalized, rather than throw exception.
        Union.checkRebuildCurMinNumKxQ(directSketch);
      }
    } else if (curMode == CurMode.LIST) {
      directSketch =
          new HllSketch(new DirectCouponList(lgConfigK, tgtHllType, curMode, srcMem));
    } else { //SET
      directSketch =
          new HllSketch(new DirectCouponHashSet(lgConfigK, tgtHllType, srcMem));
    }
    return directSketch;
  }

  /**
   * Return a copy of this sketch onto the Java heap.
   * @return a copy of this sketch onto the Java heap.
   */
  public HllSketch copy() {
    return new HllSketch(this);
  }

  /**
   * Return a deep copy of this sketch onto the Java heap with the specified TgtHllType.
   * @param tgtHllType the TgtHllType enum
   * @return a deep copy of this sketch with the specified TgtHllType.
   */
  public HllSketch copyAs(final TgtHllType tgtHllType) {
    return new HllSketch(hllSketchImpl.copyAs(tgtHllType));
  }

  @Override
  public double getCompositeEstimate() {
    return hllSketchImpl.getCompositeEstimate();
  }

  @Override
  public double getEstimate() {
    return hllSketchImpl.getEstimate();
  }

  double getHipEstimate() {
    return hllSketchImpl.getHipEstimate();
  }

  @Override
  public int getLgConfigK() {
    return hllSketchImpl.getLgConfigK();
  }

  @Override
  public int getCompactSerializationBytes() {
    return hllSketchImpl.getCompactSerializationBytes();
  }

  @Override
  public double getLowerBound(final int numStdDev) {
    return hllSketchImpl.getLowerBound(numStdDev);
  }

  /**
   * Returns the maximum size in bytes that this sketch can grow to given lgConfigK.
   * However, for the HLL_4 sketch type, this value can be exceeded in extremely rare cases.
   * If exceeded, it will be larger by only a few percent.
   *
   * @param lgConfigK The Log2 of K for the target HLL sketch. This value must be
   * between 4 and 21 inclusively.
   * @param tgtHllType the desired Hll type
   * @return the maximum size in bytes that this sketch can grow to.
   */
  public static final int getMaxUpdatableSerializationBytes(final int lgConfigK,
      final TgtHllType tgtHllType) {
    final int arrBytes;
    if (tgtHllType == TgtHllType.HLL_4) {
      final int auxBytes = 4 << LG_AUX_ARR_INTS[lgConfigK];
      arrBytes =  AbstractHllArray.hll4ArrBytes(lgConfigK) + auxBytes;
    }
    else if (tgtHllType == TgtHllType.HLL_6) {
      arrBytes = AbstractHllArray.hll6ArrBytes(lgConfigK);
    }
    else { //HLL_8
      arrBytes = AbstractHllArray.hll8ArrBytes(lgConfigK);
    }
    return HLL_BYTE_ARR_START + arrBytes;
  }

  Memory getMemory() {
    return hllSketchImpl.getMemory();
  }

  @Override
  public TgtHllType getTgtHllType() {
    return hllSketchImpl.getTgtHllType();
  }

  @Override
  public int getUpdatableSerializationBytes() {
    return hllSketchImpl.getUpdatableSerializationBytes();
  }

  WritableMemory getWritableMemory() {
    return hllSketchImpl.getWritableMemory();
  }

  @Override
  public double getUpperBound(final int numStdDev) {
    return hllSketchImpl.getUpperBound(numStdDev);
  }

  @Override
  public boolean isCompact() {
    return hllSketchImpl.isCompact();
  }

  @Override
  public boolean isEmpty() {
    return hllSketchImpl.isEmpty();
  }

  @Override
  public boolean isMemory() {
    return hllSketchImpl.isMemory();
  }

  @Override
  public boolean isOffHeap() {
    return hllSketchImpl.isOffHeap();
  }

  @Override
  boolean isOutOfOrder() {
    return hllSketchImpl.isOutOfOrder();
  }

  @Override
  public boolean isSameResource(final Memory mem) {
    return hllSketchImpl.isSameResource(mem);
  }

  void mergeTo(final HllSketch that) {
    hllSketchImpl.mergeTo(that);
  }

  HllSketch putOutOfOrderFlag(final boolean oooFlag) {
    hllSketchImpl.putOutOfOrder(oooFlag);
    return this;
  }

  @Override
  public void reset() {
    hllSketchImpl = hllSketchImpl.reset();
  }

  @Override
  public byte[] toCompactByteArray() {
    return hllSketchImpl.toCompactByteArray();
  }

  @Override
  public byte[] toUpdatableByteArray() {
    return hllSketchImpl.toUpdatableByteArray();
  }

  @Override
  public String toString(final boolean summary, final boolean detail, final boolean auxDetail,
      final boolean all) {
    final StringBuilder sb = new StringBuilder();
    if (summary) {
      sb.append("### HLL SKETCH SUMMARY: ").append(LS);
      sb.append("  Log Config K   : ").append(getLgConfigK()).append(LS);
      sb.append("  Hll Target     : ").append(getTgtHllType()).append(LS);
      sb.append("  Current Mode   : ").append(getCurMode()).append(LS);
      sb.append("  Memory         : ").append(isMemory()).append(LS);
      sb.append("  LB             : ").append(getLowerBound(1)).append(LS);
      sb.append("  Estimate       : ").append(getEstimate()).append(LS);
      sb.append("  UB             : ").append(getUpperBound(1)).append(LS);
      sb.append("  OutOfOrder Flag: ").append(isOutOfOrder()).append(LS);
      if (getCurMode() == CurMode.HLL) {
        final AbstractHllArray absHll = (AbstractHllArray) hllSketchImpl;
        sb.append("  CurMin         : ").append(absHll.getCurMin()).append(LS);
        sb.append("  NumAtCurMin    : ").append(absHll.getNumAtCurMin()).append(LS);
        sb.append("  HipAccum       : ").append(absHll.getHipAccum()).append(LS);
        sb.append("  KxQ0           : ").append(absHll.getKxQ0()).append(LS);
        sb.append("  KxQ1           : ").append(absHll.getKxQ1()).append(LS);
        sb.append("  Rebuild KxQ Flg: ").append(absHll.isRebuildCurMinNumKxQFlag()).append(LS);
      } else {
        sb.append("  Coupon Count   : ")
          .append(((AbstractCoupons)hllSketchImpl).getCouponCount()).append(LS);
      }
    }
    if (detail) {
      sb.append("### HLL SKETCH DATA DETAIL: ").append(LS);
      final PairIterator pitr = iterator();
      sb.append(pitr.getHeader()).append(LS);
      if (all) {
        while (pitr.nextAll()) {
          sb.append(pitr.getString()).append(LS);
        }
      } else {
        while (pitr.nextValid()) {
          sb.append(pitr.getString()).append(LS);
        }
      }
    }
    if (auxDetail) {
      if ((getCurMode() == CurMode.HLL) && (getTgtHllType() == TgtHllType.HLL_4)) {
        final AbstractHllArray absHll = (AbstractHllArray) hllSketchImpl;
        final PairIterator auxItr = absHll.getAuxIterator();
        if (auxItr != null) {
          sb.append("### HLL SKETCH AUX DETAIL: ").append(LS);
          sb.append(auxItr.getHeader()).append(LS);
          if (all) {
            while (auxItr.nextAll()) {
              sb.append(auxItr.getString()).append(LS);
            }
          } else {
            while (auxItr.nextValid()) {
              sb.append(auxItr.getString()).append(LS);
            }
          }
        }
      }
    }
    return sb.toString();
  }

  /**
   * Returns a human readable string of the preamble of a byte array image of an HllSketch.
   * @param byteArr the given byte array
   * @return a human readable string of the preamble of a byte array image of an HllSketch.
   */
  public static String toString(final byte[] byteArr) {
    return PreambleUtil.toString(byteArr);
  }

  /**
   * Returns a human readable string of the preamble of a Memory image of an HllSketch.
   * @param mem the given Memory object
   * @return a human readable string of the preamble of a Memory image of an HllSketch.
   */
  public static String toString(final Memory mem) {
    return PreambleUtil.toString(mem);
  }

  //restricted methods

  /**
   * Returns a PairIterator over the key, value pairs of the HLL array.
   * @return a PairIterator over the key, value pairs of the HLL array.
   */
  PairIterator iterator() {
    return hllSketchImpl.iterator();
  }

  @Override
  CurMode getCurMode() {
    return hllSketchImpl.getCurMode();
  }

  @Override
  void couponUpdate(final int coupon) {
    if ((coupon >>> KEY_BITS_26 ) == EMPTY) { return; }
    hllSketchImpl = hllSketchImpl.couponUpdate(coupon);
  }

}