DistinctLimitOperator.java

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.operator;

import com.facebook.presto.common.Page;
import com.facebook.presto.common.type.Type;
import com.facebook.presto.memory.context.LocalMemoryContext;
import com.facebook.presto.spi.function.aggregation.GroupByIdBlock;
import com.facebook.presto.spi.plan.PlanNodeId;
import com.facebook.presto.sql.gen.JoinCompiler;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableList;
import com.google.common.primitives.Ints;

import java.util.Arrays;
import java.util.List;
import java.util.Optional;

import static com.facebook.presto.SystemSessionProperties.isDictionaryAggregationEnabled;
import static com.facebook.presto.operator.GroupByHash.createGroupByHash;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.base.Verify.verify;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static java.lang.Math.min;
import static java.lang.Math.toIntExact;
import static java.util.Objects.requireNonNull;

/**
 * Operator that emits the first occurrence of each distinct combination of values found in
 * the configured distinct channels, stopping once {@code limit} distinct rows have been produced.
 *
 * <p>Distinctness is tracked with a {@link GroupByHash}. Input is consumed one page at a time:
 * a page handed to {@link #addInput(Page)} is held until {@link #getOutput()} has drained it.
 *
 * <p>An optional timeout may be configured. Once the deadline computed at construction time has
 * passed, the operator marks itself as finishing and reports itself finished, without regard to
 * any input page that is still buffered.
 */
public class DistinctLimitOperator
        implements Operator
{
    /**
     * Factory producing {@link DistinctLimitOperator} instances that share one configuration.
     */
    public static class DistinctLimitOperatorFactory
            implements OperatorFactory
    {
        private final int operatorId;
        private final PlanNodeId planNodeId;
        private final List<Integer> distinctChannels;
        private final List<Type> sourceTypes;
        private final long limit;
        private final Optional<Integer> hashChannel;
        private boolean closed;
        private final JoinCompiler joinCompiler;
        private final int timeoutMillis;

        /**
         * Creates a factory whose operators have no timeout (a timeout of {@code 0} is passed on).
         */
        public DistinctLimitOperatorFactory(
                int operatorId,
                PlanNodeId planNodeId,
                List<? extends Type> sourceTypes,
                List<Integer> distinctChannels,
                long limit,
                Optional<Integer> hashChannel,
                JoinCompiler joinCompiler)
        {
            this(operatorId, planNodeId, sourceTypes, distinctChannels, limit, hashChannel, joinCompiler, 0);
        }

        /**
         * @param operatorId id passed to the driver context when registering each operator
         * @param planNodeId plan node the created operators are registered under
         * @param sourceTypes types of the input channels, indexed by channel number
         * @param distinctChannels input channels whose combined values must be distinct
         * @param limit maximum number of distinct rows to emit; must not be negative
         * @param hashChannel optional input channel that is forwarded to the group-by hash
         * @param joinCompiler compiler forwarded to the group-by hash
         * @param timeoutMillis time budget handed to each created operator, in milliseconds;
         *         values that are not positive disable the timeout
         * @throws NullPointerException if any reference argument is null
         * @throws IllegalArgumentException if {@code limit} is negative
         */
        public DistinctLimitOperatorFactory(
                int operatorId,
                PlanNodeId planNodeId,
                List<? extends Type> sourceTypes,
                List<Integer> distinctChannels,
                long limit,
                Optional<Integer> hashChannel,
                JoinCompiler joinCompiler,
                int timeoutMillis)
        {
            this.operatorId = operatorId;
            this.planNodeId = requireNonNull(planNodeId, "planNodeId is null");
            this.sourceTypes = ImmutableList.copyOf(requireNonNull(sourceTypes, "sourceTypes is null"));
            // Snapshot the channel list, like sourceTypes, so later changes by the caller cannot leak in
            this.distinctChannels = ImmutableList.copyOf(requireNonNull(distinctChannels, "distinctChannels is null"));

            checkArgument(limit >= 0, "limit must be at least zero");
            this.limit = limit;
            this.hashChannel = requireNonNull(hashChannel, "hashChannel is null");
            this.joinCompiler = requireNonNull(joinCompiler, "joinCompiler is null");
            this.timeoutMillis = timeoutMillis;
        }

        /**
         * Registers a new operator context with the driver and builds an operator on top of it.
         *
         * @throws IllegalStateException if {@link #noMoreOperators()} has already been called
         */
        @Override
        public Operator createOperator(DriverContext driverContext)
        {
            checkState(!closed, "Factory is already closed");
            OperatorContext operatorContext = driverContext.addOperatorContext(operatorId, planNodeId, DistinctLimitOperator.class.getSimpleName());
            // Resolve the type of every distinct channel from the full list of source types
            List<Type> distinctTypes = distinctChannels.stream()
                    .map(sourceTypes::get)
                    .collect(toImmutableList());
            return new DistinctLimitOperator(operatorContext, distinctChannels, distinctTypes, limit, hashChannel, joinCompiler, timeoutMillis);
        }

        /**
         * Closes the factory; subsequent calls to {@link #createOperator(DriverContext)} fail.
         */
        @Override
        public void noMoreOperators()
        {
            closed = true;
        }

        /**
         * Returns a new, open factory with the same configuration as this one.
         */
        @Override
        public OperatorFactory duplicate()
        {
            return new DistinctLimitOperatorFactory(operatorId, planNodeId, sourceTypes, distinctChannels, limit, hashChannel, joinCompiler, timeoutMillis);
        }
    }

    private final OperatorContext operatorContext;
    private final LocalMemoryContext localUserMemoryContext;

    // Page currently being processed; stays set until getOutput() has drained it
    private Page inputPage;
    // Number of distinct rows that may still be emitted
    private long remainingLimit;

    private boolean finishing;

    // Channels copied to the output: the distinct channels, followed by the hash channel if present
    private final int[] outputChannels;
    private final GroupByHash groupByHash;
    // Group id that the next not-yet-emitted distinct row is expected to carry
    private long nextDistinctId;

    // for yield when memory is not available
    private GroupByIdBlock groupByIds;
    private Work<GroupByIdBlock> unfinishedWork;
    // Absolute wall-clock deadline (System.currentTimeMillis() scale); 0 means no timeout
    private final long deadlineMillis;

    /**
     * @param operatorContext context supplying the session and the user memory context
     * @param distinctChannels input channels whose combined values must be distinct
     * @param distinctTypes types of {@code distinctChannels}, in the same order
     * @param limit maximum number of distinct rows to emit; must not be negative
     * @param hashChannel optional input channel forwarded to the group-by hash and appended to the output
     * @param joinCompiler compiler forwarded to the group-by hash
     * @param timeout time budget in milliseconds, counted from construction; values that are
     *         not positive disable the timeout
     * @throws NullPointerException if any reference argument is null
     * @throws IllegalArgumentException if {@code limit} is negative
     */
    public DistinctLimitOperator(OperatorContext operatorContext, List<Integer> distinctChannels, List<Type> distinctTypes, long limit, Optional<Integer> hashChannel, JoinCompiler joinCompiler, int timeout)
    {
        this.operatorContext = requireNonNull(operatorContext, "operatorContext is null");
        this.localUserMemoryContext = operatorContext.localUserMemoryContext();
        checkArgument(limit >= 0, "limit must be at least zero");
        requireNonNull(hashChannel, "hashChannel is null");

        int[] distinctChannelInts = Ints.toArray(requireNonNull(distinctChannels, "distinctChannels is null"));
        if (hashChannel.isPresent()) {
            outputChannels = Arrays.copyOf(distinctChannelInts, distinctChannelInts.length + 1);
            outputChannels[distinctChannelInts.length] = hashChannel.get();
        }
        else {
            outputChannels = distinctChannelInts.clone(); // defensive copy since this is passed into createGroupByHash
        }

        this.groupByHash = createGroupByHash(
                distinctTypes,
                distinctChannelInts,
                hashChannel,
                // Clamp while still a long: narrowing first would wrap limits above Integer.MAX_VALUE
                toIntExact(min(limit, 10_000)),
                isDictionaryAggregationEnabled(operatorContext.getSession()),
                joinCompiler,
                this::updateMemoryReservation);
        remainingLimit = limit;
        this.deadlineMillis = timeout > 0 ? System.currentTimeMillis() + timeout : 0;
    }

    /**
     * Marks the operator as finishing if a timeout is configured and its deadline has passed.
     *
     * @return true if the deadline has passed
     */
    private boolean finishIfTimedOut()
    {
        if (deadlineMillis > 0 && System.currentTimeMillis() >= deadlineMillis) {
            finish();
            return true;
        }

        return false;
    }

    @Override
    public OperatorContext getOperatorContext()
    {
        return operatorContext;
    }

    /**
     * Signals that no more input will be accepted.
     */
    @Override
    public void finish()
    {
        finishing = true;
    }

    /**
     * Returns true once the timeout has expired, or once there is no buffered input left and
     * either {@link #finish()} was called or the limit has been used up.
     */
    @Override
    public boolean isFinished()
    {
        return finishIfTimedOut() || (!hasUnfinishedInput() && (finishing || remainingLimit == 0));
    }

    /**
     * Returns true while the operator has not timed out, is not finishing, still has limit
     * left, and is not holding a previous page.
     */
    @Override
    public boolean needsInput()
    {
        return !finishIfTimedOut() && !finishing && remainingLimit > 0 && !hasUnfinishedInput();
    }

    /**
     * Accepts the next input page and starts computing group ids for it.
     * If the timeout has already expired the page is ignored.
     *
     * @throws IllegalStateException if the operator is finishing, has exhausted its limit,
     *         or is still holding a previous page
     */
    @Override
    public void addInput(Page page)
    {
        if (finishIfTimedOut()) {
            return;
        }

        // Intentionally not needsInput(): that re-reads the clock, so a deadline expiring between
        // the check above and this one would surface as a spurious IllegalStateException
        checkState(!finishing && remainingLimit > 0 && !hasUnfinishedInput());

        inputPage = page;
        unfinishedWork = groupByHash.getGroupIds(page);
        processUnfinishedWork();
        updateMemoryReservation();
    }

    /**
     * Emits the rows of the buffered page that introduce a new distinct group, capped by the
     * remaining limit, and then releases the page.
     *
     * @return the new distinct rows restricted to the output channels, or null if group ids are
     *         not available yet or the page contributed no new rows
     */
    @Override
    public Page getOutput()
    {
        if (unfinishedWork != null && !processUnfinishedWork()) {
            finishIfTimedOut();
            return null;
        }

        if (groupByIds == null) {
            return null;
        }

        verify(inputPage != null);

        // Groups with an id of nextDistinctId or above have not been emitted yet
        long resultingPositions = min(groupByIds.getGroupCount() - nextDistinctId, remainingLimit);
        Page result = null;
        if (resultingPositions > 0) {
            int[] distinctPositions = new int[toIntExact(resultingPositions)];
            int distinctCount = 0;
            // The scan expects each new group id to first appear in increasing order within the page
            for (int position = 0; position < groupByIds.getPositionCount() && distinctCount < distinctPositions.length; position++) {
                if (groupByIds.getGroupId(position) == nextDistinctId) {
                    distinctPositions[distinctCount++] = position;
                    nextDistinctId++;
                }
            }
            verify(distinctCount == distinctPositions.length);
            remainingLimit -= distinctCount;
            result = inputPage.extractChannels(outputChannels).getPositions(distinctPositions, 0, distinctPositions.length);
        }

        groupByIds = null;
        inputPage = null;

        updateMemoryReservation();
        return result;
    }

    /**
     * Advances the pending group-id computation, unless the timeout has expired.
     *
     * @return true if the computation completed and its result is now in {@code groupByIds}
     */
    private boolean processUnfinishedWork()
    {
        verify(unfinishedWork != null);
        if (finishIfTimedOut() || !unfinishedWork.process()) {
            return false;
        }
        groupByIds = unfinishedWork.getResult();
        unfinishedWork = null;
        return true;
    }

    /**
     * Returns true while a page is buffered or its group-id computation is still pending.
     * This is a pure state check; callers that care about the timeout test it themselves first.
     */
    private boolean hasUnfinishedInput()
    {
        return inputPage != null || unfinishedWork != null;
    }

    /**
     * Update memory usage.
     *
     * @return true if the reservation is within the limit
     */
    // TODO: update in the interface after the new memory tracking framework is landed (#9049)
    // Essentially we would love to have clean interfaces to support both pushing and pulling memory usage
    // The following implementation is a hybrid model, where the push model is going to call the pull model causing reentrancy
    private boolean updateMemoryReservation()
    {
        // Operator/driver will be blocked on memory after we call localUserMemoryContext.setBytes().
        // If memory is not available, once we return, this operator will be blocked until memory is available.
        localUserMemoryContext.setBytes(groupByHash.getEstimatedSize());
        // If memory is not available, inform the caller that we cannot proceed for allocation.
        return operatorContext.isWaitingForMemory().isDone();
    }

    /**
     * Returns the current capacity reported by the underlying group-by hash.
     */
    @VisibleForTesting
    public int getCapacity()
    {
        return groupByHash.getCapacity();
    }
}