ClusterMemoryLeakDetector.java

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.memory;

import com.facebook.airlift.log.Logger;
import com.facebook.presto.server.BasicQueryInfo;
import com.facebook.presto.spi.QueryId;
import com.google.common.collect.ImmutableSet;
import io.airlift.units.DataSize;

import javax.annotation.concurrent.GuardedBy;
import javax.annotation.concurrent.ThreadSafe;

import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.Set;

import static com.facebook.presto.execution.QueryState.RUNNING;
import static com.google.common.collect.ImmutableMap.toImmutableMap;
import static java.lang.System.currentTimeMillis;

@ThreadSafe
public class ClusterMemoryLeakDetector
{
    private static final Logger log = Logger.get(ClusterMemoryLeakDetector.class);

    // It may take some time to remove a query's memory reservations from the worker nodes, that's why
    // we check to see whether some time has passed after the query finishes to claim that it is leaked.
    private static final int DEFAULT_LEAK_CLAIM_DELTA_MILLIS = 60_000;

    @GuardedBy("this")
    private Set<QueryId> leakedQueries;

    @GuardedBy("this")
    private long leakedBytes;

    /**
     * @param queryIdToInfo All queries that the coordinator knows about, along with their optional query info.
     * @param queryMemoryReservations The memory reservations of queries in the GENERAL cluster memory pool.
     */
    void checkForMemoryLeaks(Map<QueryId, Optional<BasicQueryInfo>> queryIdToInfo, Map<QueryId, Long> queryMemoryReservations)
    {
        Map<QueryId, Long> leakedQueryReservations = queryMemoryReservations.entrySet()
                .stream()
                .filter(entry -> entry.getValue() > 0)
                .filter(entry -> isLeaked(queryIdToInfo, entry.getKey()))
                .collect(toImmutableMap(Entry::getKey, Entry::getValue));

        long leakedBytesThisTime = leakedQueryReservations.values().stream().reduce(0L, Long::sum);
        if (!leakedQueryReservations.isEmpty()) {
            log.warn("Memory leak of %s detected. The following queries are already finished, " +
                            "but they have memory reservations on some worker node(s): %s",
                    DataSize.succinctBytes(leakedBytes), leakedQueryReservations);
        }

        synchronized (this) {
            leakedQueries = ImmutableSet.copyOf(leakedQueryReservations.keySet());
            leakedBytes = leakedBytesThisTime;
        }
    }

    private static boolean isLeaked(Map<QueryId, Optional<BasicQueryInfo>> queryIdToInfo, QueryId queryId)
    {
        Optional<BasicQueryInfo> queryInfo = queryIdToInfo.get(queryId);

        // if the query is not even found then it is definitely leaked
        if (queryInfo == null) {
            return true;
        }

        Optional<Long> queryEndTimeInMillis = queryInfo.flatMap(qi -> Optional.ofNullable(qi.getState() == RUNNING ? null : qi.getQueryStats().getEndTimeInMillis()));

        return queryEndTimeInMillis.map(ts -> (currentTimeMillis() - ts) >= DEFAULT_LEAK_CLAIM_DELTA_MILLIS).orElse(false);
    }

    synchronized boolean wasQueryPossiblyLeaked(QueryId queryId)
    {
        return leakedQueries.contains(queryId);
    }

    synchronized int getNumberOfLeakedQueries()
    {
        return leakedQueries.size();
    }

    synchronized long getLeakedBytes()
    {
        return leakedBytes;
    }
}